blob: 83efa8167a8d3c1595d1257367bb8a905cae0e89 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000168 if (unicode->defenc) {
169 Py_DECREF(unicode->defenc);
170 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is always terminated by a 0 character -- XXX is this
   needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below a size of 1.

*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000216 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000223 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000227 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000229 }
230 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 }
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237 }
238
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000239 if (!unicode->str) {
240 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000241 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000246 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000248
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000251 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = NULL;
263 unicode->length = 0;
264 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 if (unicode->defenc) {
266 Py_DECREF(unicode->defenc);
267 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000268 }
269 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000275 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000277 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
283{
284 PyUnicodeObject *unicode;
285
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
289
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294 return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
301{
302 PyUnicodeObject *unicode;
303
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
307 }
308
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
312
313 /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317 {
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
323 }
324#endif
325
326 return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
332{
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342 {
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
348 }
349#endif
350
351 return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000358 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
359}
360
361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362 const char *encoding,
363 const char *errors)
364{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 const char *s;
366 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000367 int owned = 0;
368 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370 if (obj == NULL) {
371 PyErr_BadInternalCall();
372 return NULL;
373 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000374
375 /* Coerce object */
376 if (PyInstance_Check(obj)) {
377 PyObject *func;
378 func = PyObject_GetAttrString(obj, "__str__");
379 if (func == NULL) {
380 PyErr_SetString(PyExc_TypeError,
381 "coercing to Unicode: instance doesn't define __str__");
382 return NULL;
383 }
384 obj = PyEval_CallObject(func, NULL);
385 Py_DECREF(func);
386 if (obj == NULL)
387 return NULL;
388 owned = 1;
389 }
390 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000392 v = obj;
393 if (encoding) {
394 PyErr_SetString(PyExc_TypeError,
395 "decoding Unicode is not supported");
396 return NULL;
397 }
398 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400 else if (PyString_Check(obj)) {
401 s = PyString_AS_STRING(obj);
402 len = PyString_GET_SIZE(obj);
403 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000404 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000408 PyErr_Format(PyExc_TypeError,
409 "coercing to Unicode: need string or buffer, "
410 "%.80s found",
411 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414
415 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 if (len == 0) {
417 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 else
421 v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return v;
427
428 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000429 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000430 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000431 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433}
434
435PyObject *PyUnicode_Decode(const char *s,
436 int size,
437 const char *encoding,
438 const char *errors)
439{
440 PyObject *buffer = NULL, *unicode;
441
Fred Drakee4315f52000-05-09 19:53:39 +0000442 if (encoding == NULL)
443 encoding = PyUnicode_GetDefaultEncoding();
444
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000448 else if (strcmp(encoding, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s, size, errors);
450 else if (strcmp(encoding, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452
453 /* Decode via the codec registry */
454 buffer = PyBuffer_FromMemory((void *)s, size);
455 if (buffer == NULL)
456 goto onError;
457 unicode = PyCodec_Decode(buffer, encoding, errors);
458 if (unicode == NULL)
459 goto onError;
460 if (!PyUnicode_Check(unicode)) {
461 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000462 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 unicode->ob_type->tp_name);
464 Py_DECREF(unicode);
465 goto onError;
466 }
467 Py_DECREF(buffer);
468 return unicode;
469
470 onError:
471 Py_XDECREF(buffer);
472 return NULL;
473}
474
475PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476 int size,
477 const char *encoding,
478 const char *errors)
479{
480 PyObject *v, *unicode;
481
482 unicode = PyUnicode_FromUnicode(s, size);
483 if (unicode == NULL)
484 return NULL;
485 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486 Py_DECREF(unicode);
487 return v;
488}
489
490PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491 const char *encoding,
492 const char *errors)
493{
494 PyObject *v;
495
496 if (!PyUnicode_Check(unicode)) {
497 PyErr_BadArgument();
498 goto onError;
499 }
Fred Drakee4315f52000-05-09 19:53:39 +0000500
501 if (encoding == NULL)
502 encoding = PyUnicode_GetDefaultEncoding();
503
504 /* Shortcuts for common default encodings */
505 if (errors == NULL) {
506 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000508 else if (strcmp(encoding, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode);
510 else if (strcmp(encoding, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode);
512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 /* Encode via the codec registry */
515 v = PyCodec_Encode(unicode, encoding, errors);
516 if (v == NULL)
517 goto onError;
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v)) {
520 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000521 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522 v->ob_type->tp_name);
523 Py_DECREF(v);
524 goto onError;
525 }
526 return v;
527
528 onError:
529 return NULL;
530}
531
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000532/* Return a Python string holding the default encoded value of the
533 Unicode object.
534
535 The resulting string is cached in the Unicode object for subsequent
536 usage by this function. The cached version is needed to implement
537 the character buffer interface and will live (at least) as long as
538 the Unicode object itself.
539
540 The refcount of the string is *not* incremented.
541
542 *** Exported for internal use by the interpreter only !!! ***
543
544*/
545
546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547 const char *errors)
548{
549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
550
551 if (v)
552 return v;
553 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554 if (v && errors == NULL)
555 ((PyUnicodeObject *)unicode)->defenc = v;
556 return v;
557}
558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
560{
561 if (!PyUnicode_Check(unicode)) {
562 PyErr_BadArgument();
563 goto onError;
564 }
565 return PyUnicode_AS_UNICODE(unicode);
566
567 onError:
568 return NULL;
569}
570
571int PyUnicode_GetSize(PyObject *unicode)
572{
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577 return PyUnicode_GET_SIZE(unicode);
578
579 onError:
580 return -1;
581}
582
Thomas Wouters78890102000-07-22 19:25:51 +0000583const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000584{
585 return unicode_default_encoding;
586}
587
588int PyUnicode_SetDefaultEncoding(const char *encoding)
589{
590 PyObject *v;
591
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v = _PyCodec_Lookup(encoding);
595 if (v == NULL)
596 goto onError;
597 Py_DECREF(v);
598 strncpy(unicode_default_encoding,
599 encoding,
600 sizeof(unicode_default_encoding));
601 return 0;
602
603 onError:
604 return -1;
605}
606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- UTF-8 Codec -------------------------------------------------------- */
608
/* Map a UTF-8 lead byte to the total length of the sequence it
   introduces.  A value of zero marks a byte that is not a legal lead
   byte.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are never legal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
630
631static
632int utf8_decoding_error(const char **source,
633 Py_UNICODE **dest,
634 const char *errors,
635 const char *details)
636{
637 if ((errors == NULL) ||
638 (strcmp(errors,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000640 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 details);
642 return -1;
643 }
644 else if (strcmp(errors,"ignore") == 0) {
645 (*source)++;
646 return 0;
647 }
648 else if (strcmp(errors,"replace") == 0) {
649 (*source)++;
650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651 (*dest)++;
652 return 0;
653 }
654 else {
655 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000656 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 errors);
658 return -1;
659 }
660}
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662PyObject *PyUnicode_DecodeUTF8(const char *s,
663 int size,
664 const char *errors)
665{
666 int n;
667 const char *e;
668 PyUnicodeObject *unicode;
669 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000670 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671
672 /* Note: size will always be longer than the resulting Unicode
673 character count */
674 unicode = _PyUnicode_New(size);
675 if (!unicode)
676 return NULL;
677 if (size == 0)
678 return (PyObject *)unicode;
679
680 /* Unpack UTF-8 encoded data */
681 p = unicode->str;
682 e = s + size;
683
684 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000685 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686
687 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000688 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 s++;
690 continue;
691 }
692
693 n = utf8_code_length[ch];
694
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 if (s + n > e) {
696 errmsg = "unexpected end of data";
697 goto utf8Error;
698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699
700 switch (n) {
701
702 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000703 errmsg = "unexpected code byte";
704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 break;
706
707 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000708 errmsg = "internal error";
709 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000710 break;
711
712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 if ((s[1] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (ch < 0x80) {
719 errmsg = "illegal encoding";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000723 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724 break;
725
726 case 3:
727 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 (s[2] & 0xc0) != 0x80) {
729 errmsg = "invalid data";
730 goto utf8Error;
731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734 errmsg = "illegal encoding";
735 goto utf8Error;
736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000738 *p++ = (Py_UNICODE)ch;
739 break;
740
741 case 4:
742 if ((s[1] & 0xc0) != 0x80 ||
743 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 (s[3] & 0xc0) != 0x80) {
745 errmsg = "invalid data";
746 goto utf8Error;
747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 if ((ch < 0x10000) || /* minimum value allowed for 4
752 byte encoding */
753 (ch > 0x10ffff)) { /* maximum value allowed for
754 UTF-16 */
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000758 /* compute and append the two surrogates: */
759
760 /* translate from 10000..10FFFF to 0..FFFF */
761 ch -= 0x10000;
762
763 /* high surrogate = top 10 bits added to D800 */
764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
765
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 break;
769
770 default:
771 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000772 errmsg = "unsupported Unicode code range";
773 goto utf8Error;
774 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 }
776 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 continue;
778
779 utf8Error:
780 if (utf8_decoding_error(&s, &p, errors, errmsg))
781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 }
783
784 /* Adjust length */
785 if (_PyUnicode_Resize(unicode, p - unicode->str))
786 goto onError;
787
788 return (PyObject *)unicode;
789
790onError:
791 Py_DECREF(unicode);
792 return NULL;
793}
794
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000795/* Not used anymore, now that the encoder supports UTF-16
796 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000797#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798static
799int utf8_encoding_error(const Py_UNICODE **source,
800 char **dest,
801 const char *errors,
802 const char *details)
803{
804 if ((errors == NULL) ||
805 (strcmp(errors,"strict") == 0)) {
806 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000807 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 details);
809 return -1;
810 }
811 else if (strcmp(errors,"ignore") == 0) {
812 return 0;
813 }
814 else if (strcmp(errors,"replace") == 0) {
815 **dest = '?';
816 (*dest)++;
817 return 0;
818 }
819 else {
820 PyErr_Format(PyExc_ValueError,
821 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000822 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 errors);
824 return -1;
825 }
826}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000827#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828
829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830 int size,
831 const char *errors)
832{
833 PyObject *v;
834 char *p;
835 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000836 Py_UCS4 ch2;
837 unsigned int cbAllocated = 3 * size;
838 unsigned int cbWritten = 0;
839 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000842 if (v == NULL)
843 return NULL;
844 if (size == 0)
845 goto done;
846
847 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 while (i < size) {
849 Py_UCS4 ch = s[i++];
850 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000852 cbWritten++;
853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 else if (ch < 0x0800) {
855 *p++ = 0xc0 | (ch >> 6);
856 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000857 cbWritten += 2;
858 }
859 else {
860 /* Check for high surrogate */
861 if (0xD800 <= ch && ch <= 0xDBFF) {
862 if (i != size) {
863 ch2 = s[i];
864 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
865
866 if (cbWritten >= (cbAllocated - 4)) {
867 /* Provide enough room for some more
868 surrogates */
869 cbAllocated += 4*10;
870 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000871 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000872 }
873
874 /* combine the two values */
875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
876
877 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 i++;
880 cbWritten += 4;
881 }
882 }
883 }
884 else {
885 *p++ = (char)(0xe0 | (ch >> 12));
886 cbWritten += 3;
887 }
888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891 }
892 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000893 if (_PyString_Resize(&v, p - q))
894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895
896 done:
897 return v;
898
899 onError:
900 Py_DECREF(v);
901 return NULL;
902}
903
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
905{
906 PyObject *str;
907
908 if (!PyUnicode_Check(unicode)) {
909 PyErr_BadArgument();
910 return NULL;
911 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000912 str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
913 PyUnicode_GET_SIZE(unicode),
914 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000915 if (str == NULL)
916 return NULL;
917 Py_INCREF(str);
918 return str;
919}
920
921/* --- UTF-16 Codec ------------------------------------------------------- */
922
923static
924int utf16_decoding_error(const Py_UNICODE **source,
925 Py_UNICODE **dest,
926 const char *errors,
927 const char *details)
928{
929 if ((errors == NULL) ||
930 (strcmp(errors,"strict") == 0)) {
931 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000932 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933 details);
934 return -1;
935 }
936 else if (strcmp(errors,"ignore") == 0) {
937 return 0;
938 }
939 else if (strcmp(errors,"replace") == 0) {
940 if (dest) {
941 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
942 (*dest)++;
943 }
944 return 0;
945 }
946 else {
947 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000948 "UTF-16 decoding error; "
949 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 errors);
951 return -1;
952 }
953}
954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955PyObject *PyUnicode_DecodeUTF16(const char *s,
956 int size,
957 const char *errors,
958 int *byteorder)
959{
960 PyUnicodeObject *unicode;
961 Py_UNICODE *p;
962 const Py_UNICODE *q, *e;
963 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000964 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965
966 /* size should be an even number */
967 if (size % sizeof(Py_UNICODE) != 0) {
968 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
969 return NULL;
970 /* The remaining input chars are ignored if we fall through
971 here... */
972 }
973
974 /* Note: size will always be longer than the resulting Unicode
975 character count */
976 unicode = _PyUnicode_New(size);
977 if (!unicode)
978 return NULL;
979 if (size == 0)
980 return (PyObject *)unicode;
981
982 /* Unpack UTF-16 encoded data */
983 p = unicode->str;
984 q = (Py_UNICODE *)s;
985 e = q + (size / sizeof(Py_UNICODE));
986
987 if (byteorder)
988 bo = *byteorder;
989
990 while (q < e) {
991 register Py_UNICODE ch = *q++;
992
993 /* Check for BOM marks (U+FEFF) in the input and adjust
994 current byte order setting accordingly. Swap input
995 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
996 !) */
997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
998 if (ch == 0xFEFF) {
999 bo = -1;
1000 continue;
1001 } else if (ch == 0xFFFE) {
1002 bo = 1;
1003 continue;
1004 }
1005 if (bo == 1)
1006 ch = (ch >> 8) | (ch << 8);
1007#else
1008 if (ch == 0xFEFF) {
1009 bo = 1;
1010 continue;
1011 } else if (ch == 0xFFFE) {
1012 bo = -1;
1013 continue;
1014 }
1015 if (bo == -1)
1016 ch = (ch >> 8) | (ch << 8);
1017#endif
1018 if (ch < 0xD800 || ch > 0xDFFF) {
1019 *p++ = ch;
1020 continue;
1021 }
1022
1023 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001024 if (q >= e) {
1025 errmsg = "unexpected end of data";
1026 goto utf16Error;
1027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 if (0xDC00 <= *q && *q <= 0xDFFF) {
1029 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001030 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 /* This is valid data (a UTF-16 surrogate pair), but
1032 we are not able to store this information since our
1033 Py_UNICODE type only has 16 bits... this might
1034 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001035 errmsg = "code pairs are not supported";
1036 goto utf16Error;
1037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 else
1039 continue;
1040 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001041 errmsg = "illegal encoding";
1042 /* Fall through to report the error */
1043
1044 utf16Error:
1045 if (utf16_decoding_error(&q, &p, errors, errmsg))
1046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 }
1048
1049 if (byteorder)
1050 *byteorder = bo;
1051
1052 /* Adjust length */
1053 if (_PyUnicode_Resize(unicode, p - unicode->str))
1054 goto onError;
1055
1056 return (PyObject *)unicode;
1057
1058onError:
1059 Py_DECREF(unicode);
1060 return NULL;
1061}
1062
1063#undef UTF16_ERROR
1064
1065PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1066 int size,
1067 const char *errors,
1068 int byteorder)
1069{
1070 PyObject *v;
1071 Py_UNICODE *p;
1072 char *q;
1073
1074 /* We don't create UTF-16 pairs... */
1075 v = PyString_FromStringAndSize(NULL,
1076 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1077 if (v == NULL)
1078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079
1080 q = PyString_AS_STRING(v);
1081 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082 if (byteorder == 0)
1083 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001084 if (size == 0)
1085 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 if (byteorder == 0 ||
1087#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1088 byteorder == -1
1089#else
1090 byteorder == 1
1091#endif
1092 )
1093 memcpy(p, s, size * sizeof(Py_UNICODE));
1094 else
1095 while (size-- > 0) {
1096 Py_UNICODE ch = *s++;
1097 *p++ = (ch >> 8) | (ch << 8);
1098 }
1099 done:
1100 return v;
1101}
1102
1103PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1104{
1105 if (!PyUnicode_Check(unicode)) {
1106 PyErr_BadArgument();
1107 return NULL;
1108 }
1109 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1110 PyUnicode_GET_SIZE(unicode),
1111 NULL,
1112 0);
1113}
1114
1115/* --- Unicode Escape Codec ----------------------------------------------- */
1116
1117static
1118int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001119 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 const char *errors,
1121 const char *details)
1122{
1123 if ((errors == NULL) ||
1124 (strcmp(errors,"strict") == 0)) {
1125 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001126 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 details);
1128 return -1;
1129 }
1130 else if (strcmp(errors,"ignore") == 0) {
1131 return 0;
1132 }
1133 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001134 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 return 0;
1136 }
1137 else {
1138 PyErr_Format(PyExc_ValueError,
1139 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001140 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141 errors);
1142 return -1;
1143 }
1144}
1145
/* Function table exported by the "ucnhash" module as "ucnhashAPI".
   It stays NULL until PyUnicode_DecodeUnicodeEscape() meets its first
   \N escape, imports the module and stores the pointer here. */
static _Py_UCNHashAPI *pucnHash = NULL;
1147
/* Case-insensitive comparison of at most `count` characters of `s1`
   and `s2`.

   Returns 0 when the compared ranges are equal (or `count` is 0),
   otherwise the difference between the first pair of lower-cased
   characters that differ.  Comparison stops at a NUL terminator
   shared by both strings, so neither string is read past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        /* <ctype.h> functions require an unsigned char value (or EOF);
           a plain, possibly negative, char is not a valid argument. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }
    return c1 - c2;
}
1167
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1169 int size,
1170 const char *errors)
1171{
1172 PyUnicodeObject *v;
1173 Py_UNICODE *p = NULL, *buf = NULL;
1174 const char *end;
1175
1176 /* Escaped strings will always be longer than the resulting
1177 Unicode string, so we start with size here and then reduce the
1178 length after conversion to the true value. */
1179 v = _PyUnicode_New(size);
1180 if (v == NULL)
1181 goto onError;
1182 if (size == 0)
1183 return (PyObject *)v;
1184 p = buf = PyUnicode_AS_UNICODE(v);
1185 end = s + size;
1186 while (s < end) {
1187 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001188 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 int i;
1190
1191 /* Non-escape characters are interpreted as Unicode ordinals */
1192 if (*s != '\\') {
1193 *p++ = (unsigned char)*s++;
1194 continue;
1195 }
1196
1197 /* \ - Escapes */
1198 s++;
1199 switch (*s++) {
1200
1201 /* \x escapes */
1202 case '\n': break;
1203 case '\\': *p++ = '\\'; break;
1204 case '\'': *p++ = '\''; break;
1205 case '\"': *p++ = '\"'; break;
1206 case 'b': *p++ = '\b'; break;
1207 case 'f': *p++ = '\014'; break; /* FF */
1208 case 't': *p++ = '\t'; break;
1209 case 'n': *p++ = '\n'; break;
1210 case 'r': *p++ = '\r'; break;
1211 case 'v': *p++ = '\013'; break; /* VT */
1212 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1213
1214 /* \OOO (octal) escapes */
1215 case '0': case '1': case '2': case '3':
1216 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001217 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001219 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001221 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001223 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 break;
1225
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001226 /* \xXXXX escape with 1-n hex digits. for compatibility
1227 with 8-bit strings, this code ignores all but the last
1228 two digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 case 'x':
1230 x = 0;
1231 c = (unsigned char)*s;
1232 if (isxdigit(c)) {
1233 do {
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001234 x = (x<<4) & 0xF0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 if ('0' <= c && c <= '9')
1236 x += c - '0';
1237 else if ('a' <= c && c <= 'f')
1238 x += 10 + c - 'a';
1239 else
1240 x += 10 + c - 'A';
1241 c = (unsigned char)*++s;
1242 } while (isxdigit(c));
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001243 *p++ = (unsigned char) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 } else {
1245 *p++ = '\\';
1246 *p++ = (unsigned char)s[-1];
1247 }
1248 break;
1249
1250 /* \uXXXX with 4 hex digits */
1251 case 'u':
1252 for (x = 0, i = 0; i < 4; i++) {
1253 c = (unsigned char)s[i];
1254 if (!isxdigit(c)) {
1255 if (unicodeescape_decoding_error(&s, &x, errors,
1256 "truncated \\uXXXX"))
1257 goto onError;
1258 i++;
1259 break;
1260 }
1261 x = (x<<4) & ~0xF;
1262 if (c >= '0' && c <= '9')
1263 x += c - '0';
1264 else if (c >= 'a' && c <= 'f')
1265 x += 10 + c - 'a';
1266 else
1267 x += 10 + c - 'A';
1268 }
1269 s += i;
1270 *p++ = x;
1271 break;
1272
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001273 case 'N':
1274 /* Ok, we need to deal with Unicode Character Names now,
1275 * make sure we've imported the hash table data...
1276 */
1277 if (pucnHash == NULL)
1278 {
1279 PyObject *mod = 0, *v = 0;
1280
1281 mod = PyImport_ImportModule("ucnhash");
1282 if (mod == NULL)
1283 goto onError;
1284 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1285 Py_DECREF(mod);
1286 if (v == NULL)
1287 {
1288 goto onError;
1289 }
1290 pucnHash = PyCObject_AsVoidPtr(v);
1291 Py_DECREF(v);
1292 if (pucnHash == NULL)
1293 {
1294 goto onError;
1295 }
1296 }
1297
1298 if (*s == '{')
1299 {
1300 const char *start = s + 1;
1301 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001303 unsigned long j;
1304
1305 /* look for either the closing brace, or we
1306 * exceed the maximum length of the unicode character names
1307 */
1308 while (*endBrace != '}' &&
1309 (unsigned int)(endBrace - start) <=
1310 pucnHash->cchMax &&
1311 endBrace < end)
1312 {
1313 endBrace++;
1314 }
1315 if (endBrace != end && *endBrace == '}')
1316 {
1317 j = pucnHash->hash(start, endBrace - start);
1318 if (j > pucnHash->cKeys ||
1319 mystrnicmp(
1320 start,
1321 ((_Py_UnicodeCharacterName *)
1322 (pucnHash->getValue(j)))->pszUCN,
1323 (int)(endBrace - start)) != 0)
1324 {
1325 if (unicodeescape_decoding_error(
1326 &s, &x, errors,
1327 "Invalid Unicode Character Name"))
1328 {
1329 goto onError;
1330 }
1331 goto ucnFallthrough;
1332 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 value = ((_Py_UnicodeCharacterName *)
1334 (pucnHash->getValue(j)))->value;
1335 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001336 {
1337 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001338 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001339 }
1340 else
1341 {
1342 /* Oops, its in UCS-4 space, */
1343 /* compute and append the two surrogates: */
1344 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001345 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001346
1347 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001349
1350 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001352 }
1353 s = endBrace + 1;
1354 }
1355 else
1356 {
1357 if (unicodeescape_decoding_error(
1358 &s, &x, errors,
1359 "Unicode name missing closing brace"))
1360 goto onError;
1361 goto ucnFallthrough;
1362 }
1363 break;
1364 }
1365 if (unicodeescape_decoding_error(
1366 &s, &x, errors,
1367 "Missing opening brace for Unicode Character Name escape"))
1368 goto onError;
1369ucnFallthrough:
1370 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001371 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 *p++ = '\\';
1373 *p++ = (unsigned char)s[-1];
1374 break;
1375 }
1376 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001377 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 return (PyObject *)v;
1380
1381 onError:
1382 Py_XDECREF(v);
1383 return NULL;
1384}
1385
1386/* Return a Unicode-Escape string version of the Unicode object.
1387
1388 If quotes is true, the string is enclosed in u"" or u'' quotes as
1389 appropriate.
1390
1391*/
1392
Barry Warsaw51ac5802000-03-20 16:36:48 +00001393static const Py_UNICODE *findchar(const Py_UNICODE *s,
1394 int size,
1395 Py_UNICODE ch);
1396
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397static
1398PyObject *unicodeescape_string(const Py_UNICODE *s,
1399 int size,
1400 int quotes)
1401{
1402 PyObject *repr;
1403 char *p;
1404 char *q;
1405
1406 static const char *hexdigit = "0123456789ABCDEF";
1407
1408 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1409 if (repr == NULL)
1410 return NULL;
1411
1412 p = q = PyString_AS_STRING(repr);
1413
1414 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 *p++ = 'u';
1416 *p++ = (findchar(s, size, '\'') &&
1417 !findchar(s, size, '"')) ? '"' : '\'';
1418 }
1419 while (size-- > 0) {
1420 Py_UNICODE ch = *s++;
1421 /* Escape quotes */
1422 if (quotes && (ch == q[1] || ch == '\\')) {
1423 *p++ = '\\';
1424 *p++ = (char) ch;
1425 }
1426 /* Map 16-bit characters to '\uxxxx' */
1427 else if (ch >= 256) {
1428 *p++ = '\\';
1429 *p++ = 'u';
1430 *p++ = hexdigit[(ch >> 12) & 0xf];
1431 *p++ = hexdigit[(ch >> 8) & 0xf];
1432 *p++ = hexdigit[(ch >> 4) & 0xf];
1433 *p++ = hexdigit[ch & 15];
1434 }
1435 /* Map non-printable US ASCII to '\ooo' */
1436 else if (ch < ' ' || ch >= 128) {
1437 *p++ = '\\';
1438 *p++ = hexdigit[(ch >> 6) & 7];
1439 *p++ = hexdigit[(ch >> 3) & 7];
1440 *p++ = hexdigit[ch & 7];
1441 }
1442 /* Copy everything else as-is */
1443 else
1444 *p++ = (char) ch;
1445 }
1446 if (quotes)
1447 *p++ = q[1];
1448
1449 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001450 if (_PyString_Resize(&repr, p - q))
1451 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452
1453 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001454
1455 onError:
1456 Py_DECREF(repr);
1457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458}
1459
1460PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1461 int size)
1462{
1463 return unicodeescape_string(s, size, 0);
1464}
1465
1466PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1467{
1468 if (!PyUnicode_Check(unicode)) {
1469 PyErr_BadArgument();
1470 return NULL;
1471 }
1472 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1473 PyUnicode_GET_SIZE(unicode));
1474}
1475
1476/* --- Raw Unicode Escape Codec ------------------------------------------- */
1477
1478PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1479 int size,
1480 const char *errors)
1481{
1482 PyUnicodeObject *v;
1483 Py_UNICODE *p, *buf;
1484 const char *end;
1485 const char *bs;
1486
1487 /* Escaped strings will always be longer than the resulting
1488 Unicode string, so we start with size here and then reduce the
1489 length after conversion to the true value. */
1490 v = _PyUnicode_New(size);
1491 if (v == NULL)
1492 goto onError;
1493 if (size == 0)
1494 return (PyObject *)v;
1495 p = buf = PyUnicode_AS_UNICODE(v);
1496 end = s + size;
1497 while (s < end) {
1498 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001499 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 int i;
1501
1502 /* Non-escape characters are interpreted as Unicode ordinals */
1503 if (*s != '\\') {
1504 *p++ = (unsigned char)*s++;
1505 continue;
1506 }
1507
1508 /* \u-escapes are only interpreted iff the number of leading
1509 backslashes if odd */
1510 bs = s;
1511 for (;s < end;) {
1512 if (*s != '\\')
1513 break;
1514 *p++ = (unsigned char)*s++;
1515 }
1516 if (((s - bs) & 1) == 0 ||
1517 s >= end ||
1518 *s != 'u') {
1519 continue;
1520 }
1521 p--;
1522 s++;
1523
1524 /* \uXXXX with 4 hex digits */
1525 for (x = 0, i = 0; i < 4; i++) {
1526 c = (unsigned char)s[i];
1527 if (!isxdigit(c)) {
1528 if (unicodeescape_decoding_error(&s, &x, errors,
1529 "truncated \\uXXXX"))
1530 goto onError;
1531 i++;
1532 break;
1533 }
1534 x = (x<<4) & ~0xF;
1535 if (c >= '0' && c <= '9')
1536 x += c - '0';
1537 else if (c >= 'a' && c <= 'f')
1538 x += 10 + c - 'a';
1539 else
1540 x += 10 + c - 'A';
1541 }
1542 s += i;
1543 *p++ = x;
1544 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001545 if (_PyUnicode_Resize(v, (int)(p - buf)))
1546 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547 return (PyObject *)v;
1548
1549 onError:
1550 Py_XDECREF(v);
1551 return NULL;
1552}
1553
1554PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1555 int size)
1556{
1557 PyObject *repr;
1558 char *p;
1559 char *q;
1560
1561 static const char *hexdigit = "0123456789ABCDEF";
1562
1563 repr = PyString_FromStringAndSize(NULL, 6 * size);
1564 if (repr == NULL)
1565 return NULL;
1566
1567 p = q = PyString_AS_STRING(repr);
1568 while (size-- > 0) {
1569 Py_UNICODE ch = *s++;
1570 /* Map 16-bit characters to '\uxxxx' */
1571 if (ch >= 256) {
1572 *p++ = '\\';
1573 *p++ = 'u';
1574 *p++ = hexdigit[(ch >> 12) & 0xf];
1575 *p++ = hexdigit[(ch >> 8) & 0xf];
1576 *p++ = hexdigit[(ch >> 4) & 0xf];
1577 *p++ = hexdigit[ch & 15];
1578 }
1579 /* Copy everything else as-is */
1580 else
1581 *p++ = (char) ch;
1582 }
1583 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001584 if (_PyString_Resize(&repr, p - q))
1585 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586
1587 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001588
1589 onError:
1590 Py_DECREF(repr);
1591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
1594PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1595{
1596 if (!PyUnicode_Check(unicode)) {
1597 PyErr_BadArgument();
1598 return NULL;
1599 }
1600 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1601 PyUnicode_GET_SIZE(unicode));
1602}
1603
1604/* --- Latin-1 Codec ------------------------------------------------------ */
1605
1606PyObject *PyUnicode_DecodeLatin1(const char *s,
1607 int size,
1608 const char *errors)
1609{
1610 PyUnicodeObject *v;
1611 Py_UNICODE *p;
1612
1613 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1614 v = _PyUnicode_New(size);
1615 if (v == NULL)
1616 goto onError;
1617 if (size == 0)
1618 return (PyObject *)v;
1619 p = PyUnicode_AS_UNICODE(v);
1620 while (size-- > 0)
1621 *p++ = (unsigned char)*s++;
1622 return (PyObject *)v;
1623
1624 onError:
1625 Py_XDECREF(v);
1626 return NULL;
1627}
1628
1629static
1630int latin1_encoding_error(const Py_UNICODE **source,
1631 char **dest,
1632 const char *errors,
1633 const char *details)
1634{
1635 if ((errors == NULL) ||
1636 (strcmp(errors,"strict") == 0)) {
1637 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001638 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 details);
1640 return -1;
1641 }
1642 else if (strcmp(errors,"ignore") == 0) {
1643 return 0;
1644 }
1645 else if (strcmp(errors,"replace") == 0) {
1646 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001647 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648 return 0;
1649 }
1650 else {
1651 PyErr_Format(PyExc_ValueError,
1652 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001653 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 errors);
1655 return -1;
1656 }
1657}
1658
1659PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1660 int size,
1661 const char *errors)
1662{
1663 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001664 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 repr = PyString_FromStringAndSize(NULL, size);
1666 if (repr == NULL)
1667 return NULL;
1668
1669 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001670 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 while (size-- > 0) {
1672 Py_UNICODE ch = *p++;
1673 if (ch >= 256) {
1674 if (latin1_encoding_error(&p, &s, errors,
1675 "ordinal not in range(256)"))
1676 goto onError;
1677 }
1678 else
1679 *s++ = (char)ch;
1680 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001681 /* Resize if error handling skipped some characters */
1682 if (s - start < PyString_GET_SIZE(repr))
1683 if (_PyString_Resize(&repr, s - start))
1684 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 return repr;
1686
1687 onError:
1688 Py_DECREF(repr);
1689 return NULL;
1690}
1691
1692PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1693{
1694 if (!PyUnicode_Check(unicode)) {
1695 PyErr_BadArgument();
1696 return NULL;
1697 }
1698 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1699 PyUnicode_GET_SIZE(unicode),
1700 NULL);
1701}
1702
1703/* --- 7-bit ASCII Codec -------------------------------------------------- */
1704
1705static
1706int ascii_decoding_error(const char **source,
1707 Py_UNICODE **dest,
1708 const char *errors,
1709 const char *details)
1710{
1711 if ((errors == NULL) ||
1712 (strcmp(errors,"strict") == 0)) {
1713 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001714 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 details);
1716 return -1;
1717 }
1718 else if (strcmp(errors,"ignore") == 0) {
1719 return 0;
1720 }
1721 else if (strcmp(errors,"replace") == 0) {
1722 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1723 (*dest)++;
1724 return 0;
1725 }
1726 else {
1727 PyErr_Format(PyExc_ValueError,
1728 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001729 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 errors);
1731 return -1;
1732 }
1733}
1734
1735PyObject *PyUnicode_DecodeASCII(const char *s,
1736 int size,
1737 const char *errors)
1738{
1739 PyUnicodeObject *v;
1740 Py_UNICODE *p;
1741
1742 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1743 v = _PyUnicode_New(size);
1744 if (v == NULL)
1745 goto onError;
1746 if (size == 0)
1747 return (PyObject *)v;
1748 p = PyUnicode_AS_UNICODE(v);
1749 while (size-- > 0) {
1750 register unsigned char c;
1751
1752 c = (unsigned char)*s++;
1753 if (c < 128)
1754 *p++ = c;
1755 else if (ascii_decoding_error(&s, &p, errors,
1756 "ordinal not in range(128)"))
1757 goto onError;
1758 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001759 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1760 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1761 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 return (PyObject *)v;
1763
1764 onError:
1765 Py_XDECREF(v);
1766 return NULL;
1767}
1768
1769static
1770int ascii_encoding_error(const Py_UNICODE **source,
1771 char **dest,
1772 const char *errors,
1773 const char *details)
1774{
1775 if ((errors == NULL) ||
1776 (strcmp(errors,"strict") == 0)) {
1777 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001778 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 details);
1780 return -1;
1781 }
1782 else if (strcmp(errors,"ignore") == 0) {
1783 return 0;
1784 }
1785 else if (strcmp(errors,"replace") == 0) {
1786 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001787 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 return 0;
1789 }
1790 else {
1791 PyErr_Format(PyExc_ValueError,
1792 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001793 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 errors);
1795 return -1;
1796 }
1797}
1798
1799PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1800 int size,
1801 const char *errors)
1802{
1803 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001804 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 repr = PyString_FromStringAndSize(NULL, size);
1806 if (repr == NULL)
1807 return NULL;
1808
1809 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001810 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 while (size-- > 0) {
1812 Py_UNICODE ch = *p++;
1813 if (ch >= 128) {
1814 if (ascii_encoding_error(&p, &s, errors,
1815 "ordinal not in range(128)"))
1816 goto onError;
1817 }
1818 else
1819 *s++ = (char)ch;
1820 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001821 /* Resize if error handling skipped some characters */
1822 if (s - start < PyString_GET_SIZE(repr))
1823 if (_PyString_Resize(&repr, s - start))
1824 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 return repr;
1826
1827 onError:
1828 Py_DECREF(repr);
1829 return NULL;
1830}
1831
1832PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1833{
1834 if (!PyUnicode_Check(unicode)) {
1835 PyErr_BadArgument();
1836 return NULL;
1837 }
1838 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1839 PyUnicode_GET_SIZE(unicode),
1840 NULL);
1841}
1842
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001843#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001844
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001845/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001847PyObject *PyUnicode_DecodeMBCS(const char *s,
1848 int size,
1849 const char *errors)
1850{
1851 PyUnicodeObject *v;
1852 Py_UNICODE *p;
1853
1854 /* First get the size of the result */
1855 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001856 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001857 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1858
1859 v = _PyUnicode_New(usize);
1860 if (v == NULL)
1861 return NULL;
1862 if (usize == 0)
1863 return (PyObject *)v;
1864 p = PyUnicode_AS_UNICODE(v);
1865 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1866 Py_DECREF(v);
1867 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1868 }
1869
1870 return (PyObject *)v;
1871}
1872
1873PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1874 int size,
1875 const char *errors)
1876{
1877 PyObject *repr;
1878 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001879 DWORD mbcssize;
1880
1881 /* If there are no characters, bail now! */
1882 if (size==0)
1883 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001884
1885 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001886 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001887 if (mbcssize==0)
1888 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1889
1890 repr = PyString_FromStringAndSize(NULL, mbcssize);
1891 if (repr == NULL)
1892 return NULL;
1893 if (mbcssize==0)
1894 return repr;
1895
1896 /* Do the conversion */
1897 s = PyString_AS_STRING(repr);
1898 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1899 Py_DECREF(repr);
1900 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1901 }
1902 return repr;
1903}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001904
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001905#endif /* MS_WIN32 */
1906
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907/* --- Character Mapping Codec -------------------------------------------- */
1908
1909static
1910int charmap_decoding_error(const char **source,
1911 Py_UNICODE **dest,
1912 const char *errors,
1913 const char *details)
1914{
1915 if ((errors == NULL) ||
1916 (strcmp(errors,"strict") == 0)) {
1917 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001918 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 details);
1920 return -1;
1921 }
1922 else if (strcmp(errors,"ignore") == 0) {
1923 return 0;
1924 }
1925 else if (strcmp(errors,"replace") == 0) {
1926 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1927 (*dest)++;
1928 return 0;
1929 }
1930 else {
1931 PyErr_Format(PyExc_ValueError,
1932 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001933 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 errors);
1935 return -1;
1936 }
1937}
1938
1939PyObject *PyUnicode_DecodeCharmap(const char *s,
1940 int size,
1941 PyObject *mapping,
1942 const char *errors)
1943{
1944 PyUnicodeObject *v;
1945 Py_UNICODE *p;
1946
1947 /* Default to Latin-1 */
1948 if (mapping == NULL)
1949 return PyUnicode_DecodeLatin1(s, size, errors);
1950
1951 v = _PyUnicode_New(size);
1952 if (v == NULL)
1953 goto onError;
1954 if (size == 0)
1955 return (PyObject *)v;
1956 p = PyUnicode_AS_UNICODE(v);
1957 while (size-- > 0) {
1958 unsigned char ch = *s++;
1959 PyObject *w, *x;
1960
1961 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1962 w = PyInt_FromLong((long)ch);
1963 if (w == NULL)
1964 goto onError;
1965 x = PyObject_GetItem(mapping, w);
1966 Py_DECREF(w);
1967 if (x == NULL) {
1968 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1969 /* No mapping found: default to Latin-1 mapping */
1970 PyErr_Clear();
1971 *p++ = (Py_UNICODE)ch;
1972 continue;
1973 }
1974 goto onError;
1975 }
1976
1977 /* Apply mapping */
1978 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001979 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 if (value < 0 || value > 65535) {
1981 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001982 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 Py_DECREF(x);
1984 goto onError;
1985 }
1986 *p++ = (Py_UNICODE)value;
1987 }
1988 else if (x == Py_None) {
1989 /* undefined mapping */
1990 if (charmap_decoding_error(&s, &p, errors,
1991 "character maps to <undefined>")) {
1992 Py_DECREF(x);
1993 goto onError;
1994 }
1995 }
1996 else if (PyUnicode_Check(x)) {
1997 if (PyUnicode_GET_SIZE(x) != 1) {
1998 /* 1-n mapping */
1999 PyErr_SetString(PyExc_NotImplementedError,
2000 "1-n mappings are currently not implemented");
2001 Py_DECREF(x);
2002 goto onError;
2003 }
2004 *p++ = *PyUnicode_AS_UNICODE(x);
2005 }
2006 else {
2007 /* wrong return value */
2008 PyErr_SetString(PyExc_TypeError,
2009 "character mapping must return integer, None or unicode");
2010 Py_DECREF(x);
2011 goto onError;
2012 }
2013 Py_DECREF(x);
2014 }
2015 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2016 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2017 goto onError;
2018 return (PyObject *)v;
2019
2020 onError:
2021 Py_XDECREF(v);
2022 return NULL;
2023}
2024
2025static
2026int charmap_encoding_error(const Py_UNICODE **source,
2027 char **dest,
2028 const char *errors,
2029 const char *details)
2030{
2031 if ((errors == NULL) ||
2032 (strcmp(errors,"strict") == 0)) {
2033 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002034 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 details);
2036 return -1;
2037 }
2038 else if (strcmp(errors,"ignore") == 0) {
2039 return 0;
2040 }
2041 else if (strcmp(errors,"replace") == 0) {
2042 **dest = '?';
2043 (*dest)++;
2044 return 0;
2045 }
2046 else {
2047 PyErr_Format(PyExc_ValueError,
2048 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002049 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 errors);
2051 return -1;
2052 }
2053}
2054
2055PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2056 int size,
2057 PyObject *mapping,
2058 const char *errors)
2059{
2060 PyObject *v;
2061 char *s;
2062
2063 /* Default to Latin-1 */
2064 if (mapping == NULL)
2065 return PyUnicode_EncodeLatin1(p, size, errors);
2066
2067 v = PyString_FromStringAndSize(NULL, size);
2068 if (v == NULL)
2069 return NULL;
2070 s = PyString_AS_STRING(v);
2071 while (size-- > 0) {
2072 Py_UNICODE ch = *p++;
2073 PyObject *w, *x;
2074
2075 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2076 w = PyInt_FromLong((long)ch);
2077 if (w == NULL)
2078 goto onError;
2079 x = PyObject_GetItem(mapping, w);
2080 Py_DECREF(w);
2081 if (x == NULL) {
2082 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2083 /* No mapping found: default to Latin-1 mapping if possible */
2084 PyErr_Clear();
2085 if (ch < 256) {
2086 *s++ = (char)ch;
2087 continue;
2088 }
2089 else if (!charmap_encoding_error(&p, &s, errors,
2090 "missing character mapping"))
2091 continue;
2092 }
2093 goto onError;
2094 }
2095
2096 /* Apply mapping */
2097 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002098 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 if (value < 0 || value > 255) {
2100 PyErr_SetString(PyExc_TypeError,
2101 "character mapping must be in range(256)");
2102 Py_DECREF(x);
2103 goto onError;
2104 }
2105 *s++ = (char)value;
2106 }
2107 else if (x == Py_None) {
2108 /* undefined mapping */
2109 if (charmap_encoding_error(&p, &s, errors,
2110 "character maps to <undefined>")) {
2111 Py_DECREF(x);
2112 goto onError;
2113 }
2114 }
2115 else if (PyString_Check(x)) {
2116 if (PyString_GET_SIZE(x) != 1) {
2117 /* 1-n mapping */
2118 PyErr_SetString(PyExc_NotImplementedError,
2119 "1-n mappings are currently not implemented");
2120 Py_DECREF(x);
2121 goto onError;
2122 }
2123 *s++ = *PyString_AS_STRING(x);
2124 }
2125 else {
2126 /* wrong return value */
2127 PyErr_SetString(PyExc_TypeError,
2128 "character mapping must return integer, None or unicode");
2129 Py_DECREF(x);
2130 goto onError;
2131 }
2132 Py_DECREF(x);
2133 }
2134 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2135 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2136 goto onError;
2137 return v;
2138
2139 onError:
2140 Py_DECREF(v);
2141 return NULL;
2142}
2143
2144PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2145 PyObject *mapping)
2146{
2147 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2148 PyErr_BadArgument();
2149 return NULL;
2150 }
2151 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2152 PyUnicode_GET_SIZE(unicode),
2153 mapping,
2154 NULL);
2155}
2156
2157static
2158int translate_error(const Py_UNICODE **source,
2159 Py_UNICODE **dest,
2160 const char *errors,
2161 const char *details)
2162{
2163 if ((errors == NULL) ||
2164 (strcmp(errors,"strict") == 0)) {
2165 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002166 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 details);
2168 return -1;
2169 }
2170 else if (strcmp(errors,"ignore") == 0) {
2171 return 0;
2172 }
2173 else if (strcmp(errors,"replace") == 0) {
2174 **dest = '?';
2175 (*dest)++;
2176 return 0;
2177 }
2178 else {
2179 PyErr_Format(PyExc_ValueError,
2180 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002181 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 errors);
2183 return -1;
2184 }
2185}
2186
2187PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2188 int size,
2189 PyObject *mapping,
2190 const char *errors)
2191{
2192 PyUnicodeObject *v;
2193 Py_UNICODE *p;
2194
2195 if (mapping == NULL) {
2196 PyErr_BadArgument();
2197 return NULL;
2198 }
2199
2200 /* Output will never be longer than input */
2201 v = _PyUnicode_New(size);
2202 if (v == NULL)
2203 goto onError;
2204 if (size == 0)
2205 goto done;
2206 p = PyUnicode_AS_UNICODE(v);
2207 while (size-- > 0) {
2208 Py_UNICODE ch = *s++;
2209 PyObject *w, *x;
2210
2211 /* Get mapping */
2212 w = PyInt_FromLong(ch);
2213 if (w == NULL)
2214 goto onError;
2215 x = PyObject_GetItem(mapping, w);
2216 Py_DECREF(w);
2217 if (x == NULL) {
2218 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2219 /* No mapping found: default to 1-1 mapping */
2220 PyErr_Clear();
2221 *p++ = ch;
2222 continue;
2223 }
2224 goto onError;
2225 }
2226
2227 /* Apply mapping */
2228 if (PyInt_Check(x))
2229 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2230 else if (x == Py_None) {
2231 /* undefined mapping */
2232 if (translate_error(&s, &p, errors,
2233 "character maps to <undefined>")) {
2234 Py_DECREF(x);
2235 goto onError;
2236 }
2237 }
2238 else if (PyUnicode_Check(x)) {
2239 if (PyUnicode_GET_SIZE(x) != 1) {
2240 /* 1-n mapping */
2241 PyErr_SetString(PyExc_NotImplementedError,
2242 "1-n mappings are currently not implemented");
2243 Py_DECREF(x);
2244 goto onError;
2245 }
2246 *p++ = *PyUnicode_AS_UNICODE(x);
2247 }
2248 else {
2249 /* wrong return value */
2250 PyErr_SetString(PyExc_TypeError,
2251 "translate mapping must return integer, None or unicode");
2252 Py_DECREF(x);
2253 goto onError;
2254 }
2255 Py_DECREF(x);
2256 }
2257 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002258 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260
2261 done:
2262 return (PyObject *)v;
2263
2264 onError:
2265 Py_XDECREF(v);
2266 return NULL;
2267}
2268
2269PyObject *PyUnicode_Translate(PyObject *str,
2270 PyObject *mapping,
2271 const char *errors)
2272{
2273 PyObject *result;
2274
2275 str = PyUnicode_FromObject(str);
2276 if (str == NULL)
2277 goto onError;
2278 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2279 PyUnicode_GET_SIZE(str),
2280 mapping,
2281 errors);
2282 Py_DECREF(str);
2283 return result;
2284
2285 onError:
2286 Py_XDECREF(str);
2287 return NULL;
2288}
2289
Guido van Rossum9e896b32000-04-05 20:11:21 +00002290/* --- Decimal Encoder ---------------------------------------------------- */
2291
2292int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2293 int length,
2294 char *output,
2295 const char *errors)
2296{
2297 Py_UNICODE *p, *end;
2298
2299 if (output == NULL) {
2300 PyErr_BadArgument();
2301 return -1;
2302 }
2303
2304 p = s;
2305 end = s + length;
2306 while (p < end) {
2307 register Py_UNICODE ch = *p++;
2308 int decimal;
2309
2310 if (Py_UNICODE_ISSPACE(ch)) {
2311 *output++ = ' ';
2312 continue;
2313 }
2314 decimal = Py_UNICODE_TODECIMAL(ch);
2315 if (decimal >= 0) {
2316 *output++ = '0' + decimal;
2317 continue;
2318 }
Guido van Rossumba477042000-04-06 18:18:10 +00002319 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002320 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002321 continue;
2322 }
2323 /* All other characters are considered invalid */
2324 if (errors == NULL || strcmp(errors, "strict") == 0) {
2325 PyErr_SetString(PyExc_ValueError,
2326 "invalid decimal Unicode string");
2327 goto onError;
2328 }
2329 else if (strcmp(errors, "ignore") == 0)
2330 continue;
2331 else if (strcmp(errors, "replace") == 0) {
2332 *output++ = '?';
2333 continue;
2334 }
2335 }
2336 /* 0-terminate the output string */
2337 *output++ = '\0';
2338 return 0;
2339
2340 onError:
2341 return -1;
2342}
2343
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344/* --- Helpers ------------------------------------------------------------ */
2345
2346static
2347int count(PyUnicodeObject *self,
2348 int start,
2349 int end,
2350 PyUnicodeObject *substring)
2351{
2352 int count = 0;
2353
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002354 if (substring->length == 0)
2355 return (end - start + 1);
2356
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 end -= substring->length;
2358
2359 while (start <= end)
2360 if (Py_UNICODE_MATCH(self, start, substring)) {
2361 count++;
2362 start += substring->length;
2363 } else
2364 start++;
2365
2366 return count;
2367}
2368
2369int PyUnicode_Count(PyObject *str,
2370 PyObject *substr,
2371 int start,
2372 int end)
2373{
2374 int result;
2375
2376 str = PyUnicode_FromObject(str);
2377 if (str == NULL)
2378 return -1;
2379 substr = PyUnicode_FromObject(substr);
2380 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002381 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382 return -1;
2383 }
2384
2385 result = count((PyUnicodeObject *)str,
2386 start, end,
2387 (PyUnicodeObject *)substr);
2388
2389 Py_DECREF(str);
2390 Py_DECREF(substr);
2391 return result;
2392}
2393
2394static
2395int findstring(PyUnicodeObject *self,
2396 PyUnicodeObject *substring,
2397 int start,
2398 int end,
2399 int direction)
2400{
2401 if (start < 0)
2402 start += self->length;
2403 if (start < 0)
2404 start = 0;
2405
2406 if (substring->length == 0)
2407 return start;
2408
2409 if (end > self->length)
2410 end = self->length;
2411 if (end < 0)
2412 end += self->length;
2413 if (end < 0)
2414 end = 0;
2415
2416 end -= substring->length;
2417
2418 if (direction < 0) {
2419 for (; end >= start; end--)
2420 if (Py_UNICODE_MATCH(self, end, substring))
2421 return end;
2422 } else {
2423 for (; start <= end; start++)
2424 if (Py_UNICODE_MATCH(self, start, substring))
2425 return start;
2426 }
2427
2428 return -1;
2429}
2430
2431int PyUnicode_Find(PyObject *str,
2432 PyObject *substr,
2433 int start,
2434 int end,
2435 int direction)
2436{
2437 int result;
2438
2439 str = PyUnicode_FromObject(str);
2440 if (str == NULL)
2441 return -1;
2442 substr = PyUnicode_FromObject(substr);
2443 if (substr == NULL) {
2444 Py_DECREF(substr);
2445 return -1;
2446 }
2447
2448 result = findstring((PyUnicodeObject *)str,
2449 (PyUnicodeObject *)substr,
2450 start, end, direction);
2451 Py_DECREF(str);
2452 Py_DECREF(substr);
2453 return result;
2454}
2455
2456static
2457int tailmatch(PyUnicodeObject *self,
2458 PyUnicodeObject *substring,
2459 int start,
2460 int end,
2461 int direction)
2462{
2463 if (start < 0)
2464 start += self->length;
2465 if (start < 0)
2466 start = 0;
2467
2468 if (substring->length == 0)
2469 return 1;
2470
2471 if (end > self->length)
2472 end = self->length;
2473 if (end < 0)
2474 end += self->length;
2475 if (end < 0)
2476 end = 0;
2477
2478 end -= substring->length;
2479 if (end < start)
2480 return 0;
2481
2482 if (direction > 0) {
2483 if (Py_UNICODE_MATCH(self, end, substring))
2484 return 1;
2485 } else {
2486 if (Py_UNICODE_MATCH(self, start, substring))
2487 return 1;
2488 }
2489
2490 return 0;
2491}
2492
2493int PyUnicode_Tailmatch(PyObject *str,
2494 PyObject *substr,
2495 int start,
2496 int end,
2497 int direction)
2498{
2499 int result;
2500
2501 str = PyUnicode_FromObject(str);
2502 if (str == NULL)
2503 return -1;
2504 substr = PyUnicode_FromObject(substr);
2505 if (substr == NULL) {
2506 Py_DECREF(substr);
2507 return -1;
2508 }
2509
2510 result = tailmatch((PyUnicodeObject *)str,
2511 (PyUnicodeObject *)substr,
2512 start, end, direction);
2513 Py_DECREF(str);
2514 Py_DECREF(substr);
2515 return result;
2516}
2517
2518static
2519const Py_UNICODE *findchar(const Py_UNICODE *s,
2520 int size,
2521 Py_UNICODE ch)
2522{
2523 /* like wcschr, but doesn't stop at NULL characters */
2524
2525 while (size-- > 0) {
2526 if (*s == ch)
2527 return s;
2528 s++;
2529 }
2530
2531 return NULL;
2532}
2533
2534/* Apply fixfct filter to the Unicode object self and return a
2535 reference to the modified object */
2536
2537static
2538PyObject *fixup(PyUnicodeObject *self,
2539 int (*fixfct)(PyUnicodeObject *s))
2540{
2541
2542 PyUnicodeObject *u;
2543
2544 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2545 self->length);
2546 if (u == NULL)
2547 return NULL;
2548 if (!fixfct(u)) {
2549 /* fixfct should return TRUE if it modified the buffer. If
2550 FALSE, return a reference to the original buffer instead
2551 (to save space, not time) */
2552 Py_INCREF(self);
2553 Py_DECREF(u);
2554 return (PyObject*) self;
2555 }
2556 return (PyObject*) u;
2557}
2558
2559static
2560int fixupper(PyUnicodeObject *self)
2561{
2562 int len = self->length;
2563 Py_UNICODE *s = self->str;
2564 int status = 0;
2565
2566 while (len-- > 0) {
2567 register Py_UNICODE ch;
2568
2569 ch = Py_UNICODE_TOUPPER(*s);
2570 if (ch != *s) {
2571 status = 1;
2572 *s = ch;
2573 }
2574 s++;
2575 }
2576
2577 return status;
2578}
2579
2580static
2581int fixlower(PyUnicodeObject *self)
2582{
2583 int len = self->length;
2584 Py_UNICODE *s = self->str;
2585 int status = 0;
2586
2587 while (len-- > 0) {
2588 register Py_UNICODE ch;
2589
2590 ch = Py_UNICODE_TOLOWER(*s);
2591 if (ch != *s) {
2592 status = 1;
2593 *s = ch;
2594 }
2595 s++;
2596 }
2597
2598 return status;
2599}
2600
2601static
2602int fixswapcase(PyUnicodeObject *self)
2603{
2604 int len = self->length;
2605 Py_UNICODE *s = self->str;
2606 int status = 0;
2607
2608 while (len-- > 0) {
2609 if (Py_UNICODE_ISUPPER(*s)) {
2610 *s = Py_UNICODE_TOLOWER(*s);
2611 status = 1;
2612 } else if (Py_UNICODE_ISLOWER(*s)) {
2613 *s = Py_UNICODE_TOUPPER(*s);
2614 status = 1;
2615 }
2616 s++;
2617 }
2618
2619 return status;
2620}
2621
2622static
2623int fixcapitalize(PyUnicodeObject *self)
2624{
2625 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2626 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2627 return 1;
2628 }
2629 return 0;
2630}
2631
2632static
2633int fixtitle(PyUnicodeObject *self)
2634{
2635 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2636 register Py_UNICODE *e;
2637 int previous_is_cased;
2638
2639 /* Shortcut for single character strings */
2640 if (PyUnicode_GET_SIZE(self) == 1) {
2641 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2642 if (*p != ch) {
2643 *p = ch;
2644 return 1;
2645 }
2646 else
2647 return 0;
2648 }
2649
2650 e = p + PyUnicode_GET_SIZE(self);
2651 previous_is_cased = 0;
2652 for (; p < e; p++) {
2653 register const Py_UNICODE ch = *p;
2654
2655 if (previous_is_cased)
2656 *p = Py_UNICODE_TOLOWER(ch);
2657 else
2658 *p = Py_UNICODE_TOTITLE(ch);
2659
2660 if (Py_UNICODE_ISLOWER(ch) ||
2661 Py_UNICODE_ISUPPER(ch) ||
2662 Py_UNICODE_ISTITLE(ch))
2663 previous_is_cased = 1;
2664 else
2665 previous_is_cased = 0;
2666 }
2667 return 1;
2668}
2669
2670PyObject *PyUnicode_Join(PyObject *separator,
2671 PyObject *seq)
2672{
2673 Py_UNICODE *sep;
2674 int seplen;
2675 PyUnicodeObject *res = NULL;
2676 int reslen = 0;
2677 Py_UNICODE *p;
2678 int seqlen = 0;
2679 int sz = 100;
2680 int i;
2681
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002682 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 if (seqlen < 0 && PyErr_Occurred())
2684 return NULL;
2685
2686 if (separator == NULL) {
2687 Py_UNICODE blank = ' ';
2688 sep = &blank;
2689 seplen = 1;
2690 }
2691 else {
2692 separator = PyUnicode_FromObject(separator);
2693 if (separator == NULL)
2694 return NULL;
2695 sep = PyUnicode_AS_UNICODE(separator);
2696 seplen = PyUnicode_GET_SIZE(separator);
2697 }
2698
2699 res = _PyUnicode_New(sz);
2700 if (res == NULL)
2701 goto onError;
2702 p = PyUnicode_AS_UNICODE(res);
2703 reslen = 0;
2704
2705 for (i = 0; i < seqlen; i++) {
2706 int itemlen;
2707 PyObject *item;
2708
2709 item = PySequence_GetItem(seq, i);
2710 if (item == NULL)
2711 goto onError;
2712 if (!PyUnicode_Check(item)) {
2713 PyObject *v;
2714 v = PyUnicode_FromObject(item);
2715 Py_DECREF(item);
2716 item = v;
2717 if (item == NULL)
2718 goto onError;
2719 }
2720 itemlen = PyUnicode_GET_SIZE(item);
2721 while (reslen + itemlen + seplen >= sz) {
2722 if (_PyUnicode_Resize(res, sz*2))
2723 goto onError;
2724 sz *= 2;
2725 p = PyUnicode_AS_UNICODE(res) + reslen;
2726 }
2727 if (i > 0) {
2728 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2729 p += seplen;
2730 reslen += seplen;
2731 }
2732 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2733 p += itemlen;
2734 reslen += itemlen;
2735 Py_DECREF(item);
2736 }
2737 if (_PyUnicode_Resize(res, reslen))
2738 goto onError;
2739
2740 Py_XDECREF(separator);
2741 return (PyObject *)res;
2742
2743 onError:
2744 Py_XDECREF(separator);
2745 Py_DECREF(res);
2746 return NULL;
2747}
2748
2749static
2750PyUnicodeObject *pad(PyUnicodeObject *self,
2751 int left,
2752 int right,
2753 Py_UNICODE fill)
2754{
2755 PyUnicodeObject *u;
2756
2757 if (left < 0)
2758 left = 0;
2759 if (right < 0)
2760 right = 0;
2761
2762 if (left == 0 && right == 0) {
2763 Py_INCREF(self);
2764 return self;
2765 }
2766
2767 u = _PyUnicode_New(left + self->length + right);
2768 if (u) {
2769 if (left)
2770 Py_UNICODE_FILL(u->str, fill, left);
2771 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2772 if (right)
2773 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2774 }
2775
2776 return u;
2777}
2778
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expansion site must provide a `PyObject *str` scratch variable, the
   `list` being built, and an `onError` label, which is jumped to when the
   slice cannot be created or appended.  Wrapped in do/while(0) so that the
   macro behaves like a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2789
2790static
2791PyObject *split_whitespace(PyUnicodeObject *self,
2792 PyObject *list,
2793 int maxcount)
2794{
2795 register int i;
2796 register int j;
2797 int len = self->length;
2798 PyObject *str;
2799
2800 for (i = j = 0; i < len; ) {
2801 /* find a token */
2802 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2803 i++;
2804 j = i;
2805 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2806 i++;
2807 if (j < i) {
2808 if (maxcount-- <= 0)
2809 break;
2810 SPLIT_APPEND(self->str, j, i);
2811 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2812 i++;
2813 j = i;
2814 }
2815 }
2816 if (j < len) {
2817 SPLIT_APPEND(self->str, j, len);
2818 }
2819 return list;
2820
2821 onError:
2822 Py_DECREF(list);
2823 return NULL;
2824}
2825
2826PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002827 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828{
2829 register int i;
2830 register int j;
2831 int len;
2832 PyObject *list;
2833 PyObject *str;
2834 Py_UNICODE *data;
2835
2836 string = PyUnicode_FromObject(string);
2837 if (string == NULL)
2838 return NULL;
2839 data = PyUnicode_AS_UNICODE(string);
2840 len = PyUnicode_GET_SIZE(string);
2841
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 list = PyList_New(0);
2843 if (!list)
2844 goto onError;
2845
2846 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002847 int eol;
2848
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 /* Find a line and append it */
2850 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2851 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852
2853 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002854 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 if (i < len) {
2856 if (data[i] == '\r' && i + 1 < len &&
2857 data[i+1] == '\n')
2858 i += 2;
2859 else
2860 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002861 if (keepends)
2862 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 }
Guido van Rossum86662912000-04-11 15:38:46 +00002864 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 j = i;
2866 }
2867 if (j < len) {
2868 SPLIT_APPEND(data, j, len);
2869 }
2870
2871 Py_DECREF(string);
2872 return list;
2873
2874 onError:
2875 Py_DECREF(list);
2876 Py_DECREF(string);
2877 return NULL;
2878}
2879
2880static
2881PyObject *split_char(PyUnicodeObject *self,
2882 PyObject *list,
2883 Py_UNICODE ch,
2884 int maxcount)
2885{
2886 register int i;
2887 register int j;
2888 int len = self->length;
2889 PyObject *str;
2890
2891 for (i = j = 0; i < len; ) {
2892 if (self->str[i] == ch) {
2893 if (maxcount-- <= 0)
2894 break;
2895 SPLIT_APPEND(self->str, j, i);
2896 i = j = i + 1;
2897 } else
2898 i++;
2899 }
2900 if (j <= len) {
2901 SPLIT_APPEND(self->str, j, len);
2902 }
2903 return list;
2904
2905 onError:
2906 Py_DECREF(list);
2907 return NULL;
2908}
2909
2910static
2911PyObject *split_substring(PyUnicodeObject *self,
2912 PyObject *list,
2913 PyUnicodeObject *substring,
2914 int maxcount)
2915{
2916 register int i;
2917 register int j;
2918 int len = self->length;
2919 int sublen = substring->length;
2920 PyObject *str;
2921
2922 for (i = j = 0; i < len - sublen; ) {
2923 if (Py_UNICODE_MATCH(self, i, substring)) {
2924 if (maxcount-- <= 0)
2925 break;
2926 SPLIT_APPEND(self->str, j, i);
2927 i = j = i + sublen;
2928 } else
2929 i++;
2930 }
2931 if (j <= len) {
2932 SPLIT_APPEND(self->str, j, len);
2933 }
2934 return list;
2935
2936 onError:
2937 Py_DECREF(list);
2938 return NULL;
2939}
2940
2941#undef SPLIT_APPEND
2942
2943static
2944PyObject *split(PyUnicodeObject *self,
2945 PyUnicodeObject *substring,
2946 int maxcount)
2947{
2948 PyObject *list;
2949
2950 if (maxcount < 0)
2951 maxcount = INT_MAX;
2952
2953 list = PyList_New(0);
2954 if (!list)
2955 return NULL;
2956
2957 if (substring == NULL)
2958 return split_whitespace(self,list,maxcount);
2959
2960 else if (substring->length == 1)
2961 return split_char(self,list,substring->str[0],maxcount);
2962
2963 else if (substring->length == 0) {
2964 Py_DECREF(list);
2965 PyErr_SetString(PyExc_ValueError, "empty separator");
2966 return NULL;
2967 }
2968 else
2969 return split_substring(self,list,substring,maxcount);
2970}
2971
2972static
2973PyObject *strip(PyUnicodeObject *self,
2974 int left,
2975 int right)
2976{
2977 Py_UNICODE *p = self->str;
2978 int start = 0;
2979 int end = self->length;
2980
2981 if (left)
2982 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2983 start++;
2984
2985 if (right)
2986 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2987 end--;
2988
2989 if (start == 0 && end == self->length) {
2990 /* couldn't strip anything off, return original string */
2991 Py_INCREF(self);
2992 return (PyObject*) self;
2993 }
2994
2995 return (PyObject*) PyUnicode_FromUnicode(
2996 self->str + start,
2997 end - start
2998 );
2999}
3000
3001static
3002PyObject *replace(PyUnicodeObject *self,
3003 PyUnicodeObject *str1,
3004 PyUnicodeObject *str2,
3005 int maxcount)
3006{
3007 PyUnicodeObject *u;
3008
3009 if (maxcount < 0)
3010 maxcount = INT_MAX;
3011
3012 if (str1->length == 1 && str2->length == 1) {
3013 int i;
3014
3015 /* replace characters */
3016 if (!findchar(self->str, self->length, str1->str[0])) {
3017 /* nothing to replace, return original string */
3018 Py_INCREF(self);
3019 u = self;
3020 } else {
3021 Py_UNICODE u1 = str1->str[0];
3022 Py_UNICODE u2 = str2->str[0];
3023
3024 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3025 self->str,
3026 self->length
3027 );
3028 if (u)
3029 for (i = 0; i < u->length; i++)
3030 if (u->str[i] == u1) {
3031 if (--maxcount < 0)
3032 break;
3033 u->str[i] = u2;
3034 }
3035 }
3036
3037 } else {
3038 int n, i;
3039 Py_UNICODE *p;
3040
3041 /* replace strings */
3042 n = count(self, 0, self->length, str1);
3043 if (n > maxcount)
3044 n = maxcount;
3045 if (n == 0) {
3046 /* nothing to replace, return original string */
3047 Py_INCREF(self);
3048 u = self;
3049 } else {
3050 u = _PyUnicode_New(
3051 self->length + n * (str2->length - str1->length));
3052 if (u) {
3053 i = 0;
3054 p = u->str;
3055 while (i <= self->length - str1->length)
3056 if (Py_UNICODE_MATCH(self, i, str1)) {
3057 /* replace string segment */
3058 Py_UNICODE_COPY(p, str2->str, str2->length);
3059 p += str2->length;
3060 i += str1->length;
3061 if (--n <= 0) {
3062 /* copy remaining part */
3063 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3064 break;
3065 }
3066 } else
3067 *p++ = self->str[i++];
3068 }
3069 }
3070 }
3071
3072 return (PyObject *) u;
3073}
3074
3075/* --- Unicode Object Methods --------------------------------------------- */
3076
3077static char title__doc__[] =
3078"S.title() -> unicode\n\
3079\n\
3080Return a titlecased version of S, i.e. words start with title case\n\
3081characters, all remaining cased characters have lower case.";
3082
3083static PyObject*
3084unicode_title(PyUnicodeObject *self, PyObject *args)
3085{
3086 if (!PyArg_NoArgs(args))
3087 return NULL;
3088 return fixup(self, fixtitle);
3089}
3090
3091static char capitalize__doc__[] =
3092"S.capitalize() -> unicode\n\
3093\n\
3094Return a capitalized version of S, i.e. make the first character\n\
3095have upper case.";
3096
3097static PyObject*
3098unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3099{
3100 if (!PyArg_NoArgs(args))
3101 return NULL;
3102 return fixup(self, fixcapitalize);
3103}
3104
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method: split self on whitespace, capitalize each word and
   join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3145
3146static char center__doc__[] =
3147"S.center(width) -> unicode\n\
3148\n\
3149Return S centered in a Unicode string of length width. Padding is done\n\
3150using spaces.";
3151
3152static PyObject *
3153unicode_center(PyUnicodeObject *self, PyObject *args)
3154{
3155 int marg, left;
3156 int width;
3157
3158 if (!PyArg_ParseTuple(args, "i:center", &width))
3159 return NULL;
3160
3161 if (self->length >= width) {
3162 Py_INCREF(self);
3163 return (PyObject*) self;
3164 }
3165
3166 marg = width - self->length;
3167 left = marg / 2 + (marg & width & 1);
3168
3169 return (PyObject*) pad(self, left, marg - left, ' ');
3170}
3171
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003172/* speedy UTF-16 code point order comparison */
3173/* gleaned from: */
3174/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3175
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003176static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003177{
3178 0, 0, 0, 0, 0, 0, 0, 0,
3179 0, 0, 0, 0, 0, 0, 0, 0,
3180 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003181 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003182};
3183
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184static int
3185unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3186{
3187 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003188
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 Py_UNICODE *s1 = str1->str;
3190 Py_UNICODE *s2 = str2->str;
3191
3192 len1 = str1->length;
3193 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003194
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003196 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003197 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003198
3199 c1 = *s1++;
3200 c2 = *s2++;
3201 if (c1 > (1<<11) * 26)
3202 c1 += utf16Fixup[c1>>11];
3203 if (c2 > (1<<11) * 26)
3204 c2 += utf16Fixup[c2>>11];
3205
3206 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003207 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003208 if (diff)
3209 return (diff < 0) ? -1 : (diff != 0);
3210 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 }
3212
3213 return (len1 < len2) ? -1 : (len1 != len2);
3214}
3215
3216int PyUnicode_Compare(PyObject *left,
3217 PyObject *right)
3218{
3219 PyUnicodeObject *u = NULL, *v = NULL;
3220 int result;
3221
3222 /* Coerce the two arguments */
3223 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3224 if (u == NULL)
3225 goto onError;
3226 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3227 if (v == NULL)
3228 goto onError;
3229
Thomas Wouters7e474022000-07-16 12:04:32 +00003230 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 if (v == u) {
3232 Py_DECREF(u);
3233 Py_DECREF(v);
3234 return 0;
3235 }
3236
3237 result = unicode_compare(u, v);
3238
3239 Py_DECREF(u);
3240 Py_DECREF(v);
3241 return result;
3242
3243onError:
3244 Py_XDECREF(u);
3245 Py_XDECREF(v);
3246 return -1;
3247}
3248
Guido van Rossum403d68b2000-03-13 15:55:09 +00003249int PyUnicode_Contains(PyObject *container,
3250 PyObject *element)
3251{
3252 PyUnicodeObject *u = NULL, *v = NULL;
3253 int result;
3254 register const Py_UNICODE *p, *e;
3255 register Py_UNICODE ch;
3256
3257 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003258 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003259 if (v == NULL) {
3260 PyErr_SetString(PyExc_TypeError,
3261 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003262 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003263 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003264 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3265 if (u == NULL) {
3266 Py_DECREF(v);
3267 goto onError;
3268 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003269
3270 /* Check v in u */
3271 if (PyUnicode_GET_SIZE(v) != 1) {
3272 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003273 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003274 goto onError;
3275 }
3276 ch = *PyUnicode_AS_UNICODE(v);
3277 p = PyUnicode_AS_UNICODE(u);
3278 e = p + PyUnicode_GET_SIZE(u);
3279 result = 0;
3280 while (p < e) {
3281 if (*p++ == ch) {
3282 result = 1;
3283 break;
3284 }
3285 }
3286
3287 Py_DECREF(u);
3288 Py_DECREF(v);
3289 return result;
3290
3291onError:
3292 Py_XDECREF(u);
3293 Py_XDECREF(v);
3294 return -1;
3295}
3296
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297/* Concat to string or Unicode object giving a new Unicode object. */
3298
3299PyObject *PyUnicode_Concat(PyObject *left,
3300 PyObject *right)
3301{
3302 PyUnicodeObject *u = NULL, *v = NULL, *w;
3303
3304 /* Coerce the two arguments */
3305 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3306 if (u == NULL)
3307 goto onError;
3308 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3309 if (v == NULL)
3310 goto onError;
3311
3312 /* Shortcuts */
3313 if (v == unicode_empty) {
3314 Py_DECREF(v);
3315 return (PyObject *)u;
3316 }
3317 if (u == unicode_empty) {
3318 Py_DECREF(u);
3319 return (PyObject *)v;
3320 }
3321
3322 /* Concat the two Unicode strings */
3323 w = _PyUnicode_New(u->length + v->length);
3324 if (w == NULL)
3325 goto onError;
3326 Py_UNICODE_COPY(w->str, u->str, u->length);
3327 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3328
3329 Py_DECREF(u);
3330 Py_DECREF(v);
3331 return (PyObject *)w;
3332
3333onError:
3334 Py_XDECREF(u);
3335 Py_XDECREF(v);
3336 return NULL;
3337}
3338
3339static char count__doc__[] =
3340"S.count(sub[, start[, end]]) -> int\n\
3341\n\
3342Return the number of occurrences of substring sub in Unicode string\n\
3343S[start:end]. Optional arguments start and end are\n\
3344interpreted as in slice notation.";
3345
3346static PyObject *
3347unicode_count(PyUnicodeObject *self, PyObject *args)
3348{
3349 PyUnicodeObject *substring;
3350 int start = 0;
3351 int end = INT_MAX;
3352 PyObject *result;
3353
Guido van Rossumb8872e62000-05-09 14:14:27 +00003354 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3355 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 return NULL;
3357
3358 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3359 (PyObject *)substring);
3360 if (substring == NULL)
3361 return NULL;
3362
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 if (start < 0)
3364 start += self->length;
3365 if (start < 0)
3366 start = 0;
3367 if (end > self->length)
3368 end = self->length;
3369 if (end < 0)
3370 end += self->length;
3371 if (end < 0)
3372 end = 0;
3373
3374 result = PyInt_FromLong((long) count(self, start, end, substring));
3375
3376 Py_DECREF(substring);
3377 return result;
3378}
3379
3380static char encode__doc__[] =
3381"S.encode([encoding[,errors]]) -> string\n\
3382\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003383Return an encoded string version of S. Default encoding is the current\n\
3384default string encoding. errors may be given to set a different error\n\
3385handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3386a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387
3388static PyObject *
3389unicode_encode(PyUnicodeObject *self, PyObject *args)
3390{
3391 char *encoding = NULL;
3392 char *errors = NULL;
3393 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3394 return NULL;
3395 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3396}
3397
3398static char expandtabs__doc__[] =
3399"S.expandtabs([tabsize]) -> unicode\n\
3400\n\
3401Return a copy of S where all tab characters are expanded using spaces.\n\
3402If tabsize is not given, a tab size of 8 characters is assumed.";
3403
3404static PyObject*
3405unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3406{
3407 Py_UNICODE *e;
3408 Py_UNICODE *p;
3409 Py_UNICODE *q;
3410 int i, j;
3411 PyUnicodeObject *u;
3412 int tabsize = 8;
3413
3414 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3415 return NULL;
3416
Thomas Wouters7e474022000-07-16 12:04:32 +00003417 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 i = j = 0;
3419 e = self->str + self->length;
3420 for (p = self->str; p < e; p++)
3421 if (*p == '\t') {
3422 if (tabsize > 0)
3423 j += tabsize - (j % tabsize);
3424 }
3425 else {
3426 j++;
3427 if (*p == '\n' || *p == '\r') {
3428 i += j;
3429 j = 0;
3430 }
3431 }
3432
3433 /* Second pass: create output string and fill it */
3434 u = _PyUnicode_New(i + j);
3435 if (!u)
3436 return NULL;
3437
3438 j = 0;
3439 q = u->str;
3440
3441 for (p = self->str; p < e; p++)
3442 if (*p == '\t') {
3443 if (tabsize > 0) {
3444 i = tabsize - (j % tabsize);
3445 j += i;
3446 while (i--)
3447 *q++ = ' ';
3448 }
3449 }
3450 else {
3451 j++;
3452 *q++ = *p;
3453 if (*p == '\n' || *p == '\r')
3454 j = 0;
3455 }
3456
3457 return (PyObject*) u;
3458}
3459
3460static char find__doc__[] =
3461"S.find(sub [,start [,end]]) -> int\n\
3462\n\
3463Return the lowest index in S where substring sub is found,\n\
3464such that sub is contained within s[start,end]. Optional\n\
3465arguments start and end are interpreted as in slice notation.\n\
3466\n\
3467Return -1 on failure.";
3468
3469static PyObject *
3470unicode_find(PyUnicodeObject *self, PyObject *args)
3471{
3472 PyUnicodeObject *substring;
3473 int start = 0;
3474 int end = INT_MAX;
3475 PyObject *result;
3476
Guido van Rossumb8872e62000-05-09 14:14:27 +00003477 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3478 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 return NULL;
3480 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3481 (PyObject *)substring);
3482 if (substring == NULL)
3483 return NULL;
3484
3485 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3486
3487 Py_DECREF(substring);
3488 return result;
3489}
3490
3491static PyObject *
3492unicode_getitem(PyUnicodeObject *self, int index)
3493{
3494 if (index < 0 || index >= self->length) {
3495 PyErr_SetString(PyExc_IndexError, "string index out of range");
3496 return NULL;
3497 }
3498
3499 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3500}
3501
3502static long
3503unicode_hash(PyUnicodeObject *self)
3504{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003505 /* Since Unicode objects compare equal to their ASCII string
3506 counterparts, they should use the individual character values
3507 as basis for their hash value. This is needed to assure that
3508 strings and Unicode objects behave in the same way as
3509 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510
Fredrik Lundhdde61642000-07-10 18:27:47 +00003511 register int len;
3512 register Py_UNICODE *p;
3513 register long x;
3514
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 if (self->hash != -1)
3516 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003517 len = PyUnicode_GET_SIZE(self);
3518 p = PyUnicode_AS_UNICODE(self);
3519 x = *p << 7;
3520 while (--len >= 0)
3521 x = (1000003*x) ^ *p++;
3522 x ^= PyUnicode_GET_SIZE(self);
3523 if (x == -1)
3524 x = -2;
3525 self->hash = x;
3526 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527}
3528
3529static char index__doc__[] =
3530"S.index(sub [,start [,end]]) -> int\n\
3531\n\
3532Like S.find() but raise ValueError when the substring is not found.";
3533
3534static PyObject *
3535unicode_index(PyUnicodeObject *self, PyObject *args)
3536{
3537 int result;
3538 PyUnicodeObject *substring;
3539 int start = 0;
3540 int end = INT_MAX;
3541
Guido van Rossumb8872e62000-05-09 14:14:27 +00003542 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3543 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 return NULL;
3545
3546 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3547 (PyObject *)substring);
3548 if (substring == NULL)
3549 return NULL;
3550
3551 result = findstring(self, substring, start, end, 1);
3552
3553 Py_DECREF(substring);
3554 if (result < 0) {
3555 PyErr_SetString(PyExc_ValueError, "substring not found");
3556 return NULL;
3557 }
3558 return PyInt_FromLong(result);
3559}
3560
3561static char islower__doc__[] =
3562"S.islower() -> int\n\
3563\n\
3564Return 1 if all cased characters in S are lowercase and there is\n\
3565at least one cased character in S, 0 otherwise.";
3566
3567static PyObject*
3568unicode_islower(PyUnicodeObject *self, PyObject *args)
3569{
3570 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3571 register const Py_UNICODE *e;
3572 int cased;
3573
3574 if (!PyArg_NoArgs(args))
3575 return NULL;
3576
3577 /* Shortcut for single character strings */
3578 if (PyUnicode_GET_SIZE(self) == 1)
3579 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3580
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003581 /* Special case for empty strings */
3582 if (PyString_GET_SIZE(self) == 0)
3583 return PyInt_FromLong(0);
3584
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 e = p + PyUnicode_GET_SIZE(self);
3586 cased = 0;
3587 for (; p < e; p++) {
3588 register const Py_UNICODE ch = *p;
3589
3590 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3591 return PyInt_FromLong(0);
3592 else if (!cased && Py_UNICODE_ISLOWER(ch))
3593 cased = 1;
3594 }
3595 return PyInt_FromLong(cased);
3596}
3597
3598static char isupper__doc__[] =
3599"S.isupper() -> int\n\
3600\n\
3601Return 1 if all cased characters in S are uppercase and there is\n\
3602at least one cased character in S, 0 otherwise.";
3603
3604static PyObject*
3605unicode_isupper(PyUnicodeObject *self, PyObject *args)
3606{
3607 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3608 register const Py_UNICODE *e;
3609 int cased;
3610
3611 if (!PyArg_NoArgs(args))
3612 return NULL;
3613
3614 /* Shortcut for single character strings */
3615 if (PyUnicode_GET_SIZE(self) == 1)
3616 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3617
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003618 /* Special case for empty strings */
3619 if (PyString_GET_SIZE(self) == 0)
3620 return PyInt_FromLong(0);
3621
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 e = p + PyUnicode_GET_SIZE(self);
3623 cased = 0;
3624 for (; p < e; p++) {
3625 register const Py_UNICODE ch = *p;
3626
3627 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3628 return PyInt_FromLong(0);
3629 else if (!cased && Py_UNICODE_ISUPPER(ch))
3630 cased = 1;
3631 }
3632 return PyInt_FromLong(cased);
3633}
3634
3635static char istitle__doc__[] =
3636"S.istitle() -> int\n\
3637\n\
3638Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3639may only follow uncased characters and lowercase characters only cased\n\
3640ones. Return 0 otherwise.";
3641
3642static PyObject*
3643unicode_istitle(PyUnicodeObject *self, PyObject *args)
3644{
3645 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3646 register const Py_UNICODE *e;
3647 int cased, previous_is_cased;
3648
3649 if (!PyArg_NoArgs(args))
3650 return NULL;
3651
3652 /* Shortcut for single character strings */
3653 if (PyUnicode_GET_SIZE(self) == 1)
3654 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3655 (Py_UNICODE_ISUPPER(*p) != 0));
3656
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003657 /* Special case for empty strings */
3658 if (PyString_GET_SIZE(self) == 0)
3659 return PyInt_FromLong(0);
3660
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 e = p + PyUnicode_GET_SIZE(self);
3662 cased = 0;
3663 previous_is_cased = 0;
3664 for (; p < e; p++) {
3665 register const Py_UNICODE ch = *p;
3666
3667 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3668 if (previous_is_cased)
3669 return PyInt_FromLong(0);
3670 previous_is_cased = 1;
3671 cased = 1;
3672 }
3673 else if (Py_UNICODE_ISLOWER(ch)) {
3674 if (!previous_is_cased)
3675 return PyInt_FromLong(0);
3676 previous_is_cased = 1;
3677 cased = 1;
3678 }
3679 else
3680 previous_is_cased = 0;
3681 }
3682 return PyInt_FromLong(cased);
3683}
3684
3685static char isspace__doc__[] =
3686"S.isspace() -> int\n\
3687\n\
3688Return 1 if there are only whitespace characters in S,\n\
36890 otherwise.";
3690
3691static PyObject*
3692unicode_isspace(PyUnicodeObject *self, PyObject *args)
3693{
3694 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3695 register const Py_UNICODE *e;
3696
3697 if (!PyArg_NoArgs(args))
3698 return NULL;
3699
3700 /* Shortcut for single character strings */
3701 if (PyUnicode_GET_SIZE(self) == 1 &&
3702 Py_UNICODE_ISSPACE(*p))
3703 return PyInt_FromLong(1);
3704
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003705 /* Special case for empty strings */
3706 if (PyString_GET_SIZE(self) == 0)
3707 return PyInt_FromLong(0);
3708
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 e = p + PyUnicode_GET_SIZE(self);
3710 for (; p < e; p++) {
3711 if (!Py_UNICODE_ISSPACE(*p))
3712 return PyInt_FromLong(0);
3713 }
3714 return PyInt_FromLong(1);
3715}
3716
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003717static char isalpha__doc__[] =
3718"S.isalpha() -> int\n\
3719\n\
3720Return 1 if all characters in S are alphabetic\n\
3721and there is at least one character in S, 0 otherwise.";
3722
3723static PyObject*
3724unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3725{
3726 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3727 register const Py_UNICODE *e;
3728
3729 if (!PyArg_NoArgs(args))
3730 return NULL;
3731
3732 /* Shortcut for single character strings */
3733 if (PyUnicode_GET_SIZE(self) == 1 &&
3734 Py_UNICODE_ISALPHA(*p))
3735 return PyInt_FromLong(1);
3736
3737 /* Special case for empty strings */
3738 if (PyString_GET_SIZE(self) == 0)
3739 return PyInt_FromLong(0);
3740
3741 e = p + PyUnicode_GET_SIZE(self);
3742 for (; p < e; p++) {
3743 if (!Py_UNICODE_ISALPHA(*p))
3744 return PyInt_FromLong(0);
3745 }
3746 return PyInt_FromLong(1);
3747}
3748
3749static char isalnum__doc__[] =
3750"S.isalnum() -> int\n\
3751\n\
3752Return 1 if all characters in S are alphanumeric\n\
3753and there is at least one character in S, 0 otherwise.";
3754
3755static PyObject*
3756unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3757{
3758 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3759 register const Py_UNICODE *e;
3760
3761 if (!PyArg_NoArgs(args))
3762 return NULL;
3763
3764 /* Shortcut for single character strings */
3765 if (PyUnicode_GET_SIZE(self) == 1 &&
3766 Py_UNICODE_ISALNUM(*p))
3767 return PyInt_FromLong(1);
3768
3769 /* Special case for empty strings */
3770 if (PyString_GET_SIZE(self) == 0)
3771 return PyInt_FromLong(0);
3772
3773 e = p + PyUnicode_GET_SIZE(self);
3774 for (; p < e; p++) {
3775 if (!Py_UNICODE_ISALNUM(*p))
3776 return PyInt_FromLong(0);
3777 }
3778 return PyInt_FromLong(1);
3779}
3780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781static char isdecimal__doc__[] =
3782"S.isdecimal() -> int\n\
3783\n\
3784Return 1 if there are only decimal characters in S,\n\
37850 otherwise.";
3786
3787static PyObject*
3788unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3789{
3790 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3791 register const Py_UNICODE *e;
3792
3793 if (!PyArg_NoArgs(args))
3794 return NULL;
3795
3796 /* Shortcut for single character strings */
3797 if (PyUnicode_GET_SIZE(self) == 1 &&
3798 Py_UNICODE_ISDECIMAL(*p))
3799 return PyInt_FromLong(1);
3800
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003801 /* Special case for empty strings */
3802 if (PyString_GET_SIZE(self) == 0)
3803 return PyInt_FromLong(0);
3804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 e = p + PyUnicode_GET_SIZE(self);
3806 for (; p < e; p++) {
3807 if (!Py_UNICODE_ISDECIMAL(*p))
3808 return PyInt_FromLong(0);
3809 }
3810 return PyInt_FromLong(1);
3811}
3812
3813static char isdigit__doc__[] =
3814"S.isdigit() -> int\n\
3815\n\
3816Return 1 if there are only digit characters in S,\n\
38170 otherwise.";
3818
3819static PyObject*
3820unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3821{
3822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3823 register const Py_UNICODE *e;
3824
3825 if (!PyArg_NoArgs(args))
3826 return NULL;
3827
3828 /* Shortcut for single character strings */
3829 if (PyUnicode_GET_SIZE(self) == 1 &&
3830 Py_UNICODE_ISDIGIT(*p))
3831 return PyInt_FromLong(1);
3832
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003833 /* Special case for empty strings */
3834 if (PyString_GET_SIZE(self) == 0)
3835 return PyInt_FromLong(0);
3836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 e = p + PyUnicode_GET_SIZE(self);
3838 for (; p < e; p++) {
3839 if (!Py_UNICODE_ISDIGIT(*p))
3840 return PyInt_FromLong(0);
3841 }
3842 return PyInt_FromLong(1);
3843}
3844
3845static char isnumeric__doc__[] =
3846"S.isnumeric() -> int\n\
3847\n\
3848Return 1 if there are only numeric characters in S,\n\
38490 otherwise.";
3850
3851static PyObject*
3852unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3853{
3854 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3855 register const Py_UNICODE *e;
3856
3857 if (!PyArg_NoArgs(args))
3858 return NULL;
3859
3860 /* Shortcut for single character strings */
3861 if (PyUnicode_GET_SIZE(self) == 1 &&
3862 Py_UNICODE_ISNUMERIC(*p))
3863 return PyInt_FromLong(1);
3864
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003865 /* Special case for empty strings */
3866 if (PyString_GET_SIZE(self) == 0)
3867 return PyInt_FromLong(0);
3868
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 e = p + PyUnicode_GET_SIZE(self);
3870 for (; p < e; p++) {
3871 if (!Py_UNICODE_ISNUMERIC(*p))
3872 return PyInt_FromLong(0);
3873 }
3874 return PyInt_FromLong(1);
3875}
3876
3877static char join__doc__[] =
3878"S.join(sequence) -> unicode\n\
3879\n\
3880Return a string which is the concatenation of the strings in the\n\
3881sequence. The separator between elements is S.";
3882
3883static PyObject*
3884unicode_join(PyUnicodeObject *self, PyObject *args)
3885{
3886 PyObject *data;
3887 if (!PyArg_ParseTuple(args, "O:join", &data))
3888 return NULL;
3889
3890 return PyUnicode_Join((PyObject *)self, data);
3891}
3892
3893static int
3894unicode_length(PyUnicodeObject *self)
3895{
3896 return self->length;
3897}
3898
3899static char ljust__doc__[] =
3900"S.ljust(width) -> unicode\n\
3901\n\
3902Return S left justified in a Unicode string of length width. Padding is\n\
3903done using spaces.";
3904
3905static PyObject *
3906unicode_ljust(PyUnicodeObject *self, PyObject *args)
3907{
3908 int width;
3909 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3910 return NULL;
3911
3912 if (self->length >= width) {
3913 Py_INCREF(self);
3914 return (PyObject*) self;
3915 }
3916
3917 return (PyObject*) pad(self, 0, width - self->length, ' ');
3918}
3919
3920static char lower__doc__[] =
3921"S.lower() -> unicode\n\
3922\n\
3923Return a copy of the string S converted to lowercase.";
3924
3925static PyObject*
3926unicode_lower(PyUnicodeObject *self, PyObject *args)
3927{
3928 if (!PyArg_NoArgs(args))
3929 return NULL;
3930 return fixup(self, fixlower);
3931}
3932
3933static char lstrip__doc__[] =
3934"S.lstrip() -> unicode\n\
3935\n\
3936Return a copy of the string S with leading whitespace removed.";
3937
3938static PyObject *
3939unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3940{
3941 if (!PyArg_NoArgs(args))
3942 return NULL;
3943 return strip(self, 1, 0);
3944}
3945
3946static PyObject*
3947unicode_repeat(PyUnicodeObject *str, int len)
3948{
3949 PyUnicodeObject *u;
3950 Py_UNICODE *p;
3951
3952 if (len < 0)
3953 len = 0;
3954
3955 if (len == 1) {
3956 /* no repeat, return original string */
3957 Py_INCREF(str);
3958 return (PyObject*) str;
3959 }
3960
3961 u = _PyUnicode_New(len * str->length);
3962 if (!u)
3963 return NULL;
3964
3965 p = u->str;
3966
3967 while (len-- > 0) {
3968 Py_UNICODE_COPY(p, str->str, str->length);
3969 p += str->length;
3970 }
3971
3972 return (PyObject*) u;
3973}
3974
3975PyObject *PyUnicode_Replace(PyObject *obj,
3976 PyObject *subobj,
3977 PyObject *replobj,
3978 int maxcount)
3979{
3980 PyObject *self;
3981 PyObject *str1;
3982 PyObject *str2;
3983 PyObject *result;
3984
3985 self = PyUnicode_FromObject(obj);
3986 if (self == NULL)
3987 return NULL;
3988 str1 = PyUnicode_FromObject(subobj);
3989 if (str1 == NULL) {
3990 Py_DECREF(self);
3991 return NULL;
3992 }
3993 str2 = PyUnicode_FromObject(replobj);
3994 if (str2 == NULL) {
3995 Py_DECREF(self);
3996 Py_DECREF(str1);
3997 return NULL;
3998 }
3999 result = replace((PyUnicodeObject *)self,
4000 (PyUnicodeObject *)str1,
4001 (PyUnicodeObject *)str2,
4002 maxcount);
4003 Py_DECREF(self);
4004 Py_DECREF(str1);
4005 Py_DECREF(str2);
4006 return result;
4007}
4008
4009static char replace__doc__[] =
4010"S.replace (old, new[, maxsplit]) -> unicode\n\
4011\n\
4012Return a copy of S with all occurrences of substring\n\
4013old replaced by new. If the optional argument maxsplit is\n\
4014given, only the first maxsplit occurrences are replaced.";
4015
4016static PyObject*
4017unicode_replace(PyUnicodeObject *self, PyObject *args)
4018{
4019 PyUnicodeObject *str1;
4020 PyUnicodeObject *str2;
4021 int maxcount = -1;
4022 PyObject *result;
4023
4024 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4025 return NULL;
4026 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4027 if (str1 == NULL)
4028 return NULL;
4029 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4030 if (str2 == NULL)
4031 return NULL;
4032
4033 result = replace(self, str1, str2, maxcount);
4034
4035 Py_DECREF(str1);
4036 Py_DECREF(str2);
4037 return result;
4038}
4039
4040static
4041PyObject *unicode_repr(PyObject *unicode)
4042{
4043 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4044 PyUnicode_GET_SIZE(unicode),
4045 1);
4046}
4047
4048static char rfind__doc__[] =
4049"S.rfind(sub [,start [,end]]) -> int\n\
4050\n\
4051Return the highest index in S where substring sub is found,\n\
4052such that sub is contained within s[start,end]. Optional\n\
4053arguments start and end are interpreted as in slice notation.\n\
4054\n\
4055Return -1 on failure.";
4056
4057static PyObject *
4058unicode_rfind(PyUnicodeObject *self, PyObject *args)
4059{
4060 PyUnicodeObject *substring;
4061 int start = 0;
4062 int end = INT_MAX;
4063 PyObject *result;
4064
Guido van Rossumb8872e62000-05-09 14:14:27 +00004065 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4066 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 return NULL;
4068 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4069 (PyObject *)substring);
4070 if (substring == NULL)
4071 return NULL;
4072
4073 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4074
4075 Py_DECREF(substring);
4076 return result;
4077}
4078
4079static char rindex__doc__[] =
4080"S.rindex(sub [,start [,end]]) -> int\n\
4081\n\
4082Like S.rfind() but raise ValueError when the substring is not found.";
4083
4084static PyObject *
4085unicode_rindex(PyUnicodeObject *self, PyObject *args)
4086{
4087 int result;
4088 PyUnicodeObject *substring;
4089 int start = 0;
4090 int end = INT_MAX;
4091
Guido van Rossumb8872e62000-05-09 14:14:27 +00004092 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4093 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 return NULL;
4095 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4096 (PyObject *)substring);
4097 if (substring == NULL)
4098 return NULL;
4099
4100 result = findstring(self, substring, start, end, -1);
4101
4102 Py_DECREF(substring);
4103 if (result < 0) {
4104 PyErr_SetString(PyExc_ValueError, "substring not found");
4105 return NULL;
4106 }
4107 return PyInt_FromLong(result);
4108}
4109
4110static char rjust__doc__[] =
4111"S.rjust(width) -> unicode\n\
4112\n\
4113Return S right justified in a Unicode string of length width. Padding is\n\
4114done using spaces.";
4115
4116static PyObject *
4117unicode_rjust(PyUnicodeObject *self, PyObject *args)
4118{
4119 int width;
4120 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4121 return NULL;
4122
4123 if (self->length >= width) {
4124 Py_INCREF(self);
4125 return (PyObject*) self;
4126 }
4127
4128 return (PyObject*) pad(self, width - self->length, 0, ' ');
4129}
4130
4131static char rstrip__doc__[] =
4132"S.rstrip() -> unicode\n\
4133\n\
4134Return a copy of the string S with trailing whitespace removed.";
4135
4136static PyObject *
4137unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4138{
4139 if (!PyArg_NoArgs(args))
4140 return NULL;
4141 return strip(self, 0, 1);
4142}
4143
4144static PyObject*
4145unicode_slice(PyUnicodeObject *self, int start, int end)
4146{
4147 /* standard clamping */
4148 if (start < 0)
4149 start = 0;
4150 if (end < 0)
4151 end = 0;
4152 if (end > self->length)
4153 end = self->length;
4154 if (start == 0 && end == self->length) {
4155 /* full slice, return original string */
4156 Py_INCREF(self);
4157 return (PyObject*) self;
4158 }
4159 if (start > end)
4160 start = end;
4161 /* copy slice */
4162 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4163 end - start);
4164}
4165
4166PyObject *PyUnicode_Split(PyObject *s,
4167 PyObject *sep,
4168 int maxsplit)
4169{
4170 PyObject *result;
4171
4172 s = PyUnicode_FromObject(s);
4173 if (s == NULL)
4174 return NULL;
4175 if (sep != NULL) {
4176 sep = PyUnicode_FromObject(sep);
4177 if (sep == NULL) {
4178 Py_DECREF(s);
4179 return NULL;
4180 }
4181 }
4182
4183 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4184
4185 Py_DECREF(s);
4186 Py_XDECREF(sep);
4187 return result;
4188}
4189
4190static char split__doc__[] =
4191"S.split([sep [,maxsplit]]) -> list of strings\n\
4192\n\
4193Return a list of the words in S, using sep as the\n\
4194delimiter string. If maxsplit is given, at most maxsplit\n\
4195splits are done. If sep is not specified, any whitespace string\n\
4196is a separator.";
4197
4198static PyObject*
4199unicode_split(PyUnicodeObject *self, PyObject *args)
4200{
4201 PyObject *substring = Py_None;
4202 int maxcount = -1;
4203
4204 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4205 return NULL;
4206
4207 if (substring == Py_None)
4208 return split(self, NULL, maxcount);
4209 else if (PyUnicode_Check(substring))
4210 return split(self, (PyUnicodeObject *)substring, maxcount);
4211 else
4212 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4213}
4214
4215static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004216"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217\n\
4218Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004219Line breaks are not included in the resulting list unless keepends\n\
4220is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221
4222static PyObject*
4223unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4224{
Guido van Rossum86662912000-04-11 15:38:46 +00004225 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226
Guido van Rossum86662912000-04-11 15:38:46 +00004227 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 return NULL;
4229
Guido van Rossum86662912000-04-11 15:38:46 +00004230 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231}
4232
4233static
4234PyObject *unicode_str(PyUnicodeObject *self)
4235{
Fred Drakee4315f52000-05-09 19:53:39 +00004236 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237}
4238
4239static char strip__doc__[] =
4240"S.strip() -> unicode\n\
4241\n\
4242Return a copy of S with leading and trailing whitespace removed.";
4243
4244static PyObject *
4245unicode_strip(PyUnicodeObject *self, PyObject *args)
4246{
4247 if (!PyArg_NoArgs(args))
4248 return NULL;
4249 return strip(self, 1, 1);
4250}
4251
4252static char swapcase__doc__[] =
4253"S.swapcase() -> unicode\n\
4254\n\
4255Return a copy of S with uppercase characters converted to lowercase\n\
4256and vice versa.";
4257
4258static PyObject*
4259unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4260{
4261 if (!PyArg_NoArgs(args))
4262 return NULL;
4263 return fixup(self, fixswapcase);
4264}
4265
4266static char translate__doc__[] =
4267"S.translate(table) -> unicode\n\
4268\n\
4269Return a copy of the string S, where all characters have been mapped\n\
4270through the given translation table, which must be a mapping of\n\
4271Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4272are left untouched. Characters mapped to None are deleted.";
4273
4274static PyObject*
4275unicode_translate(PyUnicodeObject *self, PyObject *args)
4276{
4277 PyObject *table;
4278
4279 if (!PyArg_ParseTuple(args, "O:translate", &table))
4280 return NULL;
4281 return PyUnicode_TranslateCharmap(self->str,
4282 self->length,
4283 table,
4284 "ignore");
4285}
4286
4287static char upper__doc__[] =
4288"S.upper() -> unicode\n\
4289\n\
4290Return a copy of S converted to uppercase.";
4291
4292static PyObject*
4293unicode_upper(PyUnicodeObject *self, PyObject *args)
4294{
4295 if (!PyArg_NoArgs(args))
4296 return NULL;
4297 return fixup(self, fixupper);
4298}
4299
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width): currently compiled out.

   Left-pads self with '0' up to width via pad().  If the original
   string began with '+' or '-', the sign is moved in front of the
   zeros.  Returns self itself when it is already at least width long,
   and NULL if pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() may fail; do not touch u->str in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4335
#if 0
/* Currently compiled out: return the value of unicode_freelist_size
   as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4345
4346static char startswith__doc__[] =
4347"S.startswith(prefix[, start[, end]]) -> int\n\
4348\n\
4349Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4350optional start, test S beginning at that position. With optional end, stop\n\
4351comparing S at that position.";
4352
4353static PyObject *
4354unicode_startswith(PyUnicodeObject *self,
4355 PyObject *args)
4356{
4357 PyUnicodeObject *substring;
4358 int start = 0;
4359 int end = INT_MAX;
4360 PyObject *result;
4361
Guido van Rossumb8872e62000-05-09 14:14:27 +00004362 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4363 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 return NULL;
4365 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4366 (PyObject *)substring);
4367 if (substring == NULL)
4368 return NULL;
4369
4370 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4371
4372 Py_DECREF(substring);
4373 return result;
4374}
4375
4376
4377static char endswith__doc__[] =
4378"S.endswith(suffix[, start[, end]]) -> int\n\
4379\n\
4380Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4381optional start, test S beginning at that position. With optional end, stop\n\
4382comparing S at that position.";
4383
4384static PyObject *
4385unicode_endswith(PyUnicodeObject *self,
4386 PyObject *args)
4387{
4388 PyUnicodeObject *substring;
4389 int start = 0;
4390 int end = INT_MAX;
4391 PyObject *result;
4392
Guido van Rossumb8872e62000-05-09 14:14:27 +00004393 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4394 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 return NULL;
4396 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4397 (PyObject *)substring);
4398 if (substring == NULL)
4399 return NULL;
4400
4401 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4402
4403 Py_DECREF(substring);
4404 return result;
4405}
4406
4407
4408static PyMethodDef unicode_methods[] = {
4409
4410 /* Order is according to common usage: often used methods should
4411 appear first, since lookup is done sequentially. */
4412
4413 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4414 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4415 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4416 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4417 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4418 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4419 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4420 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4421 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4422 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4423 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4424 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4425 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4426 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4427/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4428 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4429 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4430 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4431 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4432 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4433 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4434 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4435 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4436 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4437 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4438 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4439 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4440 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4441 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4442 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4443 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4444 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4445 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004446 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4447 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448#if 0
4449 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4450 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4451#endif
4452
4453#if 0
4454 /* This one is just used for debugging the implementation. */
4455 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4456#endif
4457
4458 {NULL, NULL}
4459};
4460
4461static PyObject *
4462unicode_getattr(PyUnicodeObject *self, char *name)
4463{
4464 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4465}
4466
4467static PySequenceMethods unicode_as_sequence = {
4468 (inquiry) unicode_length, /* sq_length */
4469 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4470 (intargfunc) unicode_repeat, /* sq_repeat */
4471 (intargfunc) unicode_getitem, /* sq_item */
4472 (intintargfunc) unicode_slice, /* sq_slice */
4473 0, /* sq_ass_item */
4474 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004475 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476};
4477
4478static int
4479unicode_buffer_getreadbuf(PyUnicodeObject *self,
4480 int index,
4481 const void **ptr)
4482{
4483 if (index != 0) {
4484 PyErr_SetString(PyExc_SystemError,
4485 "accessing non-existent unicode segment");
4486 return -1;
4487 }
4488 *ptr = (void *) self->str;
4489 return PyUnicode_GET_DATA_SIZE(self);
4490}
4491
4492static int
4493unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4494 const void **ptr)
4495{
4496 PyErr_SetString(PyExc_TypeError,
4497 "cannot use unicode as modifyable buffer");
4498 return -1;
4499}
4500
4501static int
4502unicode_buffer_getsegcount(PyUnicodeObject *self,
4503 int *lenp)
4504{
4505 if (lenp)
4506 *lenp = PyUnicode_GET_DATA_SIZE(self);
4507 return 1;
4508}
4509
4510static int
4511unicode_buffer_getcharbuf(PyUnicodeObject *self,
4512 int index,
4513 const void **ptr)
4514{
4515 PyObject *str;
4516
4517 if (index != 0) {
4518 PyErr_SetString(PyExc_SystemError,
4519 "accessing non-existent unicode segment");
4520 return -1;
4521 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004522 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 if (str == NULL)
4524 return -1;
4525 *ptr = (void *) PyString_AS_STRING(str);
4526 return PyString_GET_SIZE(str);
4527}
4528
4529/* Helpers for PyUnicode_Format() */
4530
4531static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004532getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533{
4534 int argidx = *p_argidx;
4535 if (argidx < arglen) {
4536 (*p_argidx)++;
4537 if (arglen < 0)
4538 return args;
4539 else
4540 return PyTuple_GetItem(args, argidx);
4541 }
4542 PyErr_SetString(PyExc_TypeError,
4543 "not enough arguments for format string");
4544 return NULL;
4545}
4546
/* Bit flags collected while parsing a format specifier; one bit each
   for the '-', '+', ' ', '#' and '0' flag characters. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4552
4553static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555{
4556 register int i;
4557 int len;
4558 va_list va;
4559 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561
4562 /* First, format the string as char array, then expand to Py_UNICODE
4563 array. */
4564 charbuffer = (char *)buffer;
4565 len = vsprintf(charbuffer, format, va);
4566 for (i = len - 1; i >= 0; i--)
4567 buffer[i] = (Py_UNICODE) charbuffer[i];
4568
4569 va_end(va);
4570 return len;
4571}
4572
4573static int
4574formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004575 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 int flags,
4577 int prec,
4578 int type,
4579 PyObject *v)
4580{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004581 /* fmt = '%#.' + `prec` + `type`
4582 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 char fmt[20];
4584 double x;
4585
4586 x = PyFloat_AsDouble(v);
4587 if (x == -1.0 && PyErr_Occurred())
4588 return -1;
4589 if (prec < 0)
4590 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4592 type = 'g';
4593 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004594 /* worst case length calc to ensure no buffer overrun:
4595 fmt = %#.<prec>g
4596 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4597 for any double rep.)
4598 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4599 If prec=0 the effective precision is 1 (the leading digit is
4600 always given), therefore increase by one to 10+prec. */
4601 if (buflen <= (size_t)10 + (size_t)prec) {
4602 PyErr_SetString(PyExc_OverflowError,
4603 "formatted float is too long (precision too long?)");
4604 return -1;
4605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 return usprintf(buf, fmt, x);
4607}
4608
4609static int
4610formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004611 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 int flags,
4613 int prec,
4614 int type,
4615 PyObject *v)
4616{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004617 /* fmt = '%#.' + `prec` + 'l' + `type`
4618 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 char fmt[20];
4620 long x;
4621
4622 x = PyInt_AsLong(v);
4623 if (x == -1 && PyErr_Occurred())
4624 return -1;
4625 if (prec < 0)
4626 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004627 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4628 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4629 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4630 PyErr_SetString(PyExc_OverflowError,
4631 "formatted integer is too long (precision too long?)");
4632 return -1;
4633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4635 return usprintf(buf, fmt, x);
4636}
4637
4638static int
4639formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004640 size_t buflen,
4641 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004643 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004644 if (PyUnicode_Check(v)) {
4645 if (PyUnicode_GET_SIZE(v) != 1)
4646 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004650 else if (PyString_Check(v)) {
4651 if (PyString_GET_SIZE(v) != 1)
4652 goto onError;
4653 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655
4656 else {
4657 /* Integer input truncated to a character */
4658 long x;
4659 x = PyInt_AsLong(v);
4660 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 buf[0] = (char) x;
4663 }
4664 buf[1] = '\0';
4665 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004666
4667 onError:
4668 PyErr_SetString(PyExc_TypeError,
4669 "%c requires int or char");
4670 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671}
4672
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the cast cannot combine with
   neighbouring operators at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
4682
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683PyObject *PyUnicode_Format(PyObject *format,
4684 PyObject *args)
4685{
4686 Py_UNICODE *fmt, *res;
4687 int fmtcnt, rescnt, reslen, arglen, argidx;
4688 int args_owned = 0;
4689 PyUnicodeObject *result = NULL;
4690 PyObject *dict = NULL;
4691 PyObject *uformat;
4692
4693 if (format == NULL || args == NULL) {
4694 PyErr_BadInternalCall();
4695 return NULL;
4696 }
4697 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004698 if (uformat == NULL)
4699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 fmt = PyUnicode_AS_UNICODE(uformat);
4701 fmtcnt = PyUnicode_GET_SIZE(uformat);
4702
4703 reslen = rescnt = fmtcnt + 100;
4704 result = _PyUnicode_New(reslen);
4705 if (result == NULL)
4706 goto onError;
4707 res = PyUnicode_AS_UNICODE(result);
4708
4709 if (PyTuple_Check(args)) {
4710 arglen = PyTuple_Size(args);
4711 argidx = 0;
4712 }
4713 else {
4714 arglen = -1;
4715 argidx = -2;
4716 }
4717 if (args->ob_type->tp_as_mapping)
4718 dict = args;
4719
4720 while (--fmtcnt >= 0) {
4721 if (*fmt != '%') {
4722 if (--rescnt < 0) {
4723 rescnt = fmtcnt + 100;
4724 reslen += rescnt;
4725 if (_PyUnicode_Resize(result, reslen) < 0)
4726 return NULL;
4727 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4728 --rescnt;
4729 }
4730 *res++ = *fmt++;
4731 }
4732 else {
4733 /* Got a format specifier */
4734 int flags = 0;
4735 int width = -1;
4736 int prec = -1;
4737 int size = 0;
4738 Py_UNICODE c = '\0';
4739 Py_UNICODE fill;
4740 PyObject *v = NULL;
4741 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004742 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_UNICODE sign;
4744 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004745 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
4747 fmt++;
4748 if (*fmt == '(') {
4749 Py_UNICODE *keystart;
4750 int keylen;
4751 PyObject *key;
4752 int pcount = 1;
4753
4754 if (dict == NULL) {
4755 PyErr_SetString(PyExc_TypeError,
4756 "format requires a mapping");
4757 goto onError;
4758 }
4759 ++fmt;
4760 --fmtcnt;
4761 keystart = fmt;
4762 /* Skip over balanced parentheses */
4763 while (pcount > 0 && --fmtcnt >= 0) {
4764 if (*fmt == ')')
4765 --pcount;
4766 else if (*fmt == '(')
4767 ++pcount;
4768 fmt++;
4769 }
4770 keylen = fmt - keystart - 1;
4771 if (fmtcnt < 0 || pcount > 0) {
4772 PyErr_SetString(PyExc_ValueError,
4773 "incomplete format key");
4774 goto onError;
4775 }
Fred Drakee4315f52000-05-09 19:53:39 +00004776 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 then looked up since Python uses strings to hold
4778 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004779 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 key = PyUnicode_EncodeUTF8(keystart,
4781 keylen,
4782 NULL);
4783 if (key == NULL)
4784 goto onError;
4785 if (args_owned) {
4786 Py_DECREF(args);
4787 args_owned = 0;
4788 }
4789 args = PyObject_GetItem(dict, key);
4790 Py_DECREF(key);
4791 if (args == NULL) {
4792 goto onError;
4793 }
4794 args_owned = 1;
4795 arglen = -1;
4796 argidx = -2;
4797 }
4798 while (--fmtcnt >= 0) {
4799 switch (c = *fmt++) {
4800 case '-': flags |= F_LJUST; continue;
4801 case '+': flags |= F_SIGN; continue;
4802 case ' ': flags |= F_BLANK; continue;
4803 case '#': flags |= F_ALT; continue;
4804 case '0': flags |= F_ZERO; continue;
4805 }
4806 break;
4807 }
4808 if (c == '*') {
4809 v = getnextarg(args, arglen, &argidx);
4810 if (v == NULL)
4811 goto onError;
4812 if (!PyInt_Check(v)) {
4813 PyErr_SetString(PyExc_TypeError,
4814 "* wants int");
4815 goto onError;
4816 }
4817 width = PyInt_AsLong(v);
4818 if (width < 0) {
4819 flags |= F_LJUST;
4820 width = -width;
4821 }
4822 if (--fmtcnt >= 0)
4823 c = *fmt++;
4824 }
4825 else if (c >= '0' && c <= '9') {
4826 width = c - '0';
4827 while (--fmtcnt >= 0) {
4828 c = *fmt++;
4829 if (c < '0' || c > '9')
4830 break;
4831 if ((width*10) / 10 != width) {
4832 PyErr_SetString(PyExc_ValueError,
4833 "width too big");
4834 goto onError;
4835 }
4836 width = width*10 + (c - '0');
4837 }
4838 }
4839 if (c == '.') {
4840 prec = 0;
4841 if (--fmtcnt >= 0)
4842 c = *fmt++;
4843 if (c == '*') {
4844 v = getnextarg(args, arglen, &argidx);
4845 if (v == NULL)
4846 goto onError;
4847 if (!PyInt_Check(v)) {
4848 PyErr_SetString(PyExc_TypeError,
4849 "* wants int");
4850 goto onError;
4851 }
4852 prec = PyInt_AsLong(v);
4853 if (prec < 0)
4854 prec = 0;
4855 if (--fmtcnt >= 0)
4856 c = *fmt++;
4857 }
4858 else if (c >= '0' && c <= '9') {
4859 prec = c - '0';
4860 while (--fmtcnt >= 0) {
4861 c = Py_CHARMASK(*fmt++);
4862 if (c < '0' || c > '9')
4863 break;
4864 if ((prec*10) / 10 != prec) {
4865 PyErr_SetString(PyExc_ValueError,
4866 "prec too big");
4867 goto onError;
4868 }
4869 prec = prec*10 + (c - '0');
4870 }
4871 }
4872 } /* prec */
4873 if (fmtcnt >= 0) {
4874 if (c == 'h' || c == 'l' || c == 'L') {
4875 size = c;
4876 if (--fmtcnt >= 0)
4877 c = *fmt++;
4878 }
4879 }
4880 if (fmtcnt < 0) {
4881 PyErr_SetString(PyExc_ValueError,
4882 "incomplete format");
4883 goto onError;
4884 }
4885 if (c != '%') {
4886 v = getnextarg(args, arglen, &argidx);
4887 if (v == NULL)
4888 goto onError;
4889 }
4890 sign = 0;
4891 fill = ' ';
4892 switch (c) {
4893
4894 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004895 pbuf = formatbuf;
4896 /* presume that buffer length is at least 1 */
4897 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 len = 1;
4899 break;
4900
4901 case 's':
4902 case 'r':
4903 if (PyUnicode_Check(v) && c == 's') {
4904 temp = v;
4905 Py_INCREF(temp);
4906 }
4907 else {
4908 PyObject *unicode;
4909 if (c == 's')
4910 temp = PyObject_Str(v);
4911 else
4912 temp = PyObject_Repr(v);
4913 if (temp == NULL)
4914 goto onError;
4915 if (!PyString_Check(temp)) {
4916 /* XXX Note: this should never happen, since
4917 PyObject_Repr() and PyObject_Str() assure
4918 this */
4919 Py_DECREF(temp);
4920 PyErr_SetString(PyExc_TypeError,
4921 "%s argument has non-string str()");
4922 goto onError;
4923 }
Fred Drakee4315f52000-05-09 19:53:39 +00004924 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004926 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 "strict");
4928 Py_DECREF(temp);
4929 temp = unicode;
4930 if (temp == NULL)
4931 goto onError;
4932 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004933 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 len = PyUnicode_GET_SIZE(temp);
4935 if (prec >= 0 && len > prec)
4936 len = prec;
4937 break;
4938
4939 case 'i':
4940 case 'd':
4941 case 'u':
4942 case 'o':
4943 case 'x':
4944 case 'X':
4945 if (c == 'i')
4946 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004947 pbuf = formatbuf;
4948 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4949 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 if (len < 0)
4951 goto onError;
4952 sign = (c == 'd');
4953 if (flags & F_ZERO) {
4954 fill = '0';
4955 if ((flags&F_ALT) &&
4956 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004957 pbuf[0] == '0' && pbuf[1] == c) {
4958 *res++ = *pbuf++;
4959 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 rescnt -= 2;
4961 len -= 2;
4962 width -= 2;
4963 if (width < 0)
4964 width = 0;
4965 }
4966 }
4967 break;
4968
4969 case 'e':
4970 case 'E':
4971 case 'f':
4972 case 'g':
4973 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004974 pbuf = formatbuf;
4975 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4976 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 if (len < 0)
4978 goto onError;
4979 sign = 1;
4980 if (flags&F_ZERO)
4981 fill = '0';
4982 break;
4983
4984 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004985 pbuf = formatbuf;
4986 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 if (len < 0)
4988 goto onError;
4989 break;
4990
4991 default:
4992 PyErr_Format(PyExc_ValueError,
4993 "unsupported format character '%c' (0x%x)",
4994 c, c);
4995 goto onError;
4996 }
4997 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004998 if (*pbuf == '-' || *pbuf == '+') {
4999 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 len--;
5001 }
5002 else if (flags & F_SIGN)
5003 sign = '+';
5004 else if (flags & F_BLANK)
5005 sign = ' ';
5006 else
5007 sign = 0;
5008 }
5009 if (width < len)
5010 width = len;
5011 if (rescnt < width + (sign != 0)) {
5012 reslen -= rescnt;
5013 rescnt = width + fmtcnt + 100;
5014 reslen += rescnt;
5015 if (_PyUnicode_Resize(result, reslen) < 0)
5016 return NULL;
5017 res = PyUnicode_AS_UNICODE(result)
5018 + reslen - rescnt;
5019 }
5020 if (sign) {
5021 if (fill != ' ')
5022 *res++ = sign;
5023 rescnt--;
5024 if (width > len)
5025 width--;
5026 }
5027 if (width > len && !(flags & F_LJUST)) {
5028 do {
5029 --rescnt;
5030 *res++ = fill;
5031 } while (--width > len);
5032 }
5033 if (sign && fill == ' ')
5034 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005035 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 res += len;
5037 rescnt -= len;
5038 while (--width >= len) {
5039 --rescnt;
5040 *res++ = ' ';
5041 }
5042 if (dict && (argidx < arglen) && c != '%') {
5043 PyErr_SetString(PyExc_TypeError,
5044 "not all arguments converted");
5045 goto onError;
5046 }
5047 Py_XDECREF(temp);
5048 } /* '%' */
5049 } /* until end */
5050 if (argidx < arglen && !dict) {
5051 PyErr_SetString(PyExc_TypeError,
5052 "not all arguments converted");
5053 goto onError;
5054 }
5055
5056 if (args_owned) {
5057 Py_DECREF(args);
5058 }
5059 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005060 if (_PyUnicode_Resize(result, reslen - rescnt))
5061 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 return (PyObject *)result;
5063
5064 onError:
5065 Py_XDECREF(result);
5066 Py_DECREF(uformat);
5067 if (args_owned) {
5068 Py_DECREF(args);
5069 }
5070 return NULL;
5071}
5072
5073static PyBufferProcs unicode_as_buffer = {
5074 (getreadbufferproc) unicode_buffer_getreadbuf,
5075 (getwritebufferproc) unicode_buffer_getwritebuf,
5076 (getsegcountproc) unicode_buffer_getsegcount,
5077 (getcharbufferproc) unicode_buffer_getcharbuf,
5078};
5079
5080PyTypeObject PyUnicode_Type = {
5081 PyObject_HEAD_INIT(&PyType_Type)
5082 0, /* ob_size */
5083 "unicode", /* tp_name */
5084 sizeof(PyUnicodeObject), /* tp_size */
5085 0, /* tp_itemsize */
5086 /* Slots */
5087 (destructor)_PyUnicode_Free, /* tp_dealloc */
5088 0, /* tp_print */
5089 (getattrfunc)unicode_getattr, /* tp_getattr */
5090 0, /* tp_setattr */
5091 (cmpfunc) unicode_compare, /* tp_compare */
5092 (reprfunc) unicode_repr, /* tp_repr */
5093 0, /* tp_as_number */
5094 &unicode_as_sequence, /* tp_as_sequence */
5095 0, /* tp_as_mapping */
5096 (hashfunc) unicode_hash, /* tp_hash*/
5097 0, /* tp_call*/
5098 (reprfunc) unicode_str, /* tp_str */
5099 (getattrofunc) NULL, /* tp_getattro */
5100 (setattrofunc) NULL, /* tp_setattro */
5101 &unicode_as_buffer, /* tp_as_buffer */
5102 Py_TPFLAGS_DEFAULT, /* tp_flags */
5103};
5104
5105/* Initialize the Unicode implementation */
5106
Thomas Wouters78890102000-07-22 19:25:51 +00005107void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108{
5109 /* Doublecheck the configuration... */
5110 if (sizeof(Py_UNICODE) != 2)
5111 Py_FatalError("Unicode configuration error: "
5112 "sizeof(Py_UNICODE) != 2 bytes");
5113
Fred Drakee4315f52000-05-09 19:53:39 +00005114 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005115 unicode_freelist = NULL;
5116 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005118 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119}
5120
5121/* Finalize the Unicode implementation */
5122
5123void
Thomas Wouters78890102000-07-22 19:25:51 +00005124_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125{
5126 PyUnicodeObject *u = unicode_freelist;
5127
5128 while (u != NULL) {
5129 PyUnicodeObject *v = u;
5130 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005131 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005132 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005133 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005134 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005136 unicode_freelist = NULL;
5137 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005139 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140}