blob: 359a9872ff52fa7585c20be4948b2549ec6e5cac [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000069#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71#if defined(HAVE_LIMITS_H)
72#include <limits.h>
73#else
74#define INT_MAX 2147483647
75#endif
76
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000077#ifdef MS_WIN32
78#include <windows.h>
79#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080
/* Upper bound on the number of released Unicode objects that are
   parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list; longer buffers are released
   immediately.  This saves malloc() overhead for small Unicode
   objects.

   Worst case, MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory stay allocated.

   A limit of 0 effectively disables the optimization.

   Note: This is an experimental feature !  If Unicode objects cause
   core dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103
/* Byte order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
111
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000112/* --- Globals ------------------------------------------------------------
113
114 The globals are initialized by the _PyUnicode_Init() API and should
115 not be used before calling that API.
116
117*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
119/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000120static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000121
122/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000123static PyUnicodeObject *unicode_freelist;
124static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125
Fred Drakee4315f52000-05-09 19:53:39 +0000126/* Default encoding to use and assume when NULL is passed as encoding
127 parameter; it is initialized by _PyUnicode_Init().
128
129 Always use the PyUnicode_SetDefaultEncoding() and
130 PyUnicode_GetDefaultEncoding() APIs to access this global.
131
132*/
133
134static char unicode_default_encoding[100];
135
Guido van Rossumd57fd912000-03-10 22:53:23 +0000136/* --- Unicode Object ----------------------------------------------------- */
137
138static
139int _PyUnicode_Resize(register PyUnicodeObject *unicode,
140 int length)
141{
142 void *oldstr;
143
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000144 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000146 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147
148 /* Resizing unicode_empty is not allowed. */
149 if (unicode == unicode_empty) {
150 PyErr_SetString(PyExc_SystemError,
151 "can't resize empty unicode object");
152 return -1;
153 }
154
155 /* We allocate one more byte to make sure the string is
156 Ux0000 terminated -- XXX is this needed ? */
157 oldstr = unicode->str;
158 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
159 if (!unicode->str) {
160 unicode->str = oldstr;
161 PyErr_NoMemory();
162 return -1;
163 }
164 unicode->str[length] = 0;
165 unicode->length = length;
166
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000167 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 /* Reset the object caches */
169 if (unicode->utf8str) {
170 Py_DECREF(unicode->utf8str);
171 unicode->utf8str = NULL;
172 }
173 unicode->hash = -1;
174
175 return 0;
176}
177
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178int PyUnicode_Resize(PyObject **unicode,
179 int length)
180{
181 PyUnicodeObject *v;
182
183 if (unicode == NULL) {
184 PyErr_BadInternalCall();
185 return -1;
186 }
187 v = (PyUnicodeObject *)*unicode;
188 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
189 PyErr_BadInternalCall();
190 return -1;
191 }
192 return _PyUnicode_Resize(v, length);
193}
194
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195/* We allocate one more byte to make sure the string is
196 Ux0000 terminated -- XXX is this needed ?
197
198 XXX This allocator could further be enhanced by assuring that the
199 free list never reduces its size below 1.
200
201*/
202
203static
204PyUnicodeObject *_PyUnicode_New(int length)
205{
206 register PyUnicodeObject *unicode;
207
208 /* Optimization for empty strings */
209 if (length == 0 && unicode_empty != NULL) {
210 Py_INCREF(unicode_empty);
211 return unicode_empty;
212 }
213
214 /* Unicode freelist & memory allocation */
215 if (unicode_freelist) {
216 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000217 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000228 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000230 }
231 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
234 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
235 if (unicode == NULL)
236 return NULL;
237 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
238 }
239
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000240 if (!unicode->str) {
241 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 unicode->str[length] = 0;
245 unicode->length = length;
246 unicode->hash = -1;
247 unicode->utf8str = NULL;
248 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000249
250 onError:
251 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254}
255
256static
257void _PyUnicode_Free(register PyUnicodeObject *unicode)
258{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization */
261 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = NULL;
264 unicode->length = 0;
265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 if (unicode->utf8str) {
267 Py_DECREF(unicode->utf8str);
268 unicode->utf8str = NULL;
269 }
270 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 *(PyUnicodeObject **)unicode = unicode_freelist;
272 unicode_freelist = unicode;
273 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000277 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000278 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280}
281
282PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
283 int size)
284{
285 PyUnicodeObject *unicode;
286
287 unicode = _PyUnicode_New(size);
288 if (!unicode)
289 return NULL;
290
291 /* Copy the Unicode data into the new object */
292 if (u != NULL)
293 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294
295 return (PyObject *)unicode;
296}
297
298#ifdef HAVE_WCHAR_H
299
300PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
301 int size)
302{
303 PyUnicodeObject *unicode;
304
305 if (w == NULL) {
306 PyErr_BadInternalCall();
307 return NULL;
308 }
309
310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the wchar_t data into the new object */
315#ifdef HAVE_USABLE_WCHAR_T
316 memcpy(unicode->str, w, size * sizeof(wchar_t));
317#else
318 {
319 register Py_UNICODE *u;
320 register int i;
321 u = PyUnicode_AS_UNICODE(unicode);
322 for (i = size; i >= 0; i--)
323 *u++ = *w++;
324 }
325#endif
326
327 return (PyObject *)unicode;
328}
329
330int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
331 register wchar_t *w,
332 int size)
333{
334 if (unicode == NULL) {
335 PyErr_BadInternalCall();
336 return -1;
337 }
338 if (size > PyUnicode_GET_SIZE(unicode))
339 size = PyUnicode_GET_SIZE(unicode);
340#ifdef HAVE_USABLE_WCHAR_T
341 memcpy(w, unicode->str, size * sizeof(wchar_t));
342#else
343 {
344 register Py_UNICODE *u;
345 register int i;
346 u = PyUnicode_AS_UNICODE(unicode);
347 for (i = size; i >= 0; i--)
348 *w++ = *u++;
349 }
350#endif
351
352 return size;
353}
354
355#endif
356
357PyObject *PyUnicode_FromObject(register PyObject *obj)
358{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000359 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
360}
361
362PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
363 const char *encoding,
364 const char *errors)
365{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 const char *s;
367 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368 int owned = 0;
369 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370
371 if (obj == NULL) {
372 PyErr_BadInternalCall();
373 return NULL;
374 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000375
376 /* Coerce object */
377 if (PyInstance_Check(obj)) {
378 PyObject *func;
379 func = PyObject_GetAttrString(obj, "__str__");
380 if (func == NULL) {
381 PyErr_SetString(PyExc_TypeError,
382 "coercing to Unicode: instance doesn't define __str__");
383 return NULL;
384 }
385 obj = PyEval_CallObject(func, NULL);
386 Py_DECREF(func);
387 if (obj == NULL)
388 return NULL;
389 owned = 1;
390 }
391 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 v = obj;
394 if (encoding) {
395 PyErr_SetString(PyExc_TypeError,
396 "decoding Unicode is not supported");
397 return NULL;
398 }
399 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 }
401 else if (PyString_Check(obj)) {
402 s = PyString_AS_STRING(obj);
403 len = PyString_GET_SIZE(obj);
404 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000405 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
406 /* Overwrite the error message with something more useful in
407 case of a TypeError. */
408 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000409 PyErr_Format(PyExc_TypeError,
410 "coercing to Unicode: need string or buffer, "
411 "%.80s found",
412 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000413 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000414 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000415
416 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000417 if (len == 0) {
418 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000421 else
422 v = PyUnicode_Decode(s, len, encoding, errors);
423 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000424 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000425 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000426 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000427 return v;
428
429 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000430 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000431 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000432 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434}
435
436PyObject *PyUnicode_Decode(const char *s,
437 int size,
438 const char *encoding,
439 const char *errors)
440{
441 PyObject *buffer = NULL, *unicode;
442
Fred Drakee4315f52000-05-09 19:53:39 +0000443 if (encoding == NULL)
444 encoding = PyUnicode_GetDefaultEncoding();
445
446 /* Shortcuts for common default encodings */
447 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000449 else if (strcmp(encoding, "latin-1") == 0)
450 return PyUnicode_DecodeLatin1(s, size, errors);
451 else if (strcmp(encoding, "ascii") == 0)
452 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453
454 /* Decode via the codec registry */
455 buffer = PyBuffer_FromMemory((void *)s, size);
456 if (buffer == NULL)
457 goto onError;
458 unicode = PyCodec_Decode(buffer, encoding, errors);
459 if (unicode == NULL)
460 goto onError;
461 if (!PyUnicode_Check(unicode)) {
462 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000463 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 unicode->ob_type->tp_name);
465 Py_DECREF(unicode);
466 goto onError;
467 }
468 Py_DECREF(buffer);
469 return unicode;
470
471 onError:
472 Py_XDECREF(buffer);
473 return NULL;
474}
475
476PyObject *PyUnicode_Encode(const Py_UNICODE *s,
477 int size,
478 const char *encoding,
479 const char *errors)
480{
481 PyObject *v, *unicode;
482
483 unicode = PyUnicode_FromUnicode(s, size);
484 if (unicode == NULL)
485 return NULL;
486 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
487 Py_DECREF(unicode);
488 return v;
489}
490
491PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
492 const char *encoding,
493 const char *errors)
494{
495 PyObject *v;
496
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_BadArgument();
499 goto onError;
500 }
Fred Drakee4315f52000-05-09 19:53:39 +0000501
502 if (encoding == NULL)
503 encoding = PyUnicode_GetDefaultEncoding();
504
505 /* Shortcuts for common default encodings */
506 if (errors == NULL) {
507 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000509 else if (strcmp(encoding, "latin-1") == 0)
510 return PyUnicode_AsLatin1String(unicode);
511 else if (strcmp(encoding, "ascii") == 0)
512 return PyUnicode_AsASCIIString(unicode);
513 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514
515 /* Encode via the codec registry */
516 v = PyCodec_Encode(unicode, encoding, errors);
517 if (v == NULL)
518 goto onError;
519 /* XXX Should we really enforce this ? */
520 if (!PyString_Check(v)) {
521 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000522 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523 v->ob_type->tp_name);
524 Py_DECREF(v);
525 goto onError;
526 }
527 return v;
528
529 onError:
530 return NULL;
531}
532
533Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
534{
535 if (!PyUnicode_Check(unicode)) {
536 PyErr_BadArgument();
537 goto onError;
538 }
539 return PyUnicode_AS_UNICODE(unicode);
540
541 onError:
542 return NULL;
543}
544
545int PyUnicode_GetSize(PyObject *unicode)
546{
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
551 return PyUnicode_GET_SIZE(unicode);
552
553 onError:
554 return -1;
555}
556
Thomas Wouters78890102000-07-22 19:25:51 +0000557const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000558{
559 return unicode_default_encoding;
560}
561
562int PyUnicode_SetDefaultEncoding(const char *encoding)
563{
564 PyObject *v;
565
566 /* Make sure the encoding is valid. As side effect, this also
567 loads the encoding into the codec registry cache. */
568 v = _PyCodec_Lookup(encoding);
569 if (v == NULL)
570 goto onError;
571 Py_DECREF(v);
572 strncpy(unicode_default_encoding,
573 encoding,
574 sizeof(unicode_default_encoding));
575 return 0;
576
577 onError:
578 return -1;
579}
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581/* --- UTF-8 Codec -------------------------------------------------------- */
582
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
604
605static
606int utf8_decoding_error(const char **source,
607 Py_UNICODE **dest,
608 const char *errors,
609 const char *details)
610{
611 if ((errors == NULL) ||
612 (strcmp(errors,"strict") == 0)) {
613 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000614 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 details);
616 return -1;
617 }
618 else if (strcmp(errors,"ignore") == 0) {
619 (*source)++;
620 return 0;
621 }
622 else if (strcmp(errors,"replace") == 0) {
623 (*source)++;
624 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
625 (*dest)++;
626 return 0;
627 }
628 else {
629 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000630 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 errors);
632 return -1;
633 }
634}
635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636PyObject *PyUnicode_DecodeUTF8(const char *s,
637 int size,
638 const char *errors)
639{
640 int n;
641 const char *e;
642 PyUnicodeObject *unicode;
643 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000644 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645
646 /* Note: size will always be longer than the resulting Unicode
647 character count */
648 unicode = _PyUnicode_New(size);
649 if (!unicode)
650 return NULL;
651 if (size == 0)
652 return (PyObject *)unicode;
653
654 /* Unpack UTF-8 encoded data */
655 p = unicode->str;
656 e = s + size;
657
658 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000659 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660
661 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000662 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 s++;
664 continue;
665 }
666
667 n = utf8_code_length[ch];
668
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000669 if (s + n > e) {
670 errmsg = "unexpected end of data";
671 goto utf8Error;
672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000673
674 switch (n) {
675
676 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "unexpected code byte";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 errmsg = "internal error";
683 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 break;
685
686 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if ((s[1] & 0xc0) != 0x80) {
688 errmsg = "invalid data";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 if (ch < 0x80) {
693 errmsg = "illegal encoding";
694 goto utf8Error;
695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698 break;
699
700 case 3:
701 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 (s[2] & 0xc0) != 0x80) {
703 errmsg = "invalid data";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
708 errmsg = "illegal encoding";
709 goto utf8Error;
710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000712 *p++ = (Py_UNICODE)ch;
713 break;
714
715 case 4:
716 if ((s[1] & 0xc0) != 0x80 ||
717 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 (s[3] & 0xc0) != 0x80) {
719 errmsg = "invalid data";
720 goto utf8Error;
721 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000722 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
723 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
724 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000725 if ((ch < 0x10000) || /* minimum value allowed for 4
726 byte encoding */
727 (ch > 0x10ffff)) { /* maximum value allowed for
728 UTF-16 */
729 errmsg = "illegal encoding";
730 goto utf8Error;
731 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000732 /* compute and append the two surrogates: */
733
734 /* translate from 10000..10FFFF to 0..FFFF */
735 ch -= 0x10000;
736
737 /* high surrogate = top 10 bits added to D800 */
738 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
739
740 /* low surrogate = bottom 10 bits added to DC00 */
741 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742 break;
743
744 default:
745 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 errmsg = "unsupported Unicode code range";
747 goto utf8Error;
748 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749 }
750 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 continue;
752
753 utf8Error:
754 if (utf8_decoding_error(&s, &p, errors, errmsg))
755 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 }
757
758 /* Adjust length */
759 if (_PyUnicode_Resize(unicode, p - unicode->str))
760 goto onError;
761
762 return (PyObject *)unicode;
763
764onError:
765 Py_DECREF(unicode);
766 return NULL;
767}
768
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000769/* Not used anymore, now that the encoder supports UTF-16
770 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000771#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772static
773int utf8_encoding_error(const Py_UNICODE **source,
774 char **dest,
775 const char *errors,
776 const char *details)
777{
778 if ((errors == NULL) ||
779 (strcmp(errors,"strict") == 0)) {
780 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000781 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 details);
783 return -1;
784 }
785 else if (strcmp(errors,"ignore") == 0) {
786 return 0;
787 }
788 else if (strcmp(errors,"replace") == 0) {
789 **dest = '?';
790 (*dest)++;
791 return 0;
792 }
793 else {
794 PyErr_Format(PyExc_ValueError,
795 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000796 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797 errors);
798 return -1;
799 }
800}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000801#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
803PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
804 int size,
805 const char *errors)
806{
807 PyObject *v;
808 char *p;
809 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 Py_UCS4 ch2;
811 unsigned int cbAllocated = 3 * size;
812 unsigned int cbWritten = 0;
813 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000815 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 if (v == NULL)
817 return NULL;
818 if (size == 0)
819 goto done;
820
821 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000822 while (i < size) {
823 Py_UCS4 ch = s[i++];
824 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten++;
827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828 else if (ch < 0x0800) {
829 *p++ = 0xc0 | (ch >> 6);
830 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000831 cbWritten += 2;
832 }
833 else {
834 /* Check for high surrogate */
835 if (0xD800 <= ch && ch <= 0xDBFF) {
836 if (i != size) {
837 ch2 = s[i];
838 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
839
840 if (cbWritten >= (cbAllocated - 4)) {
841 /* Provide enough room for some more
842 surrogates */
843 cbAllocated += 4*10;
844 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000845 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000846 }
847
848 /* combine the two values */
849 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
850
851 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000852 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000853 i++;
854 cbWritten += 4;
855 }
856 }
857 }
858 else {
859 *p++ = (char)(0xe0 | (ch >> 12));
860 cbWritten += 3;
861 }
862 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
863 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 }
865 }
866 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000867 if (_PyString_Resize(&v, p - q))
868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869
870 done:
871 return v;
872
873 onError:
874 Py_DECREF(v);
875 return NULL;
876}
877
878/* Return a Python string holding the UTF-8 encoded value of the
879 Unicode object.
880
881 The resulting string is cached in the Unicode object for subsequent
882 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000883 the character buffer interface and will live (at least) as long as
884 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
886 The refcount of the string is *not* incremented.
887
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000888 *** Exported for internal use by the interpreter only !!! ***
889
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890*/
891
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000892PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +0000893 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000895 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896
897 if (v)
898 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000899 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
900 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 errors);
902 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000903 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 return v;
905}
906
907PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
908{
909 PyObject *str;
910
911 if (!PyUnicode_Check(unicode)) {
912 PyErr_BadArgument();
913 return NULL;
914 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000915 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916 if (str == NULL)
917 return NULL;
918 Py_INCREF(str);
919 return str;
920}
921
922/* --- UTF-16 Codec ------------------------------------------------------- */
923
924static
925int utf16_decoding_error(const Py_UNICODE **source,
926 Py_UNICODE **dest,
927 const char *errors,
928 const char *details)
929{
930 if ((errors == NULL) ||
931 (strcmp(errors,"strict") == 0)) {
932 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000933 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 details);
935 return -1;
936 }
937 else if (strcmp(errors,"ignore") == 0) {
938 return 0;
939 }
940 else if (strcmp(errors,"replace") == 0) {
941 if (dest) {
942 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
943 (*dest)++;
944 }
945 return 0;
946 }
947 else {
948 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000949 "UTF-16 decoding error; "
950 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951 errors);
952 return -1;
953 }
954}
955
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956PyObject *PyUnicode_DecodeUTF16(const char *s,
957 int size,
958 const char *errors,
959 int *byteorder)
960{
961 PyUnicodeObject *unicode;
962 Py_UNICODE *p;
963 const Py_UNICODE *q, *e;
964 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000965 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000966
967 /* size should be an even number */
968 if (size % sizeof(Py_UNICODE) != 0) {
969 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
970 return NULL;
971 /* The remaining input chars are ignored if we fall through
972 here... */
973 }
974
975 /* Note: size will always be longer than the resulting Unicode
976 character count */
977 unicode = _PyUnicode_New(size);
978 if (!unicode)
979 return NULL;
980 if (size == 0)
981 return (PyObject *)unicode;
982
983 /* Unpack UTF-16 encoded data */
984 p = unicode->str;
985 q = (Py_UNICODE *)s;
986 e = q + (size / sizeof(Py_UNICODE));
987
988 if (byteorder)
989 bo = *byteorder;
990
991 while (q < e) {
992 register Py_UNICODE ch = *q++;
993
994 /* Check for BOM marks (U+FEFF) in the input and adjust
995 current byte order setting accordingly. Swap input
996 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
997 !) */
998#ifdef BYTEORDER_IS_LITTLE_ENDIAN
999 if (ch == 0xFEFF) {
1000 bo = -1;
1001 continue;
1002 } else if (ch == 0xFFFE) {
1003 bo = 1;
1004 continue;
1005 }
1006 if (bo == 1)
1007 ch = (ch >> 8) | (ch << 8);
1008#else
1009 if (ch == 0xFEFF) {
1010 bo = 1;
1011 continue;
1012 } else if (ch == 0xFFFE) {
1013 bo = -1;
1014 continue;
1015 }
1016 if (bo == -1)
1017 ch = (ch >> 8) | (ch << 8);
1018#endif
1019 if (ch < 0xD800 || ch > 0xDFFF) {
1020 *p++ = ch;
1021 continue;
1022 }
1023
1024 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001025 if (q >= e) {
1026 errmsg = "unexpected end of data";
1027 goto utf16Error;
1028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 if (0xDC00 <= *q && *q <= 0xDFFF) {
1030 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001031 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032 /* This is valid data (a UTF-16 surrogate pair), but
1033 we are not able to store this information since our
1034 Py_UNICODE type only has 16 bits... this might
1035 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001036 errmsg = "code pairs are not supported";
1037 goto utf16Error;
1038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 else
1040 continue;
1041 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001042 errmsg = "illegal encoding";
1043 /* Fall through to report the error */
1044
1045 utf16Error:
1046 if (utf16_decoding_error(&q, &p, errors, errmsg))
1047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048 }
1049
1050 if (byteorder)
1051 *byteorder = bo;
1052
1053 /* Adjust length */
1054 if (_PyUnicode_Resize(unicode, p - unicode->str))
1055 goto onError;
1056
1057 return (PyObject *)unicode;
1058
1059onError:
1060 Py_DECREF(unicode);
1061 return NULL;
1062}
1063
1064#undef UTF16_ERROR
1065
1066PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1067 int size,
1068 const char *errors,
1069 int byteorder)
1070{
1071 PyObject *v;
1072 Py_UNICODE *p;
1073 char *q;
1074
1075 /* We don't create UTF-16 pairs... */
1076 v = PyString_FromStringAndSize(NULL,
1077 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1078 if (v == NULL)
1079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080
1081 q = PyString_AS_STRING(v);
1082 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083 if (byteorder == 0)
1084 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001085 if (size == 0)
1086 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 if (byteorder == 0 ||
1088#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1089 byteorder == -1
1090#else
1091 byteorder == 1
1092#endif
1093 )
1094 memcpy(p, s, size * sizeof(Py_UNICODE));
1095 else
1096 while (size-- > 0) {
1097 Py_UNICODE ch = *s++;
1098 *p++ = (ch >> 8) | (ch << 8);
1099 }
1100 done:
1101 return v;
1102}
1103
1104PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1105{
1106 if (!PyUnicode_Check(unicode)) {
1107 PyErr_BadArgument();
1108 return NULL;
1109 }
1110 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1111 PyUnicode_GET_SIZE(unicode),
1112 NULL,
1113 0);
1114}
1115
1116/* --- Unicode Escape Codec ----------------------------------------------- */
1117
1118static
1119int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001120 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 const char *errors,
1122 const char *details)
1123{
1124 if ((errors == NULL) ||
1125 (strcmp(errors,"strict") == 0)) {
1126 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001127 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 details);
1129 return -1;
1130 }
1131 else if (strcmp(errors,"ignore") == 0) {
1132 return 0;
1133 }
1134 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001135 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 return 0;
1137 }
1138 else {
1139 PyErr_Format(PyExc_ValueError,
1140 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001141 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 errors);
1143 return -1;
1144 }
1145}
1146
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001147static _Py_UCNHashAPI *pucnHash = NULL;
1148
/* Case-insensitive comparison of at most count characters.

   s1, s2 -- strings to compare
   count  -- maximum number of characters to look at

   Returns 0 if the strings are equal (ignoring case) over the first
   count characters or up to a common terminating NUL, otherwise the
   difference of the first pair of lower-cased characters that differ.

   Characters are converted to unsigned char before being handed to
   tolower(), since passing a negative char value is undefined.  The
   comparison stops at a NUL terminator so that it never reads beyond
   the end of two equal strings that are shorter than count. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }

    return c1 - c2;
}
1168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1170 int size,
1171 const char *errors)
1172{
1173 PyUnicodeObject *v;
1174 Py_UNICODE *p = NULL, *buf = NULL;
1175 const char *end;
1176
1177 /* Escaped strings will always be longer than the resulting
1178 Unicode string, so we start with size here and then reduce the
1179 length after conversion to the true value. */
1180 v = _PyUnicode_New(size);
1181 if (v == NULL)
1182 goto onError;
1183 if (size == 0)
1184 return (PyObject *)v;
1185 p = buf = PyUnicode_AS_UNICODE(v);
1186 end = s + size;
1187 while (s < end) {
1188 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001189 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 int i;
1191
1192 /* Non-escape characters are interpreted as Unicode ordinals */
1193 if (*s != '\\') {
1194 *p++ = (unsigned char)*s++;
1195 continue;
1196 }
1197
1198 /* \ - Escapes */
1199 s++;
1200 switch (*s++) {
1201
1202 /* \x escapes */
1203 case '\n': break;
1204 case '\\': *p++ = '\\'; break;
1205 case '\'': *p++ = '\''; break;
1206 case '\"': *p++ = '\"'; break;
1207 case 'b': *p++ = '\b'; break;
1208 case 'f': *p++ = '\014'; break; /* FF */
1209 case 't': *p++ = '\t'; break;
1210 case 'n': *p++ = '\n'; break;
1211 case 'r': *p++ = '\r'; break;
1212 case 'v': *p++ = '\013'; break; /* VT */
1213 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1214
1215 /* \OOO (octal) escapes */
1216 case '0': case '1': case '2': case '3':
1217 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001218 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001220 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001222 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001224 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225 break;
1226
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001227 /* \xXXXX escape with 1-n hex digits. for compatibility
1228 with 8-bit strings, this code ignores all but the last
1229 two digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 case 'x':
1231 x = 0;
1232 c = (unsigned char)*s;
1233 if (isxdigit(c)) {
1234 do {
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001235 x = (x<<4) & 0xF0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 if ('0' <= c && c <= '9')
1237 x += c - '0';
1238 else if ('a' <= c && c <= 'f')
1239 x += 10 + c - 'a';
1240 else
1241 x += 10 + c - 'A';
1242 c = (unsigned char)*++s;
1243 } while (isxdigit(c));
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001244 *p++ = (unsigned char) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 } else {
1246 *p++ = '\\';
1247 *p++ = (unsigned char)s[-1];
1248 }
1249 break;
1250
1251 /* \uXXXX with 4 hex digits */
1252 case 'u':
1253 for (x = 0, i = 0; i < 4; i++) {
1254 c = (unsigned char)s[i];
1255 if (!isxdigit(c)) {
1256 if (unicodeescape_decoding_error(&s, &x, errors,
1257 "truncated \\uXXXX"))
1258 goto onError;
1259 i++;
1260 break;
1261 }
1262 x = (x<<4) & ~0xF;
1263 if (c >= '0' && c <= '9')
1264 x += c - '0';
1265 else if (c >= 'a' && c <= 'f')
1266 x += 10 + c - 'a';
1267 else
1268 x += 10 + c - 'A';
1269 }
1270 s += i;
1271 *p++ = x;
1272 break;
1273
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001274 case 'N':
1275 /* Ok, we need to deal with Unicode Character Names now,
1276 * make sure we've imported the hash table data...
1277 */
1278 if (pucnHash == NULL)
1279 {
1280 PyObject *mod = 0, *v = 0;
1281
1282 mod = PyImport_ImportModule("ucnhash");
1283 if (mod == NULL)
1284 goto onError;
1285 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1286 Py_DECREF(mod);
1287 if (v == NULL)
1288 {
1289 goto onError;
1290 }
1291 pucnHash = PyCObject_AsVoidPtr(v);
1292 Py_DECREF(v);
1293 if (pucnHash == NULL)
1294 {
1295 goto onError;
1296 }
1297 }
1298
1299 if (*s == '{')
1300 {
1301 const char *start = s + 1;
1302 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001303 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001304 unsigned long j;
1305
1306 /* look for either the closing brace, or we
1307 * exceed the maximum length of the unicode character names
1308 */
1309 while (*endBrace != '}' &&
1310 (unsigned int)(endBrace - start) <=
1311 pucnHash->cchMax &&
1312 endBrace < end)
1313 {
1314 endBrace++;
1315 }
1316 if (endBrace != end && *endBrace == '}')
1317 {
1318 j = pucnHash->hash(start, endBrace - start);
1319 if (j > pucnHash->cKeys ||
1320 mystrnicmp(
1321 start,
1322 ((_Py_UnicodeCharacterName *)
1323 (pucnHash->getValue(j)))->pszUCN,
1324 (int)(endBrace - start)) != 0)
1325 {
1326 if (unicodeescape_decoding_error(
1327 &s, &x, errors,
1328 "Invalid Unicode Character Name"))
1329 {
1330 goto onError;
1331 }
1332 goto ucnFallthrough;
1333 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 value = ((_Py_UnicodeCharacterName *)
1335 (pucnHash->getValue(j)))->value;
1336 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001337 {
1338 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001339 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001340 }
1341 else
1342 {
1343 /* Oops, its in UCS-4 space, */
1344 /* compute and append the two surrogates: */
1345 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001347
1348 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001350
1351 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001352 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001353 }
1354 s = endBrace + 1;
1355 }
1356 else
1357 {
1358 if (unicodeescape_decoding_error(
1359 &s, &x, errors,
1360 "Unicode name missing closing brace"))
1361 goto onError;
1362 goto ucnFallthrough;
1363 }
1364 break;
1365 }
1366 if (unicodeescape_decoding_error(
1367 &s, &x, errors,
1368 "Missing opening brace for Unicode Character Name escape"))
1369 goto onError;
1370ucnFallthrough:
1371 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001372 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373 *p++ = '\\';
1374 *p++ = (unsigned char)s[-1];
1375 break;
1376 }
1377 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001378 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001379 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 return (PyObject *)v;
1381
1382 onError:
1383 Py_XDECREF(v);
1384 return NULL;
1385}
1386
1387/* Return a Unicode-Escape string version of the Unicode object.
1388
1389 If quotes is true, the string is enclosed in u"" or u'' quotes as
1390 appropriate.
1391
1392*/
1393
Barry Warsaw51ac5802000-03-20 16:36:48 +00001394static const Py_UNICODE *findchar(const Py_UNICODE *s,
1395 int size,
1396 Py_UNICODE ch);
1397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398static
1399PyObject *unicodeescape_string(const Py_UNICODE *s,
1400 int size,
1401 int quotes)
1402{
1403 PyObject *repr;
1404 char *p;
1405 char *q;
1406
1407 static const char *hexdigit = "0123456789ABCDEF";
1408
1409 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1410 if (repr == NULL)
1411 return NULL;
1412
1413 p = q = PyString_AS_STRING(repr);
1414
1415 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 *p++ = 'u';
1417 *p++ = (findchar(s, size, '\'') &&
1418 !findchar(s, size, '"')) ? '"' : '\'';
1419 }
1420 while (size-- > 0) {
1421 Py_UNICODE ch = *s++;
1422 /* Escape quotes */
1423 if (quotes && (ch == q[1] || ch == '\\')) {
1424 *p++ = '\\';
1425 *p++ = (char) ch;
1426 }
1427 /* Map 16-bit characters to '\uxxxx' */
1428 else if (ch >= 256) {
1429 *p++ = '\\';
1430 *p++ = 'u';
1431 *p++ = hexdigit[(ch >> 12) & 0xf];
1432 *p++ = hexdigit[(ch >> 8) & 0xf];
1433 *p++ = hexdigit[(ch >> 4) & 0xf];
1434 *p++ = hexdigit[ch & 15];
1435 }
1436 /* Map non-printable US ASCII to '\ooo' */
1437 else if (ch < ' ' || ch >= 128) {
1438 *p++ = '\\';
1439 *p++ = hexdigit[(ch >> 6) & 7];
1440 *p++ = hexdigit[(ch >> 3) & 7];
1441 *p++ = hexdigit[ch & 7];
1442 }
1443 /* Copy everything else as-is */
1444 else
1445 *p++ = (char) ch;
1446 }
1447 if (quotes)
1448 *p++ = q[1];
1449
1450 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001451 if (_PyString_Resize(&repr, p - q))
1452 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453
1454 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001455
1456 onError:
1457 Py_DECREF(repr);
1458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459}
1460
1461PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1462 int size)
1463{
1464 return unicodeescape_string(s, size, 0);
1465}
1466
1467PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1468{
1469 if (!PyUnicode_Check(unicode)) {
1470 PyErr_BadArgument();
1471 return NULL;
1472 }
1473 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1474 PyUnicode_GET_SIZE(unicode));
1475}
1476
1477/* --- Raw Unicode Escape Codec ------------------------------------------- */
1478
1479PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1480 int size,
1481 const char *errors)
1482{
1483 PyUnicodeObject *v;
1484 Py_UNICODE *p, *buf;
1485 const char *end;
1486 const char *bs;
1487
1488 /* Escaped strings will always be longer than the resulting
1489 Unicode string, so we start with size here and then reduce the
1490 length after conversion to the true value. */
1491 v = _PyUnicode_New(size);
1492 if (v == NULL)
1493 goto onError;
1494 if (size == 0)
1495 return (PyObject *)v;
1496 p = buf = PyUnicode_AS_UNICODE(v);
1497 end = s + size;
1498 while (s < end) {
1499 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001500 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 int i;
1502
1503 /* Non-escape characters are interpreted as Unicode ordinals */
1504 if (*s != '\\') {
1505 *p++ = (unsigned char)*s++;
1506 continue;
1507 }
1508
1509 /* \u-escapes are only interpreted iff the number of leading
1510 backslashes if odd */
1511 bs = s;
1512 for (;s < end;) {
1513 if (*s != '\\')
1514 break;
1515 *p++ = (unsigned char)*s++;
1516 }
1517 if (((s - bs) & 1) == 0 ||
1518 s >= end ||
1519 *s != 'u') {
1520 continue;
1521 }
1522 p--;
1523 s++;
1524
1525 /* \uXXXX with 4 hex digits */
1526 for (x = 0, i = 0; i < 4; i++) {
1527 c = (unsigned char)s[i];
1528 if (!isxdigit(c)) {
1529 if (unicodeescape_decoding_error(&s, &x, errors,
1530 "truncated \\uXXXX"))
1531 goto onError;
1532 i++;
1533 break;
1534 }
1535 x = (x<<4) & ~0xF;
1536 if (c >= '0' && c <= '9')
1537 x += c - '0';
1538 else if (c >= 'a' && c <= 'f')
1539 x += 10 + c - 'a';
1540 else
1541 x += 10 + c - 'A';
1542 }
1543 s += i;
1544 *p++ = x;
1545 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001546 if (_PyUnicode_Resize(v, (int)(p - buf)))
1547 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return (PyObject *)v;
1549
1550 onError:
1551 Py_XDECREF(v);
1552 return NULL;
1553}
1554
1555PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1556 int size)
1557{
1558 PyObject *repr;
1559 char *p;
1560 char *q;
1561
1562 static const char *hexdigit = "0123456789ABCDEF";
1563
1564 repr = PyString_FromStringAndSize(NULL, 6 * size);
1565 if (repr == NULL)
1566 return NULL;
1567
1568 p = q = PyString_AS_STRING(repr);
1569 while (size-- > 0) {
1570 Py_UNICODE ch = *s++;
1571 /* Map 16-bit characters to '\uxxxx' */
1572 if (ch >= 256) {
1573 *p++ = '\\';
1574 *p++ = 'u';
1575 *p++ = hexdigit[(ch >> 12) & 0xf];
1576 *p++ = hexdigit[(ch >> 8) & 0xf];
1577 *p++ = hexdigit[(ch >> 4) & 0xf];
1578 *p++ = hexdigit[ch & 15];
1579 }
1580 /* Copy everything else as-is */
1581 else
1582 *p++ = (char) ch;
1583 }
1584 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001585 if (_PyString_Resize(&repr, p - q))
1586 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
1588 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001589
1590 onError:
1591 Py_DECREF(repr);
1592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593}
1594
1595PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1596{
1597 if (!PyUnicode_Check(unicode)) {
1598 PyErr_BadArgument();
1599 return NULL;
1600 }
1601 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1602 PyUnicode_GET_SIZE(unicode));
1603}
1604
1605/* --- Latin-1 Codec ------------------------------------------------------ */
1606
1607PyObject *PyUnicode_DecodeLatin1(const char *s,
1608 int size,
1609 const char *errors)
1610{
1611 PyUnicodeObject *v;
1612 Py_UNICODE *p;
1613
1614 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1615 v = _PyUnicode_New(size);
1616 if (v == NULL)
1617 goto onError;
1618 if (size == 0)
1619 return (PyObject *)v;
1620 p = PyUnicode_AS_UNICODE(v);
1621 while (size-- > 0)
1622 *p++ = (unsigned char)*s++;
1623 return (PyObject *)v;
1624
1625 onError:
1626 Py_XDECREF(v);
1627 return NULL;
1628}
1629
1630static
1631int latin1_encoding_error(const Py_UNICODE **source,
1632 char **dest,
1633 const char *errors,
1634 const char *details)
1635{
1636 if ((errors == NULL) ||
1637 (strcmp(errors,"strict") == 0)) {
1638 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001639 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 details);
1641 return -1;
1642 }
1643 else if (strcmp(errors,"ignore") == 0) {
1644 return 0;
1645 }
1646 else if (strcmp(errors,"replace") == 0) {
1647 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001648 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 return 0;
1650 }
1651 else {
1652 PyErr_Format(PyExc_ValueError,
1653 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001654 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 errors);
1656 return -1;
1657 }
1658}
1659
1660PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1661 int size,
1662 const char *errors)
1663{
1664 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001665 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 repr = PyString_FromStringAndSize(NULL, size);
1667 if (repr == NULL)
1668 return NULL;
1669
1670 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001671 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 while (size-- > 0) {
1673 Py_UNICODE ch = *p++;
1674 if (ch >= 256) {
1675 if (latin1_encoding_error(&p, &s, errors,
1676 "ordinal not in range(256)"))
1677 goto onError;
1678 }
1679 else
1680 *s++ = (char)ch;
1681 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001682 /* Resize if error handling skipped some characters */
1683 if (s - start < PyString_GET_SIZE(repr))
1684 if (_PyString_Resize(&repr, s - start))
1685 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return repr;
1687
1688 onError:
1689 Py_DECREF(repr);
1690 return NULL;
1691}
1692
1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 return NULL;
1698 }
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 NULL);
1702}
1703
1704/* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706static
1707int ascii_decoding_error(const char **source,
1708 Py_UNICODE **dest,
1709 const char *errors,
1710 const char *details)
1711{
1712 if ((errors == NULL) ||
1713 (strcmp(errors,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001715 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 details);
1717 return -1;
1718 }
1719 else if (strcmp(errors,"ignore") == 0) {
1720 return 0;
1721 }
1722 else if (strcmp(errors,"replace") == 0) {
1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724 (*dest)++;
1725 return 0;
1726 }
1727 else {
1728 PyErr_Format(PyExc_ValueError,
1729 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001730 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 errors);
1732 return -1;
1733 }
1734}
1735
1736PyObject *PyUnicode_DecodeASCII(const char *s,
1737 int size,
1738 const char *errors)
1739{
1740 PyUnicodeObject *v;
1741 Py_UNICODE *p;
1742
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v = _PyUnicode_New(size);
1745 if (v == NULL)
1746 goto onError;
1747 if (size == 0)
1748 return (PyObject *)v;
1749 p = PyUnicode_AS_UNICODE(v);
1750 while (size-- > 0) {
1751 register unsigned char c;
1752
1753 c = (unsigned char)*s++;
1754 if (c < 128)
1755 *p++ = c;
1756 else if (ascii_decoding_error(&s, &p, errors,
1757 "ordinal not in range(128)"))
1758 goto onError;
1759 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return (PyObject *)v;
1764
1765 onError:
1766 Py_XDECREF(v);
1767 return NULL;
1768}
1769
1770static
1771int ascii_encoding_error(const Py_UNICODE **source,
1772 char **dest,
1773 const char *errors,
1774 const char *details)
1775{
1776 if ((errors == NULL) ||
1777 (strcmp(errors,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001779 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 details);
1781 return -1;
1782 }
1783 else if (strcmp(errors,"ignore") == 0) {
1784 return 0;
1785 }
1786 else if (strcmp(errors,"replace") == 0) {
1787 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001788 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return 0;
1790 }
1791 else {
1792 PyErr_Format(PyExc_ValueError,
1793 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 errors);
1796 return -1;
1797 }
1798}
1799
1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801 int size,
1802 const char *errors)
1803{
1804 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001805 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 repr = PyString_FromStringAndSize(NULL, size);
1807 if (repr == NULL)
1808 return NULL;
1809
1810 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001811 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 while (size-- > 0) {
1813 Py_UNICODE ch = *p++;
1814 if (ch >= 128) {
1815 if (ascii_encoding_error(&p, &s, errors,
1816 "ordinal not in range(128)"))
1817 goto onError;
1818 }
1819 else
1820 *s++ = (char)ch;
1821 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001822 /* Resize if error handling skipped some characters */
1823 if (s - start < PyString_GET_SIZE(repr))
1824 if (_PyString_Resize(&repr, s - start))
1825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 return repr;
1827
1828 onError:
1829 Py_DECREF(repr);
1830 return NULL;
1831}
1832
1833PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1834{
1835 if (!PyUnicode_Check(unicode)) {
1836 PyErr_BadArgument();
1837 return NULL;
1838 }
1839 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1840 PyUnicode_GET_SIZE(unicode),
1841 NULL);
1842}
1843
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001844#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001845
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001846/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001847
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848PyObject *PyUnicode_DecodeMBCS(const char *s,
1849 int size,
1850 const char *errors)
1851{
1852 PyUnicodeObject *v;
1853 Py_UNICODE *p;
1854
1855 /* First get the size of the result */
1856 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001857 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001858 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1859
1860 v = _PyUnicode_New(usize);
1861 if (v == NULL)
1862 return NULL;
1863 if (usize == 0)
1864 return (PyObject *)v;
1865 p = PyUnicode_AS_UNICODE(v);
1866 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1867 Py_DECREF(v);
1868 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1869 }
1870
1871 return (PyObject *)v;
1872}
1873
1874PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1875 int size,
1876 const char *errors)
1877{
1878 PyObject *repr;
1879 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001880 DWORD mbcssize;
1881
1882 /* If there are no characters, bail now! */
1883 if (size==0)
1884 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001885
1886 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001887 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888 if (mbcssize==0)
1889 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1890
1891 repr = PyString_FromStringAndSize(NULL, mbcssize);
1892 if (repr == NULL)
1893 return NULL;
1894 if (mbcssize==0)
1895 return repr;
1896
1897 /* Do the conversion */
1898 s = PyString_AS_STRING(repr);
1899 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1900 Py_DECREF(repr);
1901 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1902 }
1903 return repr;
1904}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001905
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001906#endif /* MS_WIN32 */
1907
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908/* --- Character Mapping Codec -------------------------------------------- */
1909
1910static
1911int charmap_decoding_error(const char **source,
1912 Py_UNICODE **dest,
1913 const char *errors,
1914 const char *details)
1915{
1916 if ((errors == NULL) ||
1917 (strcmp(errors,"strict") == 0)) {
1918 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001919 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 details);
1921 return -1;
1922 }
1923 else if (strcmp(errors,"ignore") == 0) {
1924 return 0;
1925 }
1926 else if (strcmp(errors,"replace") == 0) {
1927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1928 (*dest)++;
1929 return 0;
1930 }
1931 else {
1932 PyErr_Format(PyExc_ValueError,
1933 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001934 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 errors);
1936 return -1;
1937 }
1938}
1939
1940PyObject *PyUnicode_DecodeCharmap(const char *s,
1941 int size,
1942 PyObject *mapping,
1943 const char *errors)
1944{
1945 PyUnicodeObject *v;
1946 Py_UNICODE *p;
1947
1948 /* Default to Latin-1 */
1949 if (mapping == NULL)
1950 return PyUnicode_DecodeLatin1(s, size, errors);
1951
1952 v = _PyUnicode_New(size);
1953 if (v == NULL)
1954 goto onError;
1955 if (size == 0)
1956 return (PyObject *)v;
1957 p = PyUnicode_AS_UNICODE(v);
1958 while (size-- > 0) {
1959 unsigned char ch = *s++;
1960 PyObject *w, *x;
1961
1962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1963 w = PyInt_FromLong((long)ch);
1964 if (w == NULL)
1965 goto onError;
1966 x = PyObject_GetItem(mapping, w);
1967 Py_DECREF(w);
1968 if (x == NULL) {
1969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1970 /* No mapping found: default to Latin-1 mapping */
1971 PyErr_Clear();
1972 *p++ = (Py_UNICODE)ch;
1973 continue;
1974 }
1975 goto onError;
1976 }
1977
1978 /* Apply mapping */
1979 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001980 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 if (value < 0 || value > 65535) {
1982 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001983 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 Py_DECREF(x);
1985 goto onError;
1986 }
1987 *p++ = (Py_UNICODE)value;
1988 }
1989 else if (x == Py_None) {
1990 /* undefined mapping */
1991 if (charmap_decoding_error(&s, &p, errors,
1992 "character maps to <undefined>")) {
1993 Py_DECREF(x);
1994 goto onError;
1995 }
1996 }
1997 else if (PyUnicode_Check(x)) {
1998 if (PyUnicode_GET_SIZE(x) != 1) {
1999 /* 1-n mapping */
2000 PyErr_SetString(PyExc_NotImplementedError,
2001 "1-n mappings are currently not implemented");
2002 Py_DECREF(x);
2003 goto onError;
2004 }
2005 *p++ = *PyUnicode_AS_UNICODE(x);
2006 }
2007 else {
2008 /* wrong return value */
2009 PyErr_SetString(PyExc_TypeError,
2010 "character mapping must return integer, None or unicode");
2011 Py_DECREF(x);
2012 goto onError;
2013 }
2014 Py_DECREF(x);
2015 }
2016 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2017 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2018 goto onError;
2019 return (PyObject *)v;
2020
2021 onError:
2022 Py_XDECREF(v);
2023 return NULL;
2024}
2025
2026static
2027int charmap_encoding_error(const Py_UNICODE **source,
2028 char **dest,
2029 const char *errors,
2030 const char *details)
2031{
2032 if ((errors == NULL) ||
2033 (strcmp(errors,"strict") == 0)) {
2034 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002035 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 details);
2037 return -1;
2038 }
2039 else if (strcmp(errors,"ignore") == 0) {
2040 return 0;
2041 }
2042 else if (strcmp(errors,"replace") == 0) {
2043 **dest = '?';
2044 (*dest)++;
2045 return 0;
2046 }
2047 else {
2048 PyErr_Format(PyExc_ValueError,
2049 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002050 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 errors);
2052 return -1;
2053 }
2054}
2055
2056PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2057 int size,
2058 PyObject *mapping,
2059 const char *errors)
2060{
2061 PyObject *v;
2062 char *s;
2063
2064 /* Default to Latin-1 */
2065 if (mapping == NULL)
2066 return PyUnicode_EncodeLatin1(p, size, errors);
2067
2068 v = PyString_FromStringAndSize(NULL, size);
2069 if (v == NULL)
2070 return NULL;
2071 s = PyString_AS_STRING(v);
2072 while (size-- > 0) {
2073 Py_UNICODE ch = *p++;
2074 PyObject *w, *x;
2075
2076 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2077 w = PyInt_FromLong((long)ch);
2078 if (w == NULL)
2079 goto onError;
2080 x = PyObject_GetItem(mapping, w);
2081 Py_DECREF(w);
2082 if (x == NULL) {
2083 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2084 /* No mapping found: default to Latin-1 mapping if possible */
2085 PyErr_Clear();
2086 if (ch < 256) {
2087 *s++ = (char)ch;
2088 continue;
2089 }
2090 else if (!charmap_encoding_error(&p, &s, errors,
2091 "missing character mapping"))
2092 continue;
2093 }
2094 goto onError;
2095 }
2096
2097 /* Apply mapping */
2098 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002099 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 if (value < 0 || value > 255) {
2101 PyErr_SetString(PyExc_TypeError,
2102 "character mapping must be in range(256)");
2103 Py_DECREF(x);
2104 goto onError;
2105 }
2106 *s++ = (char)value;
2107 }
2108 else if (x == Py_None) {
2109 /* undefined mapping */
2110 if (charmap_encoding_error(&p, &s, errors,
2111 "character maps to <undefined>")) {
2112 Py_DECREF(x);
2113 goto onError;
2114 }
2115 }
2116 else if (PyString_Check(x)) {
2117 if (PyString_GET_SIZE(x) != 1) {
2118 /* 1-n mapping */
2119 PyErr_SetString(PyExc_NotImplementedError,
2120 "1-n mappings are currently not implemented");
2121 Py_DECREF(x);
2122 goto onError;
2123 }
2124 *s++ = *PyString_AS_STRING(x);
2125 }
2126 else {
2127 /* wrong return value */
2128 PyErr_SetString(PyExc_TypeError,
2129 "character mapping must return integer, None or unicode");
2130 Py_DECREF(x);
2131 goto onError;
2132 }
2133 Py_DECREF(x);
2134 }
2135 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2136 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2137 goto onError;
2138 return v;
2139
2140 onError:
2141 Py_DECREF(v);
2142 return NULL;
2143}
2144
2145PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2146 PyObject *mapping)
2147{
2148 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2149 PyErr_BadArgument();
2150 return NULL;
2151 }
2152 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2153 PyUnicode_GET_SIZE(unicode),
2154 mapping,
2155 NULL);
2156}
2157
2158static
2159int translate_error(const Py_UNICODE **source,
2160 Py_UNICODE **dest,
2161 const char *errors,
2162 const char *details)
2163{
2164 if ((errors == NULL) ||
2165 (strcmp(errors,"strict") == 0)) {
2166 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002167 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 details);
2169 return -1;
2170 }
2171 else if (strcmp(errors,"ignore") == 0) {
2172 return 0;
2173 }
2174 else if (strcmp(errors,"replace") == 0) {
2175 **dest = '?';
2176 (*dest)++;
2177 return 0;
2178 }
2179 else {
2180 PyErr_Format(PyExc_ValueError,
2181 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002182 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 errors);
2184 return -1;
2185 }
2186}
2187
2188PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2189 int size,
2190 PyObject *mapping,
2191 const char *errors)
2192{
2193 PyUnicodeObject *v;
2194 Py_UNICODE *p;
2195
2196 if (mapping == NULL) {
2197 PyErr_BadArgument();
2198 return NULL;
2199 }
2200
2201 /* Output will never be longer than input */
2202 v = _PyUnicode_New(size);
2203 if (v == NULL)
2204 goto onError;
2205 if (size == 0)
2206 goto done;
2207 p = PyUnicode_AS_UNICODE(v);
2208 while (size-- > 0) {
2209 Py_UNICODE ch = *s++;
2210 PyObject *w, *x;
2211
2212 /* Get mapping */
2213 w = PyInt_FromLong(ch);
2214 if (w == NULL)
2215 goto onError;
2216 x = PyObject_GetItem(mapping, w);
2217 Py_DECREF(w);
2218 if (x == NULL) {
2219 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2220 /* No mapping found: default to 1-1 mapping */
2221 PyErr_Clear();
2222 *p++ = ch;
2223 continue;
2224 }
2225 goto onError;
2226 }
2227
2228 /* Apply mapping */
2229 if (PyInt_Check(x))
2230 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2231 else if (x == Py_None) {
2232 /* undefined mapping */
2233 if (translate_error(&s, &p, errors,
2234 "character maps to <undefined>")) {
2235 Py_DECREF(x);
2236 goto onError;
2237 }
2238 }
2239 else if (PyUnicode_Check(x)) {
2240 if (PyUnicode_GET_SIZE(x) != 1) {
2241 /* 1-n mapping */
2242 PyErr_SetString(PyExc_NotImplementedError,
2243 "1-n mappings are currently not implemented");
2244 Py_DECREF(x);
2245 goto onError;
2246 }
2247 *p++ = *PyUnicode_AS_UNICODE(x);
2248 }
2249 else {
2250 /* wrong return value */
2251 PyErr_SetString(PyExc_TypeError,
2252 "translate mapping must return integer, None or unicode");
2253 Py_DECREF(x);
2254 goto onError;
2255 }
2256 Py_DECREF(x);
2257 }
2258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002259 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261
2262 done:
2263 return (PyObject *)v;
2264
2265 onError:
2266 Py_XDECREF(v);
2267 return NULL;
2268}
2269
2270PyObject *PyUnicode_Translate(PyObject *str,
2271 PyObject *mapping,
2272 const char *errors)
2273{
2274 PyObject *result;
2275
2276 str = PyUnicode_FromObject(str);
2277 if (str == NULL)
2278 goto onError;
2279 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2280 PyUnicode_GET_SIZE(str),
2281 mapping,
2282 errors);
2283 Py_DECREF(str);
2284 return result;
2285
2286 onError:
2287 Py_XDECREF(str);
2288 return NULL;
2289}
2290
Guido van Rossum9e896b32000-04-05 20:11:21 +00002291/* --- Decimal Encoder ---------------------------------------------------- */
2292
2293int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2294 int length,
2295 char *output,
2296 const char *errors)
2297{
2298 Py_UNICODE *p, *end;
2299
2300 if (output == NULL) {
2301 PyErr_BadArgument();
2302 return -1;
2303 }
2304
2305 p = s;
2306 end = s + length;
2307 while (p < end) {
2308 register Py_UNICODE ch = *p++;
2309 int decimal;
2310
2311 if (Py_UNICODE_ISSPACE(ch)) {
2312 *output++ = ' ';
2313 continue;
2314 }
2315 decimal = Py_UNICODE_TODECIMAL(ch);
2316 if (decimal >= 0) {
2317 *output++ = '0' + decimal;
2318 continue;
2319 }
Guido van Rossumba477042000-04-06 18:18:10 +00002320 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002321 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002322 continue;
2323 }
2324 /* All other characters are considered invalid */
2325 if (errors == NULL || strcmp(errors, "strict") == 0) {
2326 PyErr_SetString(PyExc_ValueError,
2327 "invalid decimal Unicode string");
2328 goto onError;
2329 }
2330 else if (strcmp(errors, "ignore") == 0)
2331 continue;
2332 else if (strcmp(errors, "replace") == 0) {
2333 *output++ = '?';
2334 continue;
2335 }
2336 }
2337 /* 0-terminate the output string */
2338 *output++ = '\0';
2339 return 0;
2340
2341 onError:
2342 return -1;
2343}
2344
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345/* --- Helpers ------------------------------------------------------------ */
2346
2347static
2348int count(PyUnicodeObject *self,
2349 int start,
2350 int end,
2351 PyUnicodeObject *substring)
2352{
2353 int count = 0;
2354
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002355 if (substring->length == 0)
2356 return (end - start + 1);
2357
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358 end -= substring->length;
2359
2360 while (start <= end)
2361 if (Py_UNICODE_MATCH(self, start, substring)) {
2362 count++;
2363 start += substring->length;
2364 } else
2365 start++;
2366
2367 return count;
2368}
2369
2370int PyUnicode_Count(PyObject *str,
2371 PyObject *substr,
2372 int start,
2373 int end)
2374{
2375 int result;
2376
2377 str = PyUnicode_FromObject(str);
2378 if (str == NULL)
2379 return -1;
2380 substr = PyUnicode_FromObject(substr);
2381 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002382 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 return -1;
2384 }
2385
2386 result = count((PyUnicodeObject *)str,
2387 start, end,
2388 (PyUnicodeObject *)substr);
2389
2390 Py_DECREF(str);
2391 Py_DECREF(substr);
2392 return result;
2393}
2394
2395static
2396int findstring(PyUnicodeObject *self,
2397 PyUnicodeObject *substring,
2398 int start,
2399 int end,
2400 int direction)
2401{
2402 if (start < 0)
2403 start += self->length;
2404 if (start < 0)
2405 start = 0;
2406
2407 if (substring->length == 0)
2408 return start;
2409
2410 if (end > self->length)
2411 end = self->length;
2412 if (end < 0)
2413 end += self->length;
2414 if (end < 0)
2415 end = 0;
2416
2417 end -= substring->length;
2418
2419 if (direction < 0) {
2420 for (; end >= start; end--)
2421 if (Py_UNICODE_MATCH(self, end, substring))
2422 return end;
2423 } else {
2424 for (; start <= end; start++)
2425 if (Py_UNICODE_MATCH(self, start, substring))
2426 return start;
2427 }
2428
2429 return -1;
2430}
2431
2432int PyUnicode_Find(PyObject *str,
2433 PyObject *substr,
2434 int start,
2435 int end,
2436 int direction)
2437{
2438 int result;
2439
2440 str = PyUnicode_FromObject(str);
2441 if (str == NULL)
2442 return -1;
2443 substr = PyUnicode_FromObject(substr);
2444 if (substr == NULL) {
2445 Py_DECREF(substr);
2446 return -1;
2447 }
2448
2449 result = findstring((PyUnicodeObject *)str,
2450 (PyUnicodeObject *)substr,
2451 start, end, direction);
2452 Py_DECREF(str);
2453 Py_DECREF(substr);
2454 return result;
2455}
2456
2457static
2458int tailmatch(PyUnicodeObject *self,
2459 PyUnicodeObject *substring,
2460 int start,
2461 int end,
2462 int direction)
2463{
2464 if (start < 0)
2465 start += self->length;
2466 if (start < 0)
2467 start = 0;
2468
2469 if (substring->length == 0)
2470 return 1;
2471
2472 if (end > self->length)
2473 end = self->length;
2474 if (end < 0)
2475 end += self->length;
2476 if (end < 0)
2477 end = 0;
2478
2479 end -= substring->length;
2480 if (end < start)
2481 return 0;
2482
2483 if (direction > 0) {
2484 if (Py_UNICODE_MATCH(self, end, substring))
2485 return 1;
2486 } else {
2487 if (Py_UNICODE_MATCH(self, start, substring))
2488 return 1;
2489 }
2490
2491 return 0;
2492}
2493
2494int PyUnicode_Tailmatch(PyObject *str,
2495 PyObject *substr,
2496 int start,
2497 int end,
2498 int direction)
2499{
2500 int result;
2501
2502 str = PyUnicode_FromObject(str);
2503 if (str == NULL)
2504 return -1;
2505 substr = PyUnicode_FromObject(substr);
2506 if (substr == NULL) {
2507 Py_DECREF(substr);
2508 return -1;
2509 }
2510
2511 result = tailmatch((PyUnicodeObject *)str,
2512 (PyUnicodeObject *)substr,
2513 start, end, direction);
2514 Py_DECREF(str);
2515 Py_DECREF(substr);
2516 return result;
2517}
2518
2519static
2520const Py_UNICODE *findchar(const Py_UNICODE *s,
2521 int size,
2522 Py_UNICODE ch)
2523{
2524 /* like wcschr, but doesn't stop at NULL characters */
2525
2526 while (size-- > 0) {
2527 if (*s == ch)
2528 return s;
2529 s++;
2530 }
2531
2532 return NULL;
2533}
2534
2535/* Apply fixfct filter to the Unicode object self and return a
2536 reference to the modified object */
2537
2538static
2539PyObject *fixup(PyUnicodeObject *self,
2540 int (*fixfct)(PyUnicodeObject *s))
2541{
2542
2543 PyUnicodeObject *u;
2544
2545 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2546 self->length);
2547 if (u == NULL)
2548 return NULL;
2549 if (!fixfct(u)) {
2550 /* fixfct should return TRUE if it modified the buffer. If
2551 FALSE, return a reference to the original buffer instead
2552 (to save space, not time) */
2553 Py_INCREF(self);
2554 Py_DECREF(u);
2555 return (PyObject*) self;
2556 }
2557 return (PyObject*) u;
2558}
2559
2560static
2561int fixupper(PyUnicodeObject *self)
2562{
2563 int len = self->length;
2564 Py_UNICODE *s = self->str;
2565 int status = 0;
2566
2567 while (len-- > 0) {
2568 register Py_UNICODE ch;
2569
2570 ch = Py_UNICODE_TOUPPER(*s);
2571 if (ch != *s) {
2572 status = 1;
2573 *s = ch;
2574 }
2575 s++;
2576 }
2577
2578 return status;
2579}
2580
2581static
2582int fixlower(PyUnicodeObject *self)
2583{
2584 int len = self->length;
2585 Py_UNICODE *s = self->str;
2586 int status = 0;
2587
2588 while (len-- > 0) {
2589 register Py_UNICODE ch;
2590
2591 ch = Py_UNICODE_TOLOWER(*s);
2592 if (ch != *s) {
2593 status = 1;
2594 *s = ch;
2595 }
2596 s++;
2597 }
2598
2599 return status;
2600}
2601
2602static
2603int fixswapcase(PyUnicodeObject *self)
2604{
2605 int len = self->length;
2606 Py_UNICODE *s = self->str;
2607 int status = 0;
2608
2609 while (len-- > 0) {
2610 if (Py_UNICODE_ISUPPER(*s)) {
2611 *s = Py_UNICODE_TOLOWER(*s);
2612 status = 1;
2613 } else if (Py_UNICODE_ISLOWER(*s)) {
2614 *s = Py_UNICODE_TOUPPER(*s);
2615 status = 1;
2616 }
2617 s++;
2618 }
2619
2620 return status;
2621}
2622
2623static
2624int fixcapitalize(PyUnicodeObject *self)
2625{
2626 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2627 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2628 return 1;
2629 }
2630 return 0;
2631}
2632
2633static
2634int fixtitle(PyUnicodeObject *self)
2635{
2636 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2637 register Py_UNICODE *e;
2638 int previous_is_cased;
2639
2640 /* Shortcut for single character strings */
2641 if (PyUnicode_GET_SIZE(self) == 1) {
2642 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2643 if (*p != ch) {
2644 *p = ch;
2645 return 1;
2646 }
2647 else
2648 return 0;
2649 }
2650
2651 e = p + PyUnicode_GET_SIZE(self);
2652 previous_is_cased = 0;
2653 for (; p < e; p++) {
2654 register const Py_UNICODE ch = *p;
2655
2656 if (previous_is_cased)
2657 *p = Py_UNICODE_TOLOWER(ch);
2658 else
2659 *p = Py_UNICODE_TOTITLE(ch);
2660
2661 if (Py_UNICODE_ISLOWER(ch) ||
2662 Py_UNICODE_ISUPPER(ch) ||
2663 Py_UNICODE_ISTITLE(ch))
2664 previous_is_cased = 1;
2665 else
2666 previous_is_cased = 0;
2667 }
2668 return 1;
2669}
2670
2671PyObject *PyUnicode_Join(PyObject *separator,
2672 PyObject *seq)
2673{
2674 Py_UNICODE *sep;
2675 int seplen;
2676 PyUnicodeObject *res = NULL;
2677 int reslen = 0;
2678 Py_UNICODE *p;
2679 int seqlen = 0;
2680 int sz = 100;
2681 int i;
2682
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002683 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (seqlen < 0 && PyErr_Occurred())
2685 return NULL;
2686
2687 if (separator == NULL) {
2688 Py_UNICODE blank = ' ';
2689 sep = &blank;
2690 seplen = 1;
2691 }
2692 else {
2693 separator = PyUnicode_FromObject(separator);
2694 if (separator == NULL)
2695 return NULL;
2696 sep = PyUnicode_AS_UNICODE(separator);
2697 seplen = PyUnicode_GET_SIZE(separator);
2698 }
2699
2700 res = _PyUnicode_New(sz);
2701 if (res == NULL)
2702 goto onError;
2703 p = PyUnicode_AS_UNICODE(res);
2704 reslen = 0;
2705
2706 for (i = 0; i < seqlen; i++) {
2707 int itemlen;
2708 PyObject *item;
2709
2710 item = PySequence_GetItem(seq, i);
2711 if (item == NULL)
2712 goto onError;
2713 if (!PyUnicode_Check(item)) {
2714 PyObject *v;
2715 v = PyUnicode_FromObject(item);
2716 Py_DECREF(item);
2717 item = v;
2718 if (item == NULL)
2719 goto onError;
2720 }
2721 itemlen = PyUnicode_GET_SIZE(item);
2722 while (reslen + itemlen + seplen >= sz) {
2723 if (_PyUnicode_Resize(res, sz*2))
2724 goto onError;
2725 sz *= 2;
2726 p = PyUnicode_AS_UNICODE(res) + reslen;
2727 }
2728 if (i > 0) {
2729 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2730 p += seplen;
2731 reslen += seplen;
2732 }
2733 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2734 p += itemlen;
2735 reslen += itemlen;
2736 Py_DECREF(item);
2737 }
2738 if (_PyUnicode_Resize(res, reslen))
2739 goto onError;
2740
2741 Py_XDECREF(separator);
2742 return (PyObject *)res;
2743
2744 onError:
2745 Py_XDECREF(separator);
2746 Py_DECREF(res);
2747 return NULL;
2748}
2749
2750static
2751PyUnicodeObject *pad(PyUnicodeObject *self,
2752 int left,
2753 int right,
2754 Py_UNICODE fill)
2755{
2756 PyUnicodeObject *u;
2757
2758 if (left < 0)
2759 left = 0;
2760 if (right < 0)
2761 right = 0;
2762
2763 if (left == 0 && right == 0) {
2764 Py_INCREF(self);
2765 return self;
2766 }
2767
2768 u = _PyUnicode_New(left + self->length + right);
2769 if (u) {
2770 if (left)
2771 Py_UNICODE_FILL(u->str, fill, left);
2772 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2773 if (right)
2774 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2775 }
2776
2777 return u;
2778}
2779
/* Append the slice data[left:right] as a new Unicode object to the list.

   Relies on the expansion context to provide the variables `list`
   (target list) and `str` (PyObject * scratch variable) and a label
   `onError`, which is jumped to if the slice cannot be created or
   appended.  Wrapped in do { } while (0) so that it behaves like a
   single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2790
2791static
2792PyObject *split_whitespace(PyUnicodeObject *self,
2793 PyObject *list,
2794 int maxcount)
2795{
2796 register int i;
2797 register int j;
2798 int len = self->length;
2799 PyObject *str;
2800
2801 for (i = j = 0; i < len; ) {
2802 /* find a token */
2803 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2804 i++;
2805 j = i;
2806 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2807 i++;
2808 if (j < i) {
2809 if (maxcount-- <= 0)
2810 break;
2811 SPLIT_APPEND(self->str, j, i);
2812 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2813 i++;
2814 j = i;
2815 }
2816 }
2817 if (j < len) {
2818 SPLIT_APPEND(self->str, j, len);
2819 }
2820 return list;
2821
2822 onError:
2823 Py_DECREF(list);
2824 return NULL;
2825}
2826
2827PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002828 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829{
2830 register int i;
2831 register int j;
2832 int len;
2833 PyObject *list;
2834 PyObject *str;
2835 Py_UNICODE *data;
2836
2837 string = PyUnicode_FromObject(string);
2838 if (string == NULL)
2839 return NULL;
2840 data = PyUnicode_AS_UNICODE(string);
2841 len = PyUnicode_GET_SIZE(string);
2842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 list = PyList_New(0);
2844 if (!list)
2845 goto onError;
2846
2847 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002848 int eol;
2849
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 /* Find a line and append it */
2851 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2852 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853
2854 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002855 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (i < len) {
2857 if (data[i] == '\r' && i + 1 < len &&
2858 data[i+1] == '\n')
2859 i += 2;
2860 else
2861 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002862 if (keepends)
2863 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 }
Guido van Rossum86662912000-04-11 15:38:46 +00002865 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 j = i;
2867 }
2868 if (j < len) {
2869 SPLIT_APPEND(data, j, len);
2870 }
2871
2872 Py_DECREF(string);
2873 return list;
2874
2875 onError:
2876 Py_DECREF(list);
2877 Py_DECREF(string);
2878 return NULL;
2879}
2880
2881static
2882PyObject *split_char(PyUnicodeObject *self,
2883 PyObject *list,
2884 Py_UNICODE ch,
2885 int maxcount)
2886{
2887 register int i;
2888 register int j;
2889 int len = self->length;
2890 PyObject *str;
2891
2892 for (i = j = 0; i < len; ) {
2893 if (self->str[i] == ch) {
2894 if (maxcount-- <= 0)
2895 break;
2896 SPLIT_APPEND(self->str, j, i);
2897 i = j = i + 1;
2898 } else
2899 i++;
2900 }
2901 if (j <= len) {
2902 SPLIT_APPEND(self->str, j, len);
2903 }
2904 return list;
2905
2906 onError:
2907 Py_DECREF(list);
2908 return NULL;
2909}
2910
2911static
2912PyObject *split_substring(PyUnicodeObject *self,
2913 PyObject *list,
2914 PyUnicodeObject *substring,
2915 int maxcount)
2916{
2917 register int i;
2918 register int j;
2919 int len = self->length;
2920 int sublen = substring->length;
2921 PyObject *str;
2922
2923 for (i = j = 0; i < len - sublen; ) {
2924 if (Py_UNICODE_MATCH(self, i, substring)) {
2925 if (maxcount-- <= 0)
2926 break;
2927 SPLIT_APPEND(self->str, j, i);
2928 i = j = i + sublen;
2929 } else
2930 i++;
2931 }
2932 if (j <= len) {
2933 SPLIT_APPEND(self->str, j, len);
2934 }
2935 return list;
2936
2937 onError:
2938 Py_DECREF(list);
2939 return NULL;
2940}
2941
2942#undef SPLIT_APPEND
2943
2944static
2945PyObject *split(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int maxcount)
2948{
2949 PyObject *list;
2950
2951 if (maxcount < 0)
2952 maxcount = INT_MAX;
2953
2954 list = PyList_New(0);
2955 if (!list)
2956 return NULL;
2957
2958 if (substring == NULL)
2959 return split_whitespace(self,list,maxcount);
2960
2961 else if (substring->length == 1)
2962 return split_char(self,list,substring->str[0],maxcount);
2963
2964 else if (substring->length == 0) {
2965 Py_DECREF(list);
2966 PyErr_SetString(PyExc_ValueError, "empty separator");
2967 return NULL;
2968 }
2969 else
2970 return split_substring(self,list,substring,maxcount);
2971}
2972
2973static
2974PyObject *strip(PyUnicodeObject *self,
2975 int left,
2976 int right)
2977{
2978 Py_UNICODE *p = self->str;
2979 int start = 0;
2980 int end = self->length;
2981
2982 if (left)
2983 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2984 start++;
2985
2986 if (right)
2987 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2988 end--;
2989
2990 if (start == 0 && end == self->length) {
2991 /* couldn't strip anything off, return original string */
2992 Py_INCREF(self);
2993 return (PyObject*) self;
2994 }
2995
2996 return (PyObject*) PyUnicode_FromUnicode(
2997 self->str + start,
2998 end - start
2999 );
3000}
3001
3002static
3003PyObject *replace(PyUnicodeObject *self,
3004 PyUnicodeObject *str1,
3005 PyUnicodeObject *str2,
3006 int maxcount)
3007{
3008 PyUnicodeObject *u;
3009
3010 if (maxcount < 0)
3011 maxcount = INT_MAX;
3012
3013 if (str1->length == 1 && str2->length == 1) {
3014 int i;
3015
3016 /* replace characters */
3017 if (!findchar(self->str, self->length, str1->str[0])) {
3018 /* nothing to replace, return original string */
3019 Py_INCREF(self);
3020 u = self;
3021 } else {
3022 Py_UNICODE u1 = str1->str[0];
3023 Py_UNICODE u2 = str2->str[0];
3024
3025 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3026 self->str,
3027 self->length
3028 );
3029 if (u)
3030 for (i = 0; i < u->length; i++)
3031 if (u->str[i] == u1) {
3032 if (--maxcount < 0)
3033 break;
3034 u->str[i] = u2;
3035 }
3036 }
3037
3038 } else {
3039 int n, i;
3040 Py_UNICODE *p;
3041
3042 /* replace strings */
3043 n = count(self, 0, self->length, str1);
3044 if (n > maxcount)
3045 n = maxcount;
3046 if (n == 0) {
3047 /* nothing to replace, return original string */
3048 Py_INCREF(self);
3049 u = self;
3050 } else {
3051 u = _PyUnicode_New(
3052 self->length + n * (str2->length - str1->length));
3053 if (u) {
3054 i = 0;
3055 p = u->str;
3056 while (i <= self->length - str1->length)
3057 if (Py_UNICODE_MATCH(self, i, str1)) {
3058 /* replace string segment */
3059 Py_UNICODE_COPY(p, str2->str, str2->length);
3060 p += str2->length;
3061 i += str1->length;
3062 if (--n <= 0) {
3063 /* copy remaining part */
3064 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3065 break;
3066 }
3067 } else
3068 *p++ = self->str[i++];
3069 }
3070 }
3071 }
3072
3073 return (PyObject *) u;
3074}
3075
3076/* --- Unicode Object Methods --------------------------------------------- */
3077
3078static char title__doc__[] =
3079"S.title() -> unicode\n\
3080\n\
3081Return a titlecased version of S, i.e. words start with title case\n\
3082characters, all remaining cased characters have lower case.";
3083
3084static PyObject*
3085unicode_title(PyUnicodeObject *self, PyObject *args)
3086{
3087 if (!PyArg_NoArgs(args))
3088 return NULL;
3089 return fixup(self, fixtitle);
3090}
3091
3092static char capitalize__doc__[] =
3093"S.capitalize() -> unicode\n\
3094\n\
3095Return a capitalized version of S, i.e. make the first character\n\
3096have upper case.";
3097
3098static PyObject*
3099unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3100{
3101 if (!PyArg_NoArgs(args))
3102 return NULL;
3103 return fixup(self, fixcapitalize);
3104}
3105
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split self at
   whitespace, capitalize every word and join the words with single
   spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed;

        fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                      fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3146
3147static char center__doc__[] =
3148"S.center(width) -> unicode\n\
3149\n\
3150Return S centered in a Unicode string of length width. Padding is done\n\
3151using spaces.";
3152
3153static PyObject *
3154unicode_center(PyUnicodeObject *self, PyObject *args)
3155{
3156 int marg, left;
3157 int width;
3158
3159 if (!PyArg_ParseTuple(args, "i:center", &width))
3160 return NULL;
3161
3162 if (self->length >= width) {
3163 Py_INCREF(self);
3164 return (PyObject*) self;
3165 }
3166
3167 marg = width - self->length;
3168 left = marg / 2 + (marg & width & 1);
3169
3170 return (PyObject*) pad(self, left, marg - left, ' ');
3171}
3172
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003173/* speedy UTF-16 code point order comparison */
3174/* gleaned from: */
3175/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3176
/* Offsets indexed by the top five bits (value >> 11) of a 16-bit code
   unit.  Entry 27 (units 0xD800-0xDFFF) is +0x2000, entries 28-31
   (units 0xE000-0xFFFF) are -0x800, all other entries are 0. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */  0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */  0, 0, 0,
    /* 27       */  0x2000,
    /* 28 .. 31 */  -0x800, -0x800, -0x800, -0x800
};
3184
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185static int
3186unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3187{
3188 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003189
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190 Py_UNICODE *s1 = str1->str;
3191 Py_UNICODE *s2 = str2->str;
3192
3193 len1 = str1->length;
3194 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003195
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003197 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003198 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003199
3200 c1 = *s1++;
3201 c2 = *s2++;
3202 if (c1 > (1<<11) * 26)
3203 c1 += utf16Fixup[c1>>11];
3204 if (c2 > (1<<11) * 26)
3205 c2 += utf16Fixup[c2>>11];
3206
3207 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003208 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003209 if (diff)
3210 return (diff < 0) ? -1 : (diff != 0);
3211 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 }
3213
3214 return (len1 < len2) ? -1 : (len1 != len2);
3215}
3216
3217int PyUnicode_Compare(PyObject *left,
3218 PyObject *right)
3219{
3220 PyUnicodeObject *u = NULL, *v = NULL;
3221 int result;
3222
3223 /* Coerce the two arguments */
3224 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3225 if (u == NULL)
3226 goto onError;
3227 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3228 if (v == NULL)
3229 goto onError;
3230
Thomas Wouters7e474022000-07-16 12:04:32 +00003231 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 if (v == u) {
3233 Py_DECREF(u);
3234 Py_DECREF(v);
3235 return 0;
3236 }
3237
3238 result = unicode_compare(u, v);
3239
3240 Py_DECREF(u);
3241 Py_DECREF(v);
3242 return result;
3243
3244onError:
3245 Py_XDECREF(u);
3246 Py_XDECREF(v);
3247 return -1;
3248}
3249
Guido van Rossum403d68b2000-03-13 15:55:09 +00003250int PyUnicode_Contains(PyObject *container,
3251 PyObject *element)
3252{
3253 PyUnicodeObject *u = NULL, *v = NULL;
3254 int result;
3255 register const Py_UNICODE *p, *e;
3256 register Py_UNICODE ch;
3257
3258 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003259 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003260 if (v == NULL) {
3261 PyErr_SetString(PyExc_TypeError,
3262 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003263 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003264 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003265 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3266 if (u == NULL) {
3267 Py_DECREF(v);
3268 goto onError;
3269 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003270
3271 /* Check v in u */
3272 if (PyUnicode_GET_SIZE(v) != 1) {
3273 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003274 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003275 goto onError;
3276 }
3277 ch = *PyUnicode_AS_UNICODE(v);
3278 p = PyUnicode_AS_UNICODE(u);
3279 e = p + PyUnicode_GET_SIZE(u);
3280 result = 0;
3281 while (p < e) {
3282 if (*p++ == ch) {
3283 result = 1;
3284 break;
3285 }
3286 }
3287
3288 Py_DECREF(u);
3289 Py_DECREF(v);
3290 return result;
3291
3292onError:
3293 Py_XDECREF(u);
3294 Py_XDECREF(v);
3295 return -1;
3296}
3297
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298/* Concat to string or Unicode object giving a new Unicode object. */
3299
3300PyObject *PyUnicode_Concat(PyObject *left,
3301 PyObject *right)
3302{
3303 PyUnicodeObject *u = NULL, *v = NULL, *w;
3304
3305 /* Coerce the two arguments */
3306 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3307 if (u == NULL)
3308 goto onError;
3309 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3310 if (v == NULL)
3311 goto onError;
3312
3313 /* Shortcuts */
3314 if (v == unicode_empty) {
3315 Py_DECREF(v);
3316 return (PyObject *)u;
3317 }
3318 if (u == unicode_empty) {
3319 Py_DECREF(u);
3320 return (PyObject *)v;
3321 }
3322
3323 /* Concat the two Unicode strings */
3324 w = _PyUnicode_New(u->length + v->length);
3325 if (w == NULL)
3326 goto onError;
3327 Py_UNICODE_COPY(w->str, u->str, u->length);
3328 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3329
3330 Py_DECREF(u);
3331 Py_DECREF(v);
3332 return (PyObject *)w;
3333
3334onError:
3335 Py_XDECREF(u);
3336 Py_XDECREF(v);
3337 return NULL;
3338}
3339
3340static char count__doc__[] =
3341"S.count(sub[, start[, end]]) -> int\n\
3342\n\
3343Return the number of occurrences of substring sub in Unicode string\n\
3344S[start:end]. Optional arguments start and end are\n\
3345interpreted as in slice notation.";
3346
3347static PyObject *
3348unicode_count(PyUnicodeObject *self, PyObject *args)
3349{
3350 PyUnicodeObject *substring;
3351 int start = 0;
3352 int end = INT_MAX;
3353 PyObject *result;
3354
Guido van Rossumb8872e62000-05-09 14:14:27 +00003355 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3356 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 return NULL;
3358
3359 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3360 (PyObject *)substring);
3361 if (substring == NULL)
3362 return NULL;
3363
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 if (start < 0)
3365 start += self->length;
3366 if (start < 0)
3367 start = 0;
3368 if (end > self->length)
3369 end = self->length;
3370 if (end < 0)
3371 end += self->length;
3372 if (end < 0)
3373 end = 0;
3374
3375 result = PyInt_FromLong((long) count(self, start, end, substring));
3376
3377 Py_DECREF(substring);
3378 return result;
3379}
3380
3381static char encode__doc__[] =
3382"S.encode([encoding[,errors]]) -> string\n\
3383\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003384Return an encoded string version of S. Default encoding is the current\n\
3385default string encoding. errors may be given to set a different error\n\
3386handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3387a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388
3389static PyObject *
3390unicode_encode(PyUnicodeObject *self, PyObject *args)
3391{
3392 char *encoding = NULL;
3393 char *errors = NULL;
3394 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3395 return NULL;
3396 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3397}
3398
3399static char expandtabs__doc__[] =
3400"S.expandtabs([tabsize]) -> unicode\n\
3401\n\
3402Return a copy of S where all tab characters are expanded using spaces.\n\
3403If tabsize is not given, a tab size of 8 characters is assumed.";
3404
3405static PyObject*
3406unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3407{
3408 Py_UNICODE *e;
3409 Py_UNICODE *p;
3410 Py_UNICODE *q;
3411 int i, j;
3412 PyUnicodeObject *u;
3413 int tabsize = 8;
3414
3415 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3416 return NULL;
3417
Thomas Wouters7e474022000-07-16 12:04:32 +00003418 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 i = j = 0;
3420 e = self->str + self->length;
3421 for (p = self->str; p < e; p++)
3422 if (*p == '\t') {
3423 if (tabsize > 0)
3424 j += tabsize - (j % tabsize);
3425 }
3426 else {
3427 j++;
3428 if (*p == '\n' || *p == '\r') {
3429 i += j;
3430 j = 0;
3431 }
3432 }
3433
3434 /* Second pass: create output string and fill it */
3435 u = _PyUnicode_New(i + j);
3436 if (!u)
3437 return NULL;
3438
3439 j = 0;
3440 q = u->str;
3441
3442 for (p = self->str; p < e; p++)
3443 if (*p == '\t') {
3444 if (tabsize > 0) {
3445 i = tabsize - (j % tabsize);
3446 j += i;
3447 while (i--)
3448 *q++ = ' ';
3449 }
3450 }
3451 else {
3452 j++;
3453 *q++ = *p;
3454 if (*p == '\n' || *p == '\r')
3455 j = 0;
3456 }
3457
3458 return (PyObject*) u;
3459}
3460
3461static char find__doc__[] =
3462"S.find(sub [,start [,end]]) -> int\n\
3463\n\
3464Return the lowest index in S where substring sub is found,\n\
3465such that sub is contained within s[start,end]. Optional\n\
3466arguments start and end are interpreted as in slice notation.\n\
3467\n\
3468Return -1 on failure.";
3469
3470static PyObject *
3471unicode_find(PyUnicodeObject *self, PyObject *args)
3472{
3473 PyUnicodeObject *substring;
3474 int start = 0;
3475 int end = INT_MAX;
3476 PyObject *result;
3477
Guido van Rossumb8872e62000-05-09 14:14:27 +00003478 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3479 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 return NULL;
3481 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3482 (PyObject *)substring);
3483 if (substring == NULL)
3484 return NULL;
3485
3486 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3487
3488 Py_DECREF(substring);
3489 return result;
3490}
3491
3492static PyObject *
3493unicode_getitem(PyUnicodeObject *self, int index)
3494{
3495 if (index < 0 || index >= self->length) {
3496 PyErr_SetString(PyExc_IndexError, "string index out of range");
3497 return NULL;
3498 }
3499
3500 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3501}
3502
3503static long
3504unicode_hash(PyUnicodeObject *self)
3505{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003506 /* Since Unicode objects compare equal to their ASCII string
3507 counterparts, they should use the individual character values
3508 as basis for their hash value. This is needed to assure that
3509 strings and Unicode objects behave in the same way as
3510 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511
Fredrik Lundhdde61642000-07-10 18:27:47 +00003512 register int len;
3513 register Py_UNICODE *p;
3514 register long x;
3515
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 if (self->hash != -1)
3517 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003518 len = PyUnicode_GET_SIZE(self);
3519 p = PyUnicode_AS_UNICODE(self);
3520 x = *p << 7;
3521 while (--len >= 0)
3522 x = (1000003*x) ^ *p++;
3523 x ^= PyUnicode_GET_SIZE(self);
3524 if (x == -1)
3525 x = -2;
3526 self->hash = x;
3527 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528}
3529
3530static char index__doc__[] =
3531"S.index(sub [,start [,end]]) -> int\n\
3532\n\
3533Like S.find() but raise ValueError when the substring is not found.";
3534
3535static PyObject *
3536unicode_index(PyUnicodeObject *self, PyObject *args)
3537{
3538 int result;
3539 PyUnicodeObject *substring;
3540 int start = 0;
3541 int end = INT_MAX;
3542
Guido van Rossumb8872e62000-05-09 14:14:27 +00003543 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3544 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 return NULL;
3546
3547 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3548 (PyObject *)substring);
3549 if (substring == NULL)
3550 return NULL;
3551
3552 result = findstring(self, substring, start, end, 1);
3553
3554 Py_DECREF(substring);
3555 if (result < 0) {
3556 PyErr_SetString(PyExc_ValueError, "substring not found");
3557 return NULL;
3558 }
3559 return PyInt_FromLong(result);
3560}
3561
3562static char islower__doc__[] =
3563"S.islower() -> int\n\
3564\n\
3565Return 1 if all cased characters in S are lowercase and there is\n\
3566at least one cased character in S, 0 otherwise.";
3567
3568static PyObject*
3569unicode_islower(PyUnicodeObject *self, PyObject *args)
3570{
3571 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3572 register const Py_UNICODE *e;
3573 int cased;
3574
3575 if (!PyArg_NoArgs(args))
3576 return NULL;
3577
3578 /* Shortcut for single character strings */
3579 if (PyUnicode_GET_SIZE(self) == 1)
3580 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3581
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003582 /* Special case for empty strings */
3583 if (PyString_GET_SIZE(self) == 0)
3584 return PyInt_FromLong(0);
3585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 e = p + PyUnicode_GET_SIZE(self);
3587 cased = 0;
3588 for (; p < e; p++) {
3589 register const Py_UNICODE ch = *p;
3590
3591 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3592 return PyInt_FromLong(0);
3593 else if (!cased && Py_UNICODE_ISLOWER(ch))
3594 cased = 1;
3595 }
3596 return PyInt_FromLong(cased);
3597}
3598
3599static char isupper__doc__[] =
3600"S.isupper() -> int\n\
3601\n\
3602Return 1 if all cased characters in S are uppercase and there is\n\
3603at least one cased character in S, 0 otherwise.";
3604
3605static PyObject*
3606unicode_isupper(PyUnicodeObject *self, PyObject *args)
3607{
3608 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3609 register const Py_UNICODE *e;
3610 int cased;
3611
3612 if (!PyArg_NoArgs(args))
3613 return NULL;
3614
3615 /* Shortcut for single character strings */
3616 if (PyUnicode_GET_SIZE(self) == 1)
3617 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3618
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003619 /* Special case for empty strings */
3620 if (PyString_GET_SIZE(self) == 0)
3621 return PyInt_FromLong(0);
3622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 e = p + PyUnicode_GET_SIZE(self);
3624 cased = 0;
3625 for (; p < e; p++) {
3626 register const Py_UNICODE ch = *p;
3627
3628 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3629 return PyInt_FromLong(0);
3630 else if (!cased && Py_UNICODE_ISUPPER(ch))
3631 cased = 1;
3632 }
3633 return PyInt_FromLong(cased);
3634}
3635
3636static char istitle__doc__[] =
3637"S.istitle() -> int\n\
3638\n\
3639Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3640may only follow uncased characters and lowercase characters only cased\n\
3641ones. Return 0 otherwise.";
3642
3643static PyObject*
3644unicode_istitle(PyUnicodeObject *self, PyObject *args)
3645{
3646 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3647 register const Py_UNICODE *e;
3648 int cased, previous_is_cased;
3649
3650 if (!PyArg_NoArgs(args))
3651 return NULL;
3652
3653 /* Shortcut for single character strings */
3654 if (PyUnicode_GET_SIZE(self) == 1)
3655 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3656 (Py_UNICODE_ISUPPER(*p) != 0));
3657
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003658 /* Special case for empty strings */
3659 if (PyString_GET_SIZE(self) == 0)
3660 return PyInt_FromLong(0);
3661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 e = p + PyUnicode_GET_SIZE(self);
3663 cased = 0;
3664 previous_is_cased = 0;
3665 for (; p < e; p++) {
3666 register const Py_UNICODE ch = *p;
3667
3668 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3669 if (previous_is_cased)
3670 return PyInt_FromLong(0);
3671 previous_is_cased = 1;
3672 cased = 1;
3673 }
3674 else if (Py_UNICODE_ISLOWER(ch)) {
3675 if (!previous_is_cased)
3676 return PyInt_FromLong(0);
3677 previous_is_cased = 1;
3678 cased = 1;
3679 }
3680 else
3681 previous_is_cased = 0;
3682 }
3683 return PyInt_FromLong(cased);
3684}
3685
3686static char isspace__doc__[] =
3687"S.isspace() -> int\n\
3688\n\
3689Return 1 if there are only whitespace characters in S,\n\
36900 otherwise.";
3691
3692static PyObject*
3693unicode_isspace(PyUnicodeObject *self, PyObject *args)
3694{
3695 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3696 register const Py_UNICODE *e;
3697
3698 if (!PyArg_NoArgs(args))
3699 return NULL;
3700
3701 /* Shortcut for single character strings */
3702 if (PyUnicode_GET_SIZE(self) == 1 &&
3703 Py_UNICODE_ISSPACE(*p))
3704 return PyInt_FromLong(1);
3705
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003706 /* Special case for empty strings */
3707 if (PyString_GET_SIZE(self) == 0)
3708 return PyInt_FromLong(0);
3709
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 e = p + PyUnicode_GET_SIZE(self);
3711 for (; p < e; p++) {
3712 if (!Py_UNICODE_ISSPACE(*p))
3713 return PyInt_FromLong(0);
3714 }
3715 return PyInt_FromLong(1);
3716}
3717
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003718static char isalpha__doc__[] =
3719"S.isalpha() -> int\n\
3720\n\
3721Return 1 if all characters in S are alphabetic\n\
3722and there is at least one character in S, 0 otherwise.";
3723
3724static PyObject*
3725unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3726{
3727 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3728 register const Py_UNICODE *e;
3729
3730 if (!PyArg_NoArgs(args))
3731 return NULL;
3732
3733 /* Shortcut for single character strings */
3734 if (PyUnicode_GET_SIZE(self) == 1 &&
3735 Py_UNICODE_ISALPHA(*p))
3736 return PyInt_FromLong(1);
3737
3738 /* Special case for empty strings */
3739 if (PyString_GET_SIZE(self) == 0)
3740 return PyInt_FromLong(0);
3741
3742 e = p + PyUnicode_GET_SIZE(self);
3743 for (; p < e; p++) {
3744 if (!Py_UNICODE_ISALPHA(*p))
3745 return PyInt_FromLong(0);
3746 }
3747 return PyInt_FromLong(1);
3748}
3749
3750static char isalnum__doc__[] =
3751"S.isalnum() -> int\n\
3752\n\
3753Return 1 if all characters in S are alphanumeric\n\
3754and there is at least one character in S, 0 otherwise.";
3755
3756static PyObject*
3757unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3758{
3759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3760 register const Py_UNICODE *e;
3761
3762 if (!PyArg_NoArgs(args))
3763 return NULL;
3764
3765 /* Shortcut for single character strings */
3766 if (PyUnicode_GET_SIZE(self) == 1 &&
3767 Py_UNICODE_ISALNUM(*p))
3768 return PyInt_FromLong(1);
3769
3770 /* Special case for empty strings */
3771 if (PyString_GET_SIZE(self) == 0)
3772 return PyInt_FromLong(0);
3773
3774 e = p + PyUnicode_GET_SIZE(self);
3775 for (; p < e; p++) {
3776 if (!Py_UNICODE_ISALNUM(*p))
3777 return PyInt_FromLong(0);
3778 }
3779 return PyInt_FromLong(1);
3780}
3781
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782static char isdecimal__doc__[] =
3783"S.isdecimal() -> int\n\
3784\n\
3785Return 1 if there are only decimal characters in S,\n\
37860 otherwise.";
3787
3788static PyObject*
3789unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3790{
3791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3792 register const Py_UNICODE *e;
3793
3794 if (!PyArg_NoArgs(args))
3795 return NULL;
3796
3797 /* Shortcut for single character strings */
3798 if (PyUnicode_GET_SIZE(self) == 1 &&
3799 Py_UNICODE_ISDECIMAL(*p))
3800 return PyInt_FromLong(1);
3801
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003802 /* Special case for empty strings */
3803 if (PyString_GET_SIZE(self) == 0)
3804 return PyInt_FromLong(0);
3805
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 e = p + PyUnicode_GET_SIZE(self);
3807 for (; p < e; p++) {
3808 if (!Py_UNICODE_ISDECIMAL(*p))
3809 return PyInt_FromLong(0);
3810 }
3811 return PyInt_FromLong(1);
3812}
3813
3814static char isdigit__doc__[] =
3815"S.isdigit() -> int\n\
3816\n\
3817Return 1 if there are only digit characters in S,\n\
38180 otherwise.";
3819
3820static PyObject*
3821unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3822{
3823 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3824 register const Py_UNICODE *e;
3825
3826 if (!PyArg_NoArgs(args))
3827 return NULL;
3828
3829 /* Shortcut for single character strings */
3830 if (PyUnicode_GET_SIZE(self) == 1 &&
3831 Py_UNICODE_ISDIGIT(*p))
3832 return PyInt_FromLong(1);
3833
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003834 /* Special case for empty strings */
3835 if (PyString_GET_SIZE(self) == 0)
3836 return PyInt_FromLong(0);
3837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 e = p + PyUnicode_GET_SIZE(self);
3839 for (; p < e; p++) {
3840 if (!Py_UNICODE_ISDIGIT(*p))
3841 return PyInt_FromLong(0);
3842 }
3843 return PyInt_FromLong(1);
3844}
3845
3846static char isnumeric__doc__[] =
3847"S.isnumeric() -> int\n\
3848\n\
3849Return 1 if there are only numeric characters in S,\n\
38500 otherwise.";
3851
3852static PyObject*
3853unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3854{
3855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3856 register const Py_UNICODE *e;
3857
3858 if (!PyArg_NoArgs(args))
3859 return NULL;
3860
3861 /* Shortcut for single character strings */
3862 if (PyUnicode_GET_SIZE(self) == 1 &&
3863 Py_UNICODE_ISNUMERIC(*p))
3864 return PyInt_FromLong(1);
3865
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003866 /* Special case for empty strings */
3867 if (PyString_GET_SIZE(self) == 0)
3868 return PyInt_FromLong(0);
3869
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 e = p + PyUnicode_GET_SIZE(self);
3871 for (; p < e; p++) {
3872 if (!Py_UNICODE_ISNUMERIC(*p))
3873 return PyInt_FromLong(0);
3874 }
3875 return PyInt_FromLong(1);
3876}
3877
3878static char join__doc__[] =
3879"S.join(sequence) -> unicode\n\
3880\n\
3881Return a string which is the concatenation of the strings in the\n\
3882sequence. The separator between elements is S.";
3883
3884static PyObject*
3885unicode_join(PyUnicodeObject *self, PyObject *args)
3886{
3887 PyObject *data;
3888 if (!PyArg_ParseTuple(args, "O:join", &data))
3889 return NULL;
3890
3891 return PyUnicode_Join((PyObject *)self, data);
3892}
3893
3894static int
3895unicode_length(PyUnicodeObject *self)
3896{
3897 return self->length;
3898}
3899
3900static char ljust__doc__[] =
3901"S.ljust(width) -> unicode\n\
3902\n\
3903Return S left justified in a Unicode string of length width. Padding is\n\
3904done using spaces.";
3905
3906static PyObject *
3907unicode_ljust(PyUnicodeObject *self, PyObject *args)
3908{
3909 int width;
3910 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3911 return NULL;
3912
3913 if (self->length >= width) {
3914 Py_INCREF(self);
3915 return (PyObject*) self;
3916 }
3917
3918 return (PyObject*) pad(self, 0, width - self->length, ' ');
3919}
3920
3921static char lower__doc__[] =
3922"S.lower() -> unicode\n\
3923\n\
3924Return a copy of the string S converted to lowercase.";
3925
3926static PyObject*
3927unicode_lower(PyUnicodeObject *self, PyObject *args)
3928{
3929 if (!PyArg_NoArgs(args))
3930 return NULL;
3931 return fixup(self, fixlower);
3932}
3933
3934static char lstrip__doc__[] =
3935"S.lstrip() -> unicode\n\
3936\n\
3937Return a copy of the string S with leading whitespace removed.";
3938
3939static PyObject *
3940unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3941{
3942 if (!PyArg_NoArgs(args))
3943 return NULL;
3944 return strip(self, 1, 0);
3945}
3946
3947static PyObject*
3948unicode_repeat(PyUnicodeObject *str, int len)
3949{
3950 PyUnicodeObject *u;
3951 Py_UNICODE *p;
3952
3953 if (len < 0)
3954 len = 0;
3955
3956 if (len == 1) {
3957 /* no repeat, return original string */
3958 Py_INCREF(str);
3959 return (PyObject*) str;
3960 }
3961
3962 u = _PyUnicode_New(len * str->length);
3963 if (!u)
3964 return NULL;
3965
3966 p = u->str;
3967
3968 while (len-- > 0) {
3969 Py_UNICODE_COPY(p, str->str, str->length);
3970 p += str->length;
3971 }
3972
3973 return (PyObject*) u;
3974}
3975
3976PyObject *PyUnicode_Replace(PyObject *obj,
3977 PyObject *subobj,
3978 PyObject *replobj,
3979 int maxcount)
3980{
3981 PyObject *self;
3982 PyObject *str1;
3983 PyObject *str2;
3984 PyObject *result;
3985
3986 self = PyUnicode_FromObject(obj);
3987 if (self == NULL)
3988 return NULL;
3989 str1 = PyUnicode_FromObject(subobj);
3990 if (str1 == NULL) {
3991 Py_DECREF(self);
3992 return NULL;
3993 }
3994 str2 = PyUnicode_FromObject(replobj);
3995 if (str2 == NULL) {
3996 Py_DECREF(self);
3997 Py_DECREF(str1);
3998 return NULL;
3999 }
4000 result = replace((PyUnicodeObject *)self,
4001 (PyUnicodeObject *)str1,
4002 (PyUnicodeObject *)str2,
4003 maxcount);
4004 Py_DECREF(self);
4005 Py_DECREF(str1);
4006 Py_DECREF(str2);
4007 return result;
4008}
4009
4010static char replace__doc__[] =
4011"S.replace (old, new[, maxsplit]) -> unicode\n\
4012\n\
4013Return a copy of S with all occurrences of substring\n\
4014old replaced by new. If the optional argument maxsplit is\n\
4015given, only the first maxsplit occurrences are replaced.";
4016
4017static PyObject*
4018unicode_replace(PyUnicodeObject *self, PyObject *args)
4019{
4020 PyUnicodeObject *str1;
4021 PyUnicodeObject *str2;
4022 int maxcount = -1;
4023 PyObject *result;
4024
4025 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4026 return NULL;
4027 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4028 if (str1 == NULL)
4029 return NULL;
4030 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4031 if (str2 == NULL)
4032 return NULL;
4033
4034 result = replace(self, str1, str2, maxcount);
4035
4036 Py_DECREF(str1);
4037 Py_DECREF(str2);
4038 return result;
4039}
4040
4041static
4042PyObject *unicode_repr(PyObject *unicode)
4043{
4044 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4045 PyUnicode_GET_SIZE(unicode),
4046 1);
4047}
4048
4049static char rfind__doc__[] =
4050"S.rfind(sub [,start [,end]]) -> int\n\
4051\n\
4052Return the highest index in S where substring sub is found,\n\
4053such that sub is contained within s[start,end]. Optional\n\
4054arguments start and end are interpreted as in slice notation.\n\
4055\n\
4056Return -1 on failure.";
4057
4058static PyObject *
4059unicode_rfind(PyUnicodeObject *self, PyObject *args)
4060{
4061 PyUnicodeObject *substring;
4062 int start = 0;
4063 int end = INT_MAX;
4064 PyObject *result;
4065
Guido van Rossumb8872e62000-05-09 14:14:27 +00004066 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4067 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 return NULL;
4069 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4070 (PyObject *)substring);
4071 if (substring == NULL)
4072 return NULL;
4073
4074 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4075
4076 Py_DECREF(substring);
4077 return result;
4078}
4079
4080static char rindex__doc__[] =
4081"S.rindex(sub [,start [,end]]) -> int\n\
4082\n\
4083Like S.rfind() but raise ValueError when the substring is not found.";
4084
4085static PyObject *
4086unicode_rindex(PyUnicodeObject *self, PyObject *args)
4087{
4088 int result;
4089 PyUnicodeObject *substring;
4090 int start = 0;
4091 int end = INT_MAX;
4092
Guido van Rossumb8872e62000-05-09 14:14:27 +00004093 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4094 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 return NULL;
4096 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4097 (PyObject *)substring);
4098 if (substring == NULL)
4099 return NULL;
4100
4101 result = findstring(self, substring, start, end, -1);
4102
4103 Py_DECREF(substring);
4104 if (result < 0) {
4105 PyErr_SetString(PyExc_ValueError, "substring not found");
4106 return NULL;
4107 }
4108 return PyInt_FromLong(result);
4109}
4110
4111static char rjust__doc__[] =
4112"S.rjust(width) -> unicode\n\
4113\n\
4114Return S right justified in a Unicode string of length width. Padding is\n\
4115done using spaces.";
4116
4117static PyObject *
4118unicode_rjust(PyUnicodeObject *self, PyObject *args)
4119{
4120 int width;
4121 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4122 return NULL;
4123
4124 if (self->length >= width) {
4125 Py_INCREF(self);
4126 return (PyObject*) self;
4127 }
4128
4129 return (PyObject*) pad(self, width - self->length, 0, ' ');
4130}
4131
4132static char rstrip__doc__[] =
4133"S.rstrip() -> unicode\n\
4134\n\
4135Return a copy of the string S with trailing whitespace removed.";
4136
4137static PyObject *
4138unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4139{
4140 if (!PyArg_NoArgs(args))
4141 return NULL;
4142 return strip(self, 0, 1);
4143}
4144
4145static PyObject*
4146unicode_slice(PyUnicodeObject *self, int start, int end)
4147{
4148 /* standard clamping */
4149 if (start < 0)
4150 start = 0;
4151 if (end < 0)
4152 end = 0;
4153 if (end > self->length)
4154 end = self->length;
4155 if (start == 0 && end == self->length) {
4156 /* full slice, return original string */
4157 Py_INCREF(self);
4158 return (PyObject*) self;
4159 }
4160 if (start > end)
4161 start = end;
4162 /* copy slice */
4163 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4164 end - start);
4165}
4166
4167PyObject *PyUnicode_Split(PyObject *s,
4168 PyObject *sep,
4169 int maxsplit)
4170{
4171 PyObject *result;
4172
4173 s = PyUnicode_FromObject(s);
4174 if (s == NULL)
4175 return NULL;
4176 if (sep != NULL) {
4177 sep = PyUnicode_FromObject(sep);
4178 if (sep == NULL) {
4179 Py_DECREF(s);
4180 return NULL;
4181 }
4182 }
4183
4184 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4185
4186 Py_DECREF(s);
4187 Py_XDECREF(sep);
4188 return result;
4189}
4190
4191static char split__doc__[] =
4192"S.split([sep [,maxsplit]]) -> list of strings\n\
4193\n\
4194Return a list of the words in S, using sep as the\n\
4195delimiter string. If maxsplit is given, at most maxsplit\n\
4196splits are done. If sep is not specified, any whitespace string\n\
4197is a separator.";
4198
4199static PyObject*
4200unicode_split(PyUnicodeObject *self, PyObject *args)
4201{
4202 PyObject *substring = Py_None;
4203 int maxcount = -1;
4204
4205 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4206 return NULL;
4207
4208 if (substring == Py_None)
4209 return split(self, NULL, maxcount);
4210 else if (PyUnicode_Check(substring))
4211 return split(self, (PyUnicodeObject *)substring, maxcount);
4212 else
4213 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4214}
4215
4216static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004217"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218\n\
4219Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004220Line breaks are not included in the resulting list unless keepends\n\
4221is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222
4223static PyObject*
4224unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4225{
Guido van Rossum86662912000-04-11 15:38:46 +00004226 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227
Guido van Rossum86662912000-04-11 15:38:46 +00004228 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 return NULL;
4230
Guido van Rossum86662912000-04-11 15:38:46 +00004231 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232}
4233
4234static
4235PyObject *unicode_str(PyUnicodeObject *self)
4236{
Fred Drakee4315f52000-05-09 19:53:39 +00004237 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238}
4239
4240static char strip__doc__[] =
4241"S.strip() -> unicode\n\
4242\n\
4243Return a copy of S with leading and trailing whitespace removed.";
4244
4245static PyObject *
4246unicode_strip(PyUnicodeObject *self, PyObject *args)
4247{
4248 if (!PyArg_NoArgs(args))
4249 return NULL;
4250 return strip(self, 1, 1);
4251}
4252
4253static char swapcase__doc__[] =
4254"S.swapcase() -> unicode\n\
4255\n\
4256Return a copy of S with uppercase characters converted to lowercase\n\
4257and vice versa.";
4258
4259static PyObject*
4260unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4261{
4262 if (!PyArg_NoArgs(args))
4263 return NULL;
4264 return fixup(self, fixswapcase);
4265}
4266
4267static char translate__doc__[] =
4268"S.translate(table) -> unicode\n\
4269\n\
4270Return a copy of the string S, where all characters have been mapped\n\
4271through the given translation table, which must be a mapping of\n\
4272Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4273are left untouched. Characters mapped to None are deleted.";
4274
4275static PyObject*
4276unicode_translate(PyUnicodeObject *self, PyObject *args)
4277{
4278 PyObject *table;
4279
4280 if (!PyArg_ParseTuple(args, "O:translate", &table))
4281 return NULL;
4282 return PyUnicode_TranslateCharmap(self->str,
4283 self->length,
4284 table,
4285 "ignore");
4286}
4287
4288static char upper__doc__[] =
4289"S.upper() -> unicode\n\
4290\n\
4291Return a copy of S converted to uppercase.";
4292
4293static PyObject*
4294unicode_upper(PyUnicodeObject *self, PyObject *args)
4295{
4296 if (!PyArg_NoArgs(args))
4297 return NULL;
4298 return fixup(self, fixupper);
4299}
4300
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill() (currently compiled out).
   Pads self on the left with '0' up to width characters; a leading
   '+' or '-' of the original string is moved in front of the padding.
   Returns self (new reference) when it is already wide enough, and
   NULL on failure. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* The padded copy is dereferenced below, so bail out if pad()
       could not produce it. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4336
#if 0
/* Debugging helper (currently compiled out): takes no arguments and
   returns the value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4346
4347static char startswith__doc__[] =
4348"S.startswith(prefix[, start[, end]]) -> int\n\
4349\n\
4350Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4351optional start, test S beginning at that position. With optional end, stop\n\
4352comparing S at that position.";
4353
4354static PyObject *
4355unicode_startswith(PyUnicodeObject *self,
4356 PyObject *args)
4357{
4358 PyUnicodeObject *substring;
4359 int start = 0;
4360 int end = INT_MAX;
4361 PyObject *result;
4362
Guido van Rossumb8872e62000-05-09 14:14:27 +00004363 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4364 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 return NULL;
4366 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4367 (PyObject *)substring);
4368 if (substring == NULL)
4369 return NULL;
4370
4371 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4372
4373 Py_DECREF(substring);
4374 return result;
4375}
4376
4377
4378static char endswith__doc__[] =
4379"S.endswith(suffix[, start[, end]]) -> int\n\
4380\n\
4381Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4382optional start, test S beginning at that position. With optional end, stop\n\
4383comparing S at that position.";
4384
4385static PyObject *
4386unicode_endswith(PyUnicodeObject *self,
4387 PyObject *args)
4388{
4389 PyUnicodeObject *substring;
4390 int start = 0;
4391 int end = INT_MAX;
4392 PyObject *result;
4393
Guido van Rossumb8872e62000-05-09 14:14:27 +00004394 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4395 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 return NULL;
4397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4398 (PyObject *)substring);
4399 if (substring == NULL)
4400 return NULL;
4401
4402 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4403
4404 Py_DECREF(substring);
4405 return result;
4406}
4407
4408
4409static PyMethodDef unicode_methods[] = {
4410
4411 /* Order is according to common usage: often used methods should
4412 appear first, since lookup is done sequentially. */
4413
4414 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4415 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4416 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4417 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4418 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4419 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4420 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4421 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4422 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4423 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4424 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4425 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4426 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4427 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4428/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4429 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4430 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4431 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4432 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4433 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4434 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4435 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4436 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4437 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4438 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4439 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4440 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4441 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4442 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4443 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4444 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4445 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4446 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004447 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4448 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449#if 0
4450 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4451 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4452#endif
4453
4454#if 0
4455 /* This one is just used for debugging the implementation. */
4456 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4457#endif
4458
4459 {NULL, NULL}
4460};
4461
4462static PyObject *
4463unicode_getattr(PyUnicodeObject *self, char *name)
4464{
4465 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4466}
4467
4468static PySequenceMethods unicode_as_sequence = {
4469 (inquiry) unicode_length, /* sq_length */
4470 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4471 (intargfunc) unicode_repeat, /* sq_repeat */
4472 (intargfunc) unicode_getitem, /* sq_item */
4473 (intintargfunc) unicode_slice, /* sq_slice */
4474 0, /* sq_ass_item */
4475 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004476 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477};
4478
4479static int
4480unicode_buffer_getreadbuf(PyUnicodeObject *self,
4481 int index,
4482 const void **ptr)
4483{
4484 if (index != 0) {
4485 PyErr_SetString(PyExc_SystemError,
4486 "accessing non-existent unicode segment");
4487 return -1;
4488 }
4489 *ptr = (void *) self->str;
4490 return PyUnicode_GET_DATA_SIZE(self);
4491}
4492
4493static int
4494unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4495 const void **ptr)
4496{
4497 PyErr_SetString(PyExc_TypeError,
4498 "cannot use unicode as modifyable buffer");
4499 return -1;
4500}
4501
4502static int
4503unicode_buffer_getsegcount(PyUnicodeObject *self,
4504 int *lenp)
4505{
4506 if (lenp)
4507 *lenp = PyUnicode_GET_DATA_SIZE(self);
4508 return 1;
4509}
4510
4511static int
4512unicode_buffer_getcharbuf(PyUnicodeObject *self,
4513 int index,
4514 const void **ptr)
4515{
4516 PyObject *str;
4517
4518 if (index != 0) {
4519 PyErr_SetString(PyExc_SystemError,
4520 "accessing non-existent unicode segment");
4521 return -1;
4522 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004523 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 if (str == NULL)
4525 return -1;
4526 *ptr = (void *) PyString_AS_STRING(str);
4527 return PyString_GET_SIZE(str);
4528}
4529
4530/* Helpers for PyUnicode_Format() */
4531
4532static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004533getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534{
4535 int argidx = *p_argidx;
4536 if (argidx < arglen) {
4537 (*p_argidx)++;
4538 if (arglen < 0)
4539 return args;
4540 else
4541 return PyTuple_GetItem(args, argidx);
4542 }
4543 PyErr_SetString(PyExc_TypeError,
4544 "not enough arguments for format string");
4545 return NULL;
4546}
4547
/* Bit flags collected while parsing a format specifier; one bit per
   flag character ('-', '+', ' ', '#', '0'). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4553
4554static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556{
4557 register int i;
4558 int len;
4559 va_list va;
4560 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562
4563 /* First, format the string as char array, then expand to Py_UNICODE
4564 array. */
4565 charbuffer = (char *)buffer;
4566 len = vsprintf(charbuffer, format, va);
4567 for (i = len - 1; i >= 0; i--)
4568 buffer[i] = (Py_UNICODE) charbuffer[i];
4569
4570 va_end(va);
4571 return len;
4572}
4573
4574static int
4575formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004576 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 int flags,
4578 int prec,
4579 int type,
4580 PyObject *v)
4581{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004582 /* fmt = '%#.' + `prec` + `type`
4583 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 char fmt[20];
4585 double x;
4586
4587 x = PyFloat_AsDouble(v);
4588 if (x == -1.0 && PyErr_Occurred())
4589 return -1;
4590 if (prec < 0)
4591 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4593 type = 'g';
4594 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004595 /* worst case length calc to ensure no buffer overrun:
4596 fmt = %#.<prec>g
4597 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4598 for any double rep.)
4599 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4600 If prec=0 the effective precision is 1 (the leading digit is
4601 always given), therefore increase by one to 10+prec. */
4602 if (buflen <= (size_t)10 + (size_t)prec) {
4603 PyErr_SetString(PyExc_OverflowError,
4604 "formatted float is too long (precision too long?)");
4605 return -1;
4606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 return usprintf(buf, fmt, x);
4608}
4609
4610static int
4611formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004612 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 int flags,
4614 int prec,
4615 int type,
4616 PyObject *v)
4617{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004618 /* fmt = '%#.' + `prec` + 'l' + `type`
4619 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 char fmt[20];
4621 long x;
4622
4623 x = PyInt_AsLong(v);
4624 if (x == -1 && PyErr_Occurred())
4625 return -1;
4626 if (prec < 0)
4627 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004628 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4629 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4630 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4631 PyErr_SetString(PyExc_OverflowError,
4632 "formatted integer is too long (precision too long?)");
4633 return -1;
4634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4636 return usprintf(buf, fmt, x);
4637}
4638
4639static int
4640formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004641 size_t buflen,
4642 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004644 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004645 if (PyUnicode_Check(v)) {
4646 if (PyUnicode_GET_SIZE(v) != 1)
4647 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004651 else if (PyString_Check(v)) {
4652 if (PyString_GET_SIZE(v) != 1)
4653 goto onError;
4654 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656
4657 else {
4658 /* Integer input truncated to a character */
4659 long x;
4660 x = PyInt_AsLong(v);
4661 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 buf[0] = (char) x;
4664 }
4665 buf[1] = '\0';
4666 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004667
4668 onError:
4669 PyErr_SetString(PyExc_TypeError,
4670 "%c requires int or char");
4671 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672}
4673
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in elements) of the buffer in which the
   floats, ints, & chars are formatted.  XXX This is a magic number.  Each
   formatting routine does bounds checking to ensure no overflow, but a
   better solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
4683
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684PyObject *PyUnicode_Format(PyObject *format,
4685 PyObject *args)
4686{
4687 Py_UNICODE *fmt, *res;
4688 int fmtcnt, rescnt, reslen, arglen, argidx;
4689 int args_owned = 0;
4690 PyUnicodeObject *result = NULL;
4691 PyObject *dict = NULL;
4692 PyObject *uformat;
4693
4694 if (format == NULL || args == NULL) {
4695 PyErr_BadInternalCall();
4696 return NULL;
4697 }
4698 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004699 if (uformat == NULL)
4700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 fmt = PyUnicode_AS_UNICODE(uformat);
4702 fmtcnt = PyUnicode_GET_SIZE(uformat);
4703
4704 reslen = rescnt = fmtcnt + 100;
4705 result = _PyUnicode_New(reslen);
4706 if (result == NULL)
4707 goto onError;
4708 res = PyUnicode_AS_UNICODE(result);
4709
4710 if (PyTuple_Check(args)) {
4711 arglen = PyTuple_Size(args);
4712 argidx = 0;
4713 }
4714 else {
4715 arglen = -1;
4716 argidx = -2;
4717 }
4718 if (args->ob_type->tp_as_mapping)
4719 dict = args;
4720
4721 while (--fmtcnt >= 0) {
4722 if (*fmt != '%') {
4723 if (--rescnt < 0) {
4724 rescnt = fmtcnt + 100;
4725 reslen += rescnt;
4726 if (_PyUnicode_Resize(result, reslen) < 0)
4727 return NULL;
4728 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4729 --rescnt;
4730 }
4731 *res++ = *fmt++;
4732 }
4733 else {
4734 /* Got a format specifier */
4735 int flags = 0;
4736 int width = -1;
4737 int prec = -1;
4738 int size = 0;
4739 Py_UNICODE c = '\0';
4740 Py_UNICODE fill;
4741 PyObject *v = NULL;
4742 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004743 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 Py_UNICODE sign;
4745 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004746 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748 fmt++;
4749 if (*fmt == '(') {
4750 Py_UNICODE *keystart;
4751 int keylen;
4752 PyObject *key;
4753 int pcount = 1;
4754
4755 if (dict == NULL) {
4756 PyErr_SetString(PyExc_TypeError,
4757 "format requires a mapping");
4758 goto onError;
4759 }
4760 ++fmt;
4761 --fmtcnt;
4762 keystart = fmt;
4763 /* Skip over balanced parentheses */
4764 while (pcount > 0 && --fmtcnt >= 0) {
4765 if (*fmt == ')')
4766 --pcount;
4767 else if (*fmt == '(')
4768 ++pcount;
4769 fmt++;
4770 }
4771 keylen = fmt - keystart - 1;
4772 if (fmtcnt < 0 || pcount > 0) {
4773 PyErr_SetString(PyExc_ValueError,
4774 "incomplete format key");
4775 goto onError;
4776 }
Fred Drakee4315f52000-05-09 19:53:39 +00004777 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 then looked up since Python uses strings to hold
4779 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004780 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 key = PyUnicode_EncodeUTF8(keystart,
4782 keylen,
4783 NULL);
4784 if (key == NULL)
4785 goto onError;
4786 if (args_owned) {
4787 Py_DECREF(args);
4788 args_owned = 0;
4789 }
4790 args = PyObject_GetItem(dict, key);
4791 Py_DECREF(key);
4792 if (args == NULL) {
4793 goto onError;
4794 }
4795 args_owned = 1;
4796 arglen = -1;
4797 argidx = -2;
4798 }
4799 while (--fmtcnt >= 0) {
4800 switch (c = *fmt++) {
4801 case '-': flags |= F_LJUST; continue;
4802 case '+': flags |= F_SIGN; continue;
4803 case ' ': flags |= F_BLANK; continue;
4804 case '#': flags |= F_ALT; continue;
4805 case '0': flags |= F_ZERO; continue;
4806 }
4807 break;
4808 }
4809 if (c == '*') {
4810 v = getnextarg(args, arglen, &argidx);
4811 if (v == NULL)
4812 goto onError;
4813 if (!PyInt_Check(v)) {
4814 PyErr_SetString(PyExc_TypeError,
4815 "* wants int");
4816 goto onError;
4817 }
4818 width = PyInt_AsLong(v);
4819 if (width < 0) {
4820 flags |= F_LJUST;
4821 width = -width;
4822 }
4823 if (--fmtcnt >= 0)
4824 c = *fmt++;
4825 }
4826 else if (c >= '0' && c <= '9') {
4827 width = c - '0';
4828 while (--fmtcnt >= 0) {
4829 c = *fmt++;
4830 if (c < '0' || c > '9')
4831 break;
4832 if ((width*10) / 10 != width) {
4833 PyErr_SetString(PyExc_ValueError,
4834 "width too big");
4835 goto onError;
4836 }
4837 width = width*10 + (c - '0');
4838 }
4839 }
4840 if (c == '.') {
4841 prec = 0;
4842 if (--fmtcnt >= 0)
4843 c = *fmt++;
4844 if (c == '*') {
4845 v = getnextarg(args, arglen, &argidx);
4846 if (v == NULL)
4847 goto onError;
4848 if (!PyInt_Check(v)) {
4849 PyErr_SetString(PyExc_TypeError,
4850 "* wants int");
4851 goto onError;
4852 }
4853 prec = PyInt_AsLong(v);
4854 if (prec < 0)
4855 prec = 0;
4856 if (--fmtcnt >= 0)
4857 c = *fmt++;
4858 }
4859 else if (c >= '0' && c <= '9') {
4860 prec = c - '0';
4861 while (--fmtcnt >= 0) {
4862 c = Py_CHARMASK(*fmt++);
4863 if (c < '0' || c > '9')
4864 break;
4865 if ((prec*10) / 10 != prec) {
4866 PyErr_SetString(PyExc_ValueError,
4867 "prec too big");
4868 goto onError;
4869 }
4870 prec = prec*10 + (c - '0');
4871 }
4872 }
4873 } /* prec */
4874 if (fmtcnt >= 0) {
4875 if (c == 'h' || c == 'l' || c == 'L') {
4876 size = c;
4877 if (--fmtcnt >= 0)
4878 c = *fmt++;
4879 }
4880 }
4881 if (fmtcnt < 0) {
4882 PyErr_SetString(PyExc_ValueError,
4883 "incomplete format");
4884 goto onError;
4885 }
4886 if (c != '%') {
4887 v = getnextarg(args, arglen, &argidx);
4888 if (v == NULL)
4889 goto onError;
4890 }
4891 sign = 0;
4892 fill = ' ';
4893 switch (c) {
4894
4895 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004896 pbuf = formatbuf;
4897 /* presume that buffer length is at least 1 */
4898 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 len = 1;
4900 break;
4901
4902 case 's':
4903 case 'r':
4904 if (PyUnicode_Check(v) && c == 's') {
4905 temp = v;
4906 Py_INCREF(temp);
4907 }
4908 else {
4909 PyObject *unicode;
4910 if (c == 's')
4911 temp = PyObject_Str(v);
4912 else
4913 temp = PyObject_Repr(v);
4914 if (temp == NULL)
4915 goto onError;
4916 if (!PyString_Check(temp)) {
4917 /* XXX Note: this should never happen, since
4918 PyObject_Repr() and PyObject_Str() assure
4919 this */
4920 Py_DECREF(temp);
4921 PyErr_SetString(PyExc_TypeError,
4922 "%s argument has non-string str()");
4923 goto onError;
4924 }
Fred Drakee4315f52000-05-09 19:53:39 +00004925 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004927 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 "strict");
4929 Py_DECREF(temp);
4930 temp = unicode;
4931 if (temp == NULL)
4932 goto onError;
4933 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004934 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 len = PyUnicode_GET_SIZE(temp);
4936 if (prec >= 0 && len > prec)
4937 len = prec;
4938 break;
4939
4940 case 'i':
4941 case 'd':
4942 case 'u':
4943 case 'o':
4944 case 'x':
4945 case 'X':
4946 if (c == 'i')
4947 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004948 pbuf = formatbuf;
4949 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4950 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951 if (len < 0)
4952 goto onError;
4953 sign = (c == 'd');
4954 if (flags & F_ZERO) {
4955 fill = '0';
4956 if ((flags&F_ALT) &&
4957 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004958 pbuf[0] == '0' && pbuf[1] == c) {
4959 *res++ = *pbuf++;
4960 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 rescnt -= 2;
4962 len -= 2;
4963 width -= 2;
4964 if (width < 0)
4965 width = 0;
4966 }
4967 }
4968 break;
4969
4970 case 'e':
4971 case 'E':
4972 case 'f':
4973 case 'g':
4974 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004975 pbuf = formatbuf;
4976 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4977 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 if (len < 0)
4979 goto onError;
4980 sign = 1;
4981 if (flags&F_ZERO)
4982 fill = '0';
4983 break;
4984
4985 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004986 pbuf = formatbuf;
4987 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 if (len < 0)
4989 goto onError;
4990 break;
4991
4992 default:
4993 PyErr_Format(PyExc_ValueError,
4994 "unsupported format character '%c' (0x%x)",
4995 c, c);
4996 goto onError;
4997 }
4998 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004999 if (*pbuf == '-' || *pbuf == '+') {
5000 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 len--;
5002 }
5003 else if (flags & F_SIGN)
5004 sign = '+';
5005 else if (flags & F_BLANK)
5006 sign = ' ';
5007 else
5008 sign = 0;
5009 }
5010 if (width < len)
5011 width = len;
5012 if (rescnt < width + (sign != 0)) {
5013 reslen -= rescnt;
5014 rescnt = width + fmtcnt + 100;
5015 reslen += rescnt;
5016 if (_PyUnicode_Resize(result, reslen) < 0)
5017 return NULL;
5018 res = PyUnicode_AS_UNICODE(result)
5019 + reslen - rescnt;
5020 }
5021 if (sign) {
5022 if (fill != ' ')
5023 *res++ = sign;
5024 rescnt--;
5025 if (width > len)
5026 width--;
5027 }
5028 if (width > len && !(flags & F_LJUST)) {
5029 do {
5030 --rescnt;
5031 *res++ = fill;
5032 } while (--width > len);
5033 }
5034 if (sign && fill == ' ')
5035 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005036 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 res += len;
5038 rescnt -= len;
5039 while (--width >= len) {
5040 --rescnt;
5041 *res++ = ' ';
5042 }
5043 if (dict && (argidx < arglen) && c != '%') {
5044 PyErr_SetString(PyExc_TypeError,
5045 "not all arguments converted");
5046 goto onError;
5047 }
5048 Py_XDECREF(temp);
5049 } /* '%' */
5050 } /* until end */
5051 if (argidx < arglen && !dict) {
5052 PyErr_SetString(PyExc_TypeError,
5053 "not all arguments converted");
5054 goto onError;
5055 }
5056
5057 if (args_owned) {
5058 Py_DECREF(args);
5059 }
5060 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005061 if (_PyUnicode_Resize(result, reslen - rescnt))
5062 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 return (PyObject *)result;
5064
5065 onError:
5066 Py_XDECREF(result);
5067 Py_DECREF(uformat);
5068 if (args_owned) {
5069 Py_DECREF(args);
5070 }
5071 return NULL;
5072}
5073
5074static PyBufferProcs unicode_as_buffer = {
5075 (getreadbufferproc) unicode_buffer_getreadbuf,
5076 (getwritebufferproc) unicode_buffer_getwritebuf,
5077 (getsegcountproc) unicode_buffer_getsegcount,
5078 (getcharbufferproc) unicode_buffer_getcharbuf,
5079};
5080
5081PyTypeObject PyUnicode_Type = {
5082 PyObject_HEAD_INIT(&PyType_Type)
5083 0, /* ob_size */
5084 "unicode", /* tp_name */
5085 sizeof(PyUnicodeObject), /* tp_size */
5086 0, /* tp_itemsize */
5087 /* Slots */
5088 (destructor)_PyUnicode_Free, /* tp_dealloc */
5089 0, /* tp_print */
5090 (getattrfunc)unicode_getattr, /* tp_getattr */
5091 0, /* tp_setattr */
5092 (cmpfunc) unicode_compare, /* tp_compare */
5093 (reprfunc) unicode_repr, /* tp_repr */
5094 0, /* tp_as_number */
5095 &unicode_as_sequence, /* tp_as_sequence */
5096 0, /* tp_as_mapping */
5097 (hashfunc) unicode_hash, /* tp_hash*/
5098 0, /* tp_call*/
5099 (reprfunc) unicode_str, /* tp_str */
5100 (getattrofunc) NULL, /* tp_getattro */
5101 (setattrofunc) NULL, /* tp_setattro */
5102 &unicode_as_buffer, /* tp_as_buffer */
5103 Py_TPFLAGS_DEFAULT, /* tp_flags */
5104};
5105
5106/* Initialize the Unicode implementation */
5107
Thomas Wouters78890102000-07-22 19:25:51 +00005108void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109{
5110 /* Doublecheck the configuration... */
5111 if (sizeof(Py_UNICODE) != 2)
5112 Py_FatalError("Unicode configuration error: "
5113 "sizeof(Py_UNICODE) != 2 bytes");
5114
Fred Drakee4315f52000-05-09 19:53:39 +00005115 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005116 unicode_freelist = NULL;
5117 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005119 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120}
5121
5122/* Finalize the Unicode implementation */
5123
5124void
Thomas Wouters78890102000-07-22 19:25:51 +00005125_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126{
5127 PyUnicodeObject *u = unicode_freelist;
5128
5129 while (u != NULL) {
5130 PyUnicodeObject *v = u;
5131 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005132 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005133 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005134 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005135 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005137 unicode_freelist = NULL;
5138 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005140 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141}