blob: df8592d55e89b9ab43a0ee615ada345cfb2e4253 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of released Unicode objects that are kept
   on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object that is put on the free list keeps its character buffer
   as long as its length is below this limit; longer buffers are
   released right away.  This saves malloc() round trips for small
   Unicode objects.

   At worst this results in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is always terminated by a 0 character -- XXX is this
   needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below a size of 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
/* Internal shorthand, for use in unicodeobject.c only !  Casts the
   address of a Unicode object pointer variable to the PyObject **
   that PyUnicode_Resize() expects. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000401 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000405 int reclevel;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406
407 if (obj == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000411
412 /* Coerce object */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000413 for (reclevel = 0; reclevel < 2; reclevel++) {
414
415 if (PyUnicode_Check(obj)) {
416 if (encoding) {
417 PyErr_SetString(PyExc_TypeError,
418 "decoding Unicode is not supported");
419 goto onError;
420 }
421 if (PyUnicode_CheckExact(obj)) {
422 Py_INCREF(obj);
423 v = obj;
424 }
425 else {
426 /* For a subclass of unicode, return a true unicode object
427 with the same string value. */
428 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
429 PyUnicode_GET_SIZE(obj));
430 }
431 goto done;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000433 else if (PyString_Check(obj)) {
434 s = PyString_AS_STRING(obj);
435 len = PyString_GET_SIZE(obj);
436 break;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000437 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000438 else {
439 PyObject *w;
440
441 /* Try char buffer interface */
442 if (PyObject_AsCharBuffer(obj, &s, &len))
443 PyErr_Clear();
444 else
445 break;
446
447 /* Mimic the behaviour of str(object) if everything else
448 fails (see PyObject_Str()); this also covers instances
449 which implement __str__. */
450 if (obj->ob_type->tp_str == NULL)
451 w = PyObject_Repr(obj);
452 else
453 w = (*obj->ob_type->tp_str)(obj);
454 if (w == NULL)
455 goto onError;
456 if (owned) {
457 Py_DECREF(obj);
458 }
459 obj = w;
460 owned = 1;
Tim Peters78e0fc72001-09-11 03:07:38 +0000461 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000464 if (s == NULL) {
465 PyErr_Format(PyExc_TypeError,
466 "coercing to Unicode: __str__ recursion limit exceeded "
467 "(last type: %.80s)",
468 obj->ob_type->tp_name);
469 goto onError;
470 }
471
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000472 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473 if (len == 0) {
474 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 else
478 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000479
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000480 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000481 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000482 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000483 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000484 return v;
485
486 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000487 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000488 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000489 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491}
492
493PyObject *PyUnicode_Decode(const char *s,
494 int size,
495 const char *encoding,
496 const char *errors)
497{
498 PyObject *buffer = NULL, *unicode;
499
Fred Drakee4315f52000-05-09 19:53:39 +0000500 if (encoding == NULL)
501 encoding = PyUnicode_GetDefaultEncoding();
502
503 /* Shortcuts for common default encodings */
504 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000506 else if (strcmp(encoding, "latin-1") == 0)
507 return PyUnicode_DecodeLatin1(s, size, errors);
508 else if (strcmp(encoding, "ascii") == 0)
509 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 /* Decode via the codec registry */
512 buffer = PyBuffer_FromMemory((void *)s, size);
513 if (buffer == NULL)
514 goto onError;
515 unicode = PyCodec_Decode(buffer, encoding, errors);
516 if (unicode == NULL)
517 goto onError;
518 if (!PyUnicode_Check(unicode)) {
519 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000520 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 unicode->ob_type->tp_name);
522 Py_DECREF(unicode);
523 goto onError;
524 }
525 Py_DECREF(buffer);
526 return unicode;
527
528 onError:
529 Py_XDECREF(buffer);
530 return NULL;
531}
532
533PyObject *PyUnicode_Encode(const Py_UNICODE *s,
534 int size,
535 const char *encoding,
536 const char *errors)
537{
538 PyObject *v, *unicode;
539
540 unicode = PyUnicode_FromUnicode(s, size);
541 if (unicode == NULL)
542 return NULL;
543 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
544 Py_DECREF(unicode);
545 return v;
546}
547
548PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
549 const char *encoding,
550 const char *errors)
551{
552 PyObject *v;
553
554 if (!PyUnicode_Check(unicode)) {
555 PyErr_BadArgument();
556 goto onError;
557 }
Fred Drakee4315f52000-05-09 19:53:39 +0000558
559 if (encoding == NULL)
560 encoding = PyUnicode_GetDefaultEncoding();
561
562 /* Shortcuts for common default encodings */
563 if (errors == NULL) {
564 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000565 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000566 else if (strcmp(encoding, "latin-1") == 0)
567 return PyUnicode_AsLatin1String(unicode);
568 else if (strcmp(encoding, "ascii") == 0)
569 return PyUnicode_AsASCIIString(unicode);
570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571
572 /* Encode via the codec registry */
573 v = PyCodec_Encode(unicode, encoding, errors);
574 if (v == NULL)
575 goto onError;
576 /* XXX Should we really enforce this ? */
577 if (!PyString_Check(v)) {
578 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000579 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 v->ob_type->tp_name);
581 Py_DECREF(v);
582 goto onError;
583 }
584 return v;
585
586 onError:
587 return NULL;
588}
589
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000590PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
591 const char *errors)
592{
593 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
594
595 if (v)
596 return v;
597 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
598 if (v && errors == NULL)
599 ((PyUnicodeObject *)unicode)->defenc = v;
600 return v;
601}
602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
604{
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609 return PyUnicode_AS_UNICODE(unicode);
610
611 onError:
612 return NULL;
613}
614
615int PyUnicode_GetSize(PyObject *unicode)
616{
617 if (!PyUnicode_Check(unicode)) {
618 PyErr_BadArgument();
619 goto onError;
620 }
621 return PyUnicode_GET_SIZE(unicode);
622
623 onError:
624 return -1;
625}
626
Thomas Wouters78890102000-07-22 19:25:51 +0000627const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000628{
629 return unicode_default_encoding;
630}
631
632int PyUnicode_SetDefaultEncoding(const char *encoding)
633{
634 PyObject *v;
635
636 /* Make sure the encoding is valid. As side effect, this also
637 loads the encoding into the codec registry cache. */
638 v = _PyCodec_Lookup(encoding);
639 if (v == NULL)
640 goto onError;
641 Py_DECREF(v);
642 strncpy(unicode_default_encoding,
643 encoding,
644 sizeof(unicode_default_encoding));
645 return 0;
646
647 onError:
648 return -1;
649}
650
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000651/* --- UTF-7 Codec -------------------------------------------------------- */
652
653/* see RFC2152 for details */
654
/* Classification of the 128 ASCII characters for UTF-7: does the
   character need to be encoded, i.e. can it not be written directly ?

      0 - not special
      1 - special
      2 - whitespace (optional)
      3 - RFC2152 Set O (optional)
*/
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
673
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be
   written directly in UTF-7.  Everything above 127 and every
   character of class 1 in utf7_special is special; whitespace
   (class 2) and RFC2152 Set O (class 3) only when the matching flag
   is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 digit for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.

   The test is spelled out as ASCII range checks instead of calling
   isalnum(): c may be any Py_UNICODE value, isalnum() is undefined
   for arguments outside the unsigned char range, and its answer
   depends on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): the 6-bit value of base64 digit c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): write base64 digits to out for as long as
   at least 6 of the `bits` pending bits in ch are left; updates out
   and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): write 16-bit characters to out
   for as long as at least 16 of the `bits` pending bits in ch are
   left; updates out, bits and surrogate.  A value in the range
   0xDC00..0xDFFF sets errmsg and jumps to the label utf7Error, and
   the character that follows it is dropped silently; both names must
   exist at the point of use.

   NOTE(review): the inner comments talk about the high surrogate but
   the range tested is the low-surrogate range -- confirm the intent
   before changing it. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
708
709static
710int utf7_decoding_error(Py_UNICODE **dest,
711 const char *errors,
712 const char *details)
713{
714 if ((errors == NULL) ||
715 (strcmp(errors,"strict") == 0)) {
716 PyErr_Format(PyExc_UnicodeError,
717 "UTF-7 decoding error: %.400s",
718 details);
719 return -1;
720 }
721 else if (strcmp(errors,"ignore") == 0) {
722 return 0;
723 }
724 else if (strcmp(errors,"replace") == 0) {
725 if (dest != NULL) {
726 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
727 (*dest)++;
728 }
729 return 0;
730 }
731 else {
732 PyErr_Format(PyExc_ValueError,
733 "UTF-7 decoding error; unknown error handling code: %.400s",
734 errors);
735 return -1;
736 }
737}
738
739PyObject *PyUnicode_DecodeUTF7(const char *s,
740 int size,
741 const char *errors)
742{
743 const char *e;
744 PyUnicodeObject *unicode;
745 Py_UNICODE *p;
746 const char *errmsg = "";
747 int inShift = 0;
748 unsigned int bitsleft = 0;
749 unsigned long charsleft = 0;
750 int surrogate = 0;
751
752 unicode = _PyUnicode_New(size);
753 if (!unicode)
754 return NULL;
755 if (size == 0)
756 return (PyObject *)unicode;
757
758 p = unicode->str;
759 e = s + size;
760
761 while (s < e) {
762 Py_UNICODE ch = *s;
763
764 if (inShift) {
765 if ((ch == '-') || !B64CHAR(ch)) {
766 inShift = 0;
767 s++;
768
769 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
770 if (bitsleft >= 6) {
771 /* The shift sequence has a partial character in it. If
772 bitsleft < 6 then we could just classify it as padding
773 but that is not the case here */
774
775 errmsg = "partial character in shift sequence";
776 goto utf7Error;
777 }
778 /* According to RFC2152 the remaining bits should be zero. We
779 choose to signal an error/insert a replacement character
780 here so indicate the potential of a misencoded character. */
781
782 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
783 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
784 errmsg = "non-zero padding bits in shift sequence";
785 goto utf7Error;
786 }
787
788 if (ch == '-') {
789 if ((s < e) && (*(s) == '-')) {
790 *p++ = '-';
791 inShift = 1;
792 }
793 } else if (SPECIAL(ch,0,0)) {
794 errmsg = "unexpected special character";
795 goto utf7Error;
796 } else {
797 *p++ = ch;
798 }
799 } else {
800 charsleft = (charsleft << 6) | UB64(ch);
801 bitsleft += 6;
802 s++;
803 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
804 }
805 }
806 else if ( ch == '+' ) {
807 s++;
808 if (s < e && *s == '-') {
809 s++;
810 *p++ = '+';
811 } else
812 {
813 inShift = 1;
814 bitsleft = 0;
815 }
816 }
817 else if (SPECIAL(ch,0,0)) {
818 errmsg = "unexpected special character";
819 s++;
820 goto utf7Error;
821 }
822 else {
823 *p++ = ch;
824 s++;
825 }
826 continue;
827 utf7Error:
828 if (utf7_decoding_error(&p, errors, errmsg))
829 goto onError;
830 }
831
832 if (inShift) {
833 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
834 goto onError;
835 }
836
837 if (_PyUnicode_Resize(&unicode, p - unicode->str))
838 goto onError;
839
840 return (PyObject *)unicode;
841
842onError:
843 Py_DECREF(unicode);
844 return NULL;
845}
846
847
848PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
849 int size,
850 int encodeSetO,
851 int encodeWhiteSpace,
852 const char *errors)
853{
854 PyObject *v;
855 /* It might be possible to tighten this worst case */
856 unsigned int cbAllocated = 5 * size;
857 int inShift = 0;
858 int i = 0;
859 unsigned int bitsleft = 0;
860 unsigned long charsleft = 0;
861 char * out;
862 char * start;
863
864 if (size == 0)
865 return PyString_FromStringAndSize(NULL, 0);
866
867 v = PyString_FromStringAndSize(NULL, cbAllocated);
868 if (v == NULL)
869 return NULL;
870
871 start = out = PyString_AS_STRING(v);
872 for (;i < size; ++i) {
873 Py_UNICODE ch = s[i];
874
875 if (!inShift) {
876 if (ch == '+') {
877 *out++ = '+';
878 *out++ = '-';
879 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
880 charsleft = ch;
881 bitsleft = 16;
882 *out++ = '+';
883 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
884 inShift = bitsleft > 0;
885 } else {
886 *out++ = (char) ch;
887 }
888 } else {
889 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
890 *out++ = B64(charsleft << (6-bitsleft));
891 charsleft = 0;
892 bitsleft = 0;
893 /* Characters not in the BASE64 set implicitly unshift the sequence
894 so no '-' is required, except if the character is itself a '-' */
895 if (B64CHAR(ch) || ch == '-') {
896 *out++ = '-';
897 }
898 inShift = 0;
899 *out++ = (char) ch;
900 } else {
901 bitsleft += 16;
902 charsleft = (charsleft << 16) | ch;
903 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
904
905 /* If the next character is special then we dont' need to terminate
906 the shift sequence. If the next character is not a BASE64 character
907 or '-' then the shift sequence will be terminated implicitly and we
908 don't have to insert a '-'. */
909
910 if (bitsleft == 0) {
911 if (i + 1 < size) {
912 Py_UNICODE ch2 = s[i+1];
913
914 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
915
916 } else if (B64CHAR(ch2) || ch2 == '-') {
917 *out++ = '-';
918 inShift = 0;
919 } else {
920 inShift = 0;
921 }
922
923 }
924 else {
925 *out++ = '-';
926 inShift = 0;
927 }
928 }
929 }
930 }
931 }
932 if (bitsleft) {
933 *out++= B64(charsleft << (6-bitsleft) );
934 *out++ = '-';
935 }
936
937 if (_PyString_Resize(&v, out - start)) {
938 Py_DECREF(v);
939 return NULL;
940 }
941 return v;
942}
943
944#undef SPECIAL
945#undef B64
946#undef B64CHAR
947#undef UB64
948#undef ENCODE
949#undef DECODE
950
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951/* --- UTF-8 Codec -------------------------------------------------------- */
952
/* Length in bytes of the UTF-8 sequence introduced by each possible
   first byte; 0 marks a byte that is not a legal prefix.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not legal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are not legal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
974
975static
976int utf8_decoding_error(const char **source,
977 Py_UNICODE **dest,
978 const char *errors,
979 const char *details)
980{
981 if ((errors == NULL) ||
982 (strcmp(errors,"strict") == 0)) {
983 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000984 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000985 details);
986 return -1;
987 }
988 else if (strcmp(errors,"ignore") == 0) {
989 (*source)++;
990 return 0;
991 }
992 else if (strcmp(errors,"replace") == 0) {
993 (*source)++;
994 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
995 (*dest)++;
996 return 0;
997 }
998 else {
999 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001000 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001 errors);
1002 return -1;
1003 }
1004}
1005
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006PyObject *PyUnicode_DecodeUTF8(const char *s,
1007 int size,
1008 const char *errors)
1009{
1010 int n;
1011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001014 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015
1016 /* Note: size will always be longer than the resulting Unicode
1017 character count */
1018 unicode = _PyUnicode_New(size);
1019 if (!unicode)
1020 return NULL;
1021 if (size == 0)
1022 return (PyObject *)unicode;
1023
1024 /* Unpack UTF-8 encoded data */
1025 p = unicode->str;
1026 e = s + size;
1027
1028 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001029 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030
1031 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001032 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 s++;
1034 continue;
1035 }
1036
1037 n = utf8_code_length[ch];
1038
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001039 if (s + n > e) {
1040 errmsg = "unexpected end of data";
1041 goto utf8Error;
1042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 switch (n) {
1045
1046 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001047 errmsg = "unexpected code byte";
1048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049
1050 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001051 errmsg = "internal error";
1052 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
1054 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 if ((s[1] & 0xc0) != 0x80) {
1056 errmsg = "invalid data";
1057 goto utf8Error;
1058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 if (ch < 0x80) {
1061 errmsg = "illegal encoding";
1062 goto utf8Error;
1063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001065 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066 break;
1067
1068 case 3:
1069 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001070 (s[2] & 0xc0) != 0x80) {
1071 errmsg = "invalid data";
1072 goto utf8Error;
1073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001075 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001080 *p++ = (Py_UNICODE)ch;
1081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference. */
#if 0
/* Apply the error handling policy named by errors to a UTF-8
   encoding problem described by details.  Returns 0 when encoding
   may continue and -1 with an exception set otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1178{
1179 PyObject *v;
1180 char *p;
1181 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 Py_UCS4 ch2;
1183 unsigned int cbAllocated = 3 * size;
1184 unsigned int cbWritten = 0;
1185 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001187 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 if (v == NULL)
1189 return NULL;
1190 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001191 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192
1193 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001194 while (i < size) {
1195 Py_UCS4 ch = s[i++];
1196 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001198 cbWritten++;
1199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 else if (ch < 0x0800) {
1201 *p++ = 0xc0 | (ch >> 6);
1202 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001203 cbWritten += 2;
1204 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001205 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001206 /* Check for high surrogate */
1207 if (0xD800 <= ch && ch <= 0xDBFF) {
1208 if (i != size) {
1209 ch2 = s[i];
1210 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1211
1212 if (cbWritten >= (cbAllocated - 4)) {
1213 /* Provide enough room for some more
1214 surrogates */
1215 cbAllocated += 4*10;
1216 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001218 }
1219
1220 /* combine the two values */
1221 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1222
1223 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001224 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001225 i++;
1226 cbWritten += 4;
1227 }
1228 }
1229 }
1230 else {
1231 *p++ = (char)(0xe0 | (ch >> 12));
1232 cbWritten += 3;
1233 }
1234 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1235 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001236 } else {
1237 *p++ = 0xf0 | (ch>>18);
1238 *p++ = 0x80 | ((ch>>12) & 0x3f);
1239 *p++ = 0x80 | ((ch>>6) & 0x3f);
1240 *p++ = 0x80 | (ch & 0x3f);
1241 cbWritten += 4;
1242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001245 if (_PyString_Resize(&v, p - q))
1246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return v;
1248
1249 onError:
1250 Py_DECREF(v);
1251 return NULL;
1252}
1253
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1255{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 return NULL;
1259 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001260 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1261 PyUnicode_GET_SIZE(unicode),
1262 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263}
1264
1265/* --- UTF-16 Codec ------------------------------------------------------- */
1266
1267static
Tim Peters772747b2001-08-09 22:21:55 +00001268int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 const char *errors,
1270 const char *details)
1271{
1272 if ((errors == NULL) ||
1273 (strcmp(errors,"strict") == 0)) {
1274 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001275 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 details);
1277 return -1;
1278 }
1279 else if (strcmp(errors,"ignore") == 0) {
1280 return 0;
1281 }
1282 else if (strcmp(errors,"replace") == 0) {
1283 if (dest) {
1284 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1285 (*dest)++;
1286 }
1287 return 0;
1288 }
1289 else {
1290 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001291 "UTF-16 decoding error; "
1292 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 errors);
1294 return -1;
1295 }
1296}
1297
Tim Peters772747b2001-08-09 22:21:55 +00001298PyObject *
1299PyUnicode_DecodeUTF16(const char *s,
1300 int size,
1301 const char *errors,
1302 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
1304 PyUnicodeObject *unicode;
1305 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001306 const unsigned char *q, *e;
1307 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001309 /* Offsets from q for retrieving byte pairs in the right order. */
1310#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1311 int ihi = 1, ilo = 0;
1312#else
1313 int ihi = 0, ilo = 1;
1314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315
1316 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001317 if (size & 1) {
1318 if (utf16_decoding_error(NULL, errors, "truncated data"))
1319 return NULL;
1320 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
1322
1323 /* Note: size will always be longer than the resulting Unicode
1324 character count */
1325 unicode = _PyUnicode_New(size);
1326 if (!unicode)
1327 return NULL;
1328 if (size == 0)
1329 return (PyObject *)unicode;
1330
1331 /* Unpack UTF-16 encoded data */
1332 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001333 q = (unsigned char *)s;
1334 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335
1336 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001337 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001339 /* Check for BOM marks (U+FEFF) in the input and adjust current
1340 byte order setting accordingly. In native mode, the leading BOM
1341 mark is skipped, in all other modes, it is copied to the output
1342 stream as-is (giving a ZWNBSP character). */
1343 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001344 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001346 if (bom == 0xFEFF) {
1347 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001348 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001349 }
1350 else if (bom == 0xFFFE) {
1351 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001352 bo = 1;
1353 }
1354#else
Tim Peters772747b2001-08-09 22:21:55 +00001355 if (bom == 0xFEFF) {
1356 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001357 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001358 }
1359 else if (bom == 0xFFFE) {
1360 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001361 bo = -1;
1362 }
1363#endif
1364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
Tim Peters772747b2001-08-09 22:21:55 +00001366 if (bo == -1) {
1367 /* force LE */
1368 ihi = 1;
1369 ilo = 0;
1370 }
1371 else if (bo == 1) {
1372 /* force BE */
1373 ihi = 0;
1374 ilo = 1;
1375 }
1376
1377 while (q < e) {
1378 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1379 q += 2;
1380
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 if (ch < 0xD800 || ch > 0xDFFF) {
1382 *p++ = ch;
1383 continue;
1384 }
1385
1386 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001387 if (q >= e) {
1388 errmsg = "unexpected end of data";
1389 goto utf16Error;
1390 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001391 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001392 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1393 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001394 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001395#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001396 *p++ = ch;
1397 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001398#else
1399 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001400#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001401 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001402 }
1403 else {
1404 errmsg = "illegal UTF-16 surrogate";
1405 goto utf16Error;
1406 }
1407
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001409 errmsg = "illegal encoding";
1410 /* Fall through to report the error */
1411
1412 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001413 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001414 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 }
1416
1417 if (byteorder)
1418 *byteorder = bo;
1419
1420 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001421 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 goto onError;
1423
1424 return (PyObject *)unicode;
1425
1426onError:
1427 Py_DECREF(unicode);
1428 return NULL;
1429}
1430
Tim Peters772747b2001-08-09 22:21:55 +00001431PyObject *
1432PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1433 int size,
1434 const char *errors,
1435 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436{
1437 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001438 unsigned char *p;
1439 int i, pairs;
1440 /* Offsets from p for storing byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
1446
1447#define STORECHAR(CH) \
1448 do { \
1449 p[ihi] = ((CH) >> 8) & 0xff; \
1450 p[ilo] = (CH) & 0xff; \
1451 p += 2; \
1452 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001454 for (i = pairs = 0; i < size; i++)
1455 if (s[i] >= 0x10000)
1456 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001458 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 if (v == NULL)
1460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
Tim Peters772747b2001-08-09 22:21:55 +00001462 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001464 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001465 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001466 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001467
1468 if (byteorder == -1) {
1469 /* force LE */
1470 ihi = 1;
1471 ilo = 0;
1472 }
1473 else if (byteorder == 1) {
1474 /* force BE */
1475 ihi = 0;
1476 ilo = 1;
1477 }
1478
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001479 while (size-- > 0) {
1480 Py_UNICODE ch = *s++;
1481 Py_UNICODE ch2 = 0;
1482 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001483 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1484 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 }
Tim Peters772747b2001-08-09 22:21:55 +00001486 STORECHAR(ch);
1487 if (ch2)
1488 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001491#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001492}
1493
1494PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1495{
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_BadArgument();
1498 return NULL;
1499 }
1500 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1501 PyUnicode_GET_SIZE(unicode),
1502 NULL,
1503 0);
1504}
1505
1506/* --- Unicode Escape Codec ----------------------------------------------- */
1507
1508static
1509int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001510 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 const char *errors,
1512 const char *details)
1513{
1514 if ((errors == NULL) ||
1515 (strcmp(errors,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001517 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 details);
1519 return -1;
1520 }
1521 else if (strcmp(errors,"ignore") == 0) {
1522 return 0;
1523 }
1524 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001525 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return 0;
1527 }
1528 else {
1529 PyErr_Format(PyExc_ValueError,
1530 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001531 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 errors);
1533 return -1;
1534 }
1535}
1536
Fredrik Lundh06d12682001-01-24 07:59:11 +00001537static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001538
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1540 int size,
1541 const char *errors)
1542{
1543 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001546 char* message;
1547 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 /* Escaped strings will always be longer than the resulting
1550 Unicode string, so we start with size here and then reduce the
1551 length after conversion to the true value. */
1552 v = _PyUnicode_New(size);
1553 if (v == NULL)
1554 goto onError;
1555 if (size == 0)
1556 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001557
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 p = buf = PyUnicode_AS_UNICODE(v);
1559 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001560
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 while (s < end) {
1562 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001563 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001564 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
1566 /* Non-escape characters are interpreted as Unicode ordinals */
1567 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569 continue;
1570 }
1571
1572 /* \ - Escapes */
1573 s++;
1574 switch (*s++) {
1575
1576 /* \x escapes */
1577 case '\n': break;
1578 case '\\': *p++ = '\\'; break;
1579 case '\'': *p++ = '\''; break;
1580 case '\"': *p++ = '\"'; break;
1581 case 'b': *p++ = '\b'; break;
1582 case 'f': *p++ = '\014'; break; /* FF */
1583 case 't': *p++ = '\t'; break;
1584 case 'n': *p++ = '\n'; break;
1585 case 'r': *p++ = '\r'; break;
1586 case 'v': *p++ = '\013'; break; /* VT */
1587 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1588
1589 /* \OOO (octal) escapes */
1590 case '0': case '1': case '2': case '3':
1591 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001592 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001594 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001596 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001598 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 break;
1600
Fredrik Lundhccc74732001-02-18 22:13:49 +00001601 /* hex escapes */
1602 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001604 digits = 2;
1605 message = "truncated \\xXX escape";
1606 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607
Fredrik Lundhccc74732001-02-18 22:13:49 +00001608 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001610 digits = 4;
1611 message = "truncated \\uXXXX escape";
1612 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613
Fredrik Lundhccc74732001-02-18 22:13:49 +00001614 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001615 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 digits = 8;
1617 message = "truncated \\UXXXXXXXX escape";
1618 hexescape:
1619 chr = 0;
1620 for (i = 0; i < digits; i++) {
1621 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001622 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001623 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001624 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001626 i++;
1627 break;
1628 }
1629 chr = (chr<<4) & ~0xF;
1630 if (c >= '0' && c <= '9')
1631 chr += c - '0';
1632 else if (c >= 'a' && c <= 'f')
1633 chr += 10 + c - 'a';
1634 else
1635 chr += 10 + c - 'A';
1636 }
1637 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001638 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001639 /* when we get here, chr is a 32-bit unicode character */
1640 if (chr <= 0xffff)
1641 /* UCS-2 character */
1642 *p++ = (Py_UNICODE) chr;
1643 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001644 /* UCS-4 character. Either store directly, or as
1645 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001646#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001647 *p++ = chr;
1648#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 chr -= 0x10000L;
1650 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001651 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001653 } else {
1654 if (unicodeescape_decoding_error(
1655 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 )
1658 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001659 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001660 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001661 break;
1662
1663 /* \N{name} */
1664 case 'N':
1665 message = "malformed \\N character escape";
1666 if (ucnhash_CAPI == NULL) {
1667 /* load the unicode data module */
1668 PyObject *m, *v;
1669 m = PyImport_ImportModule("unicodedata");
1670 if (m == NULL)
1671 goto ucnhashError;
1672 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1673 Py_DECREF(m);
1674 if (v == NULL)
1675 goto ucnhashError;
1676 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1677 Py_DECREF(v);
1678 if (ucnhash_CAPI == NULL)
1679 goto ucnhashError;
1680 }
1681 if (*s == '{') {
1682 const char *start = s+1;
1683 /* look for the closing brace */
1684 while (*s != '}' && s < end)
1685 s++;
1686 if (s > start && s < end && *s == '}') {
1687 /* found a name. look it up in the unicode database */
1688 message = "unknown Unicode character name";
1689 s++;
1690 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1691 goto store;
1692 }
1693 }
1694 if (unicodeescape_decoding_error(&s, &x, errors, message))
1695 goto onError;
1696 *p++ = x;
1697 break;
1698
1699 default:
1700 *p++ = '\\';
1701 *p++ = (unsigned char)s[-1];
1702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 }
1704 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001705 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 return (PyObject *)v;
1708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001710 PyErr_SetString(
1711 PyExc_UnicodeError,
1712 "\\N escapes not supported (can't load unicodedata module)"
1713 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001714 return NULL;
1715
Fredrik Lundhccc74732001-02-18 22:13:49 +00001716onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 Py_XDECREF(v);
1718 return NULL;
1719}
1720
1721/* Return a Unicode-Escape string version of the Unicode object.
1722
1723 If quotes is true, the string is enclosed in u"" or u'' quotes as
1724 appropriate.
1725
1726*/
1727
Barry Warsaw51ac5802000-03-20 16:36:48 +00001728static const Py_UNICODE *findchar(const Py_UNICODE *s,
1729 int size,
1730 Py_UNICODE ch);
1731
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732static
1733PyObject *unicodeescape_string(const Py_UNICODE *s,
1734 int size,
1735 int quotes)
1736{
1737 PyObject *repr;
1738 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001740 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1743 if (repr == NULL)
1744 return NULL;
1745
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001746 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
1748 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 *p++ = 'u';
1750 *p++ = (findchar(s, size, '\'') &&
1751 !findchar(s, size, '"')) ? '"' : '\'';
1752 }
1753 while (size-- > 0) {
1754 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001757 if (quotes &&
1758 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 *p++ = '\\';
1760 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001761 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001763
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001764#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001765 /* Map 21-bit characters to '\U00xxxxxx' */
1766 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001767 int offset = p - PyString_AS_STRING(repr);
1768
1769 /* Resize the string if necessary */
1770 if (offset + 12 > PyString_GET_SIZE(repr)) {
1771 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1772 goto onError;
1773 p = PyString_AS_STRING(repr) + offset;
1774 }
1775
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 *p++ = '\\';
1777 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001778 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1780 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1781 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1782 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1783 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1784 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001785 *p++ = hexdigit[ch & 0x0000000F];
1786 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001787 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001788#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001789 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1790 else if (ch >= 0xD800 && ch < 0xDC00) {
1791 Py_UNICODE ch2;
1792 Py_UCS4 ucs;
1793
1794 ch2 = *s++;
1795 size--;
1796 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1797 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1798 *p++ = '\\';
1799 *p++ = 'U';
1800 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1802 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1803 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1804 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1805 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1806 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1807 *p++ = hexdigit[ucs & 0x0000000F];
1808 continue;
1809 }
1810 /* Fall through: isolated surrogates are copied as-is */
1811 s--;
1812 size++;
1813 }
1814
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001816 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817 *p++ = '\\';
1818 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001819 *p++ = hexdigit[(ch >> 12) & 0x000F];
1820 *p++ = hexdigit[(ch >> 8) & 0x000F];
1821 *p++ = hexdigit[(ch >> 4) & 0x000F];
1822 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001824
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001825 /* Map special whitespace to '\t', \n', '\r' */
1826 else if (ch == '\t') {
1827 *p++ = '\\';
1828 *p++ = 't';
1829 }
1830 else if (ch == '\n') {
1831 *p++ = '\\';
1832 *p++ = 'n';
1833 }
1834 else if (ch == '\r') {
1835 *p++ = '\\';
1836 *p++ = 'r';
1837 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001838
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001839 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 else if (ch < ' ' || ch >= 128) {
1841 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001842 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001843 *p++ = hexdigit[(ch >> 4) & 0x000F];
1844 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001846
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847 /* Copy everything else as-is */
1848 else
1849 *p++ = (char) ch;
1850 }
1851 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001852 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853
1854 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001855 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001856 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857
1858 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001859
1860 onError:
1861 Py_DECREF(repr);
1862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863}
1864
1865PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1866 int size)
1867{
1868 return unicodeescape_string(s, size, 0);
1869}
1870
1871PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1872{
1873 if (!PyUnicode_Check(unicode)) {
1874 PyErr_BadArgument();
1875 return NULL;
1876 }
1877 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1878 PyUnicode_GET_SIZE(unicode));
1879}
1880
1881/* --- Raw Unicode Escape Codec ------------------------------------------- */
1882
1883PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1884 int size,
1885 const char *errors)
1886{
1887 PyUnicodeObject *v;
1888 Py_UNICODE *p, *buf;
1889 const char *end;
1890 const char *bs;
1891
1892 /* Escaped strings will always be longer than the resulting
1893 Unicode string, so we start with size here and then reduce the
1894 length after conversion to the true value. */
1895 v = _PyUnicode_New(size);
1896 if (v == NULL)
1897 goto onError;
1898 if (size == 0)
1899 return (PyObject *)v;
1900 p = buf = PyUnicode_AS_UNICODE(v);
1901 end = s + size;
1902 while (s < end) {
1903 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001904 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 int i;
1906
1907 /* Non-escape characters are interpreted as Unicode ordinals */
1908 if (*s != '\\') {
1909 *p++ = (unsigned char)*s++;
1910 continue;
1911 }
1912
1913 /* \u-escapes are only interpreted iff the number of leading
1914 backslashes if odd */
1915 bs = s;
1916 for (;s < end;) {
1917 if (*s != '\\')
1918 break;
1919 *p++ = (unsigned char)*s++;
1920 }
1921 if (((s - bs) & 1) == 0 ||
1922 s >= end ||
1923 *s != 'u') {
1924 continue;
1925 }
1926 p--;
1927 s++;
1928
1929 /* \uXXXX with 4 hex digits */
1930 for (x = 0, i = 0; i < 4; i++) {
1931 c = (unsigned char)s[i];
1932 if (!isxdigit(c)) {
1933 if (unicodeescape_decoding_error(&s, &x, errors,
1934 "truncated \\uXXXX"))
1935 goto onError;
1936 i++;
1937 break;
1938 }
1939 x = (x<<4) & ~0xF;
1940 if (c >= '0' && c <= '9')
1941 x += c - '0';
1942 else if (c >= 'a' && c <= 'f')
1943 x += 10 + c - 'a';
1944 else
1945 x += 10 + c - 'A';
1946 }
1947 s += i;
1948 *p++ = x;
1949 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001950 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 return (PyObject *)v;
1953
1954 onError:
1955 Py_XDECREF(v);
1956 return NULL;
1957}
1958
1959PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1960 int size)
1961{
1962 PyObject *repr;
1963 char *p;
1964 char *q;
1965
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001966 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967
1968 repr = PyString_FromStringAndSize(NULL, 6 * size);
1969 if (repr == NULL)
1970 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001971 if (size == 0)
1972 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
1974 p = q = PyString_AS_STRING(repr);
1975 while (size-- > 0) {
1976 Py_UNICODE ch = *s++;
1977 /* Map 16-bit characters to '\uxxxx' */
1978 if (ch >= 256) {
1979 *p++ = '\\';
1980 *p++ = 'u';
1981 *p++ = hexdigit[(ch >> 12) & 0xf];
1982 *p++ = hexdigit[(ch >> 8) & 0xf];
1983 *p++ = hexdigit[(ch >> 4) & 0xf];
1984 *p++ = hexdigit[ch & 15];
1985 }
1986 /* Copy everything else as-is */
1987 else
1988 *p++ = (char) ch;
1989 }
1990 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001991 if (_PyString_Resize(&repr, p - q))
1992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993
1994 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001995
1996 onError:
1997 Py_DECREF(repr);
1998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999}
2000
2001PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2002{
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 return NULL;
2006 }
2007 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2008 PyUnicode_GET_SIZE(unicode));
2009}
2010
2011/* --- Latin-1 Codec ------------------------------------------------------ */
2012
2013PyObject *PyUnicode_DecodeLatin1(const char *s,
2014 int size,
2015 const char *errors)
2016{
2017 PyUnicodeObject *v;
2018 Py_UNICODE *p;
2019
2020 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002021 if (size == 1 && *(unsigned char*)s < 256) {
2022 Py_UNICODE r = *(unsigned char*)s;
2023 return PyUnicode_FromUnicode(&r, 1);
2024 }
2025
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 v = _PyUnicode_New(size);
2027 if (v == NULL)
2028 goto onError;
2029 if (size == 0)
2030 return (PyObject *)v;
2031 p = PyUnicode_AS_UNICODE(v);
2032 while (size-- > 0)
2033 *p++ = (unsigned char)*s++;
2034 return (PyObject *)v;
2035
2036 onError:
2037 Py_XDECREF(v);
2038 return NULL;
2039}
2040
2041static
2042int latin1_encoding_error(const Py_UNICODE **source,
2043 char **dest,
2044 const char *errors,
2045 const char *details)
2046{
2047 if ((errors == NULL) ||
2048 (strcmp(errors,"strict") == 0)) {
2049 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002050 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 details);
2052 return -1;
2053 }
2054 else if (strcmp(errors,"ignore") == 0) {
2055 return 0;
2056 }
2057 else if (strcmp(errors,"replace") == 0) {
2058 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002059 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 return 0;
2061 }
2062 else {
2063 PyErr_Format(PyExc_ValueError,
2064 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002065 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 errors);
2067 return -1;
2068 }
2069}
2070
2071PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2072 int size,
2073 const char *errors)
2074{
2075 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002076 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002077
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 repr = PyString_FromStringAndSize(NULL, size);
2079 if (repr == NULL)
2080 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002081 if (size == 0)
2082 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083
2084 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002085 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 while (size-- > 0) {
2087 Py_UNICODE ch = *p++;
2088 if (ch >= 256) {
2089 if (latin1_encoding_error(&p, &s, errors,
2090 "ordinal not in range(256)"))
2091 goto onError;
2092 }
2093 else
2094 *s++ = (char)ch;
2095 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002096 /* Resize if error handling skipped some characters */
2097 if (s - start < PyString_GET_SIZE(repr))
2098 if (_PyString_Resize(&repr, s - start))
2099 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 return repr;
2101
2102 onError:
2103 Py_DECREF(repr);
2104 return NULL;
2105}
2106
2107PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2108{
2109 if (!PyUnicode_Check(unicode)) {
2110 PyErr_BadArgument();
2111 return NULL;
2112 }
2113 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2114 PyUnicode_GET_SIZE(unicode),
2115 NULL);
2116}
2117
2118/* --- 7-bit ASCII Codec -------------------------------------------------- */
2119
2120static
2121int ascii_decoding_error(const char **source,
2122 Py_UNICODE **dest,
2123 const char *errors,
2124 const char *details)
2125{
2126 if ((errors == NULL) ||
2127 (strcmp(errors,"strict") == 0)) {
2128 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002129 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 details);
2131 return -1;
2132 }
2133 else if (strcmp(errors,"ignore") == 0) {
2134 return 0;
2135 }
2136 else if (strcmp(errors,"replace") == 0) {
2137 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2138 (*dest)++;
2139 return 0;
2140 }
2141 else {
2142 PyErr_Format(PyExc_ValueError,
2143 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002144 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 errors);
2146 return -1;
2147 }
2148}
2149
2150PyObject *PyUnicode_DecodeASCII(const char *s,
2151 int size,
2152 const char *errors)
2153{
2154 PyUnicodeObject *v;
2155 Py_UNICODE *p;
2156
2157 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002158 if (size == 1 && *(unsigned char*)s < 128) {
2159 Py_UNICODE r = *(unsigned char*)s;
2160 return PyUnicode_FromUnicode(&r, 1);
2161 }
2162
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 v = _PyUnicode_New(size);
2164 if (v == NULL)
2165 goto onError;
2166 if (size == 0)
2167 return (PyObject *)v;
2168 p = PyUnicode_AS_UNICODE(v);
2169 while (size-- > 0) {
2170 register unsigned char c;
2171
2172 c = (unsigned char)*s++;
2173 if (c < 128)
2174 *p++ = c;
2175 else if (ascii_decoding_error(&s, &p, errors,
2176 "ordinal not in range(128)"))
2177 goto onError;
2178 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002179 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002180 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002181 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 return (PyObject *)v;
2183
2184 onError:
2185 Py_XDECREF(v);
2186 return NULL;
2187}
2188
2189static
2190int ascii_encoding_error(const Py_UNICODE **source,
2191 char **dest,
2192 const char *errors,
2193 const char *details)
2194{
2195 if ((errors == NULL) ||
2196 (strcmp(errors,"strict") == 0)) {
2197 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002198 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 details);
2200 return -1;
2201 }
2202 else if (strcmp(errors,"ignore") == 0) {
2203 return 0;
2204 }
2205 else if (strcmp(errors,"replace") == 0) {
2206 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002207 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 return 0;
2209 }
2210 else {
2211 PyErr_Format(PyExc_ValueError,
2212 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002213 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 errors);
2215 return -1;
2216 }
2217}
2218
2219PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2220 int size,
2221 const char *errors)
2222{
2223 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002224 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002225
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 repr = PyString_FromStringAndSize(NULL, size);
2227 if (repr == NULL)
2228 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002229 if (size == 0)
2230 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231
2232 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002233 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 while (size-- > 0) {
2235 Py_UNICODE ch = *p++;
2236 if (ch >= 128) {
2237 if (ascii_encoding_error(&p, &s, errors,
2238 "ordinal not in range(128)"))
2239 goto onError;
2240 }
2241 else
2242 *s++ = (char)ch;
2243 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002244 /* Resize if error handling skipped some characters */
2245 if (s - start < PyString_GET_SIZE(repr))
2246 if (_PyString_Resize(&repr, s - start))
2247 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 return repr;
2249
2250 onError:
2251 Py_DECREF(repr);
2252 return NULL;
2253}
2254
2255PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2256{
2257 if (!PyUnicode_Check(unicode)) {
2258 PyErr_BadArgument();
2259 return NULL;
2260 }
2261 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2262 PyUnicode_GET_SIZE(unicode),
2263 NULL);
2264}
2265
Fredrik Lundh30831632001-06-26 15:11:00 +00002266#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002267
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002268/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002269
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002270PyObject *PyUnicode_DecodeMBCS(const char *s,
2271 int size,
2272 const char *errors)
2273{
2274 PyUnicodeObject *v;
2275 Py_UNICODE *p;
2276
2277 /* First get the size of the result */
2278 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002279 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002280 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2281
2282 v = _PyUnicode_New(usize);
2283 if (v == NULL)
2284 return NULL;
2285 if (usize == 0)
2286 return (PyObject *)v;
2287 p = PyUnicode_AS_UNICODE(v);
2288 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2289 Py_DECREF(v);
2290 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2291 }
2292
2293 return (PyObject *)v;
2294}
2295
2296PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2297 int size,
2298 const char *errors)
2299{
2300 PyObject *repr;
2301 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002302 DWORD mbcssize;
2303
2304 /* If there are no characters, bail now! */
2305 if (size==0)
2306 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002307
2308 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002309 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002310 if (mbcssize==0)
2311 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2312
2313 repr = PyString_FromStringAndSize(NULL, mbcssize);
2314 if (repr == NULL)
2315 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002316 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002317 return repr;
2318
2319 /* Do the conversion */
2320 s = PyString_AS_STRING(repr);
2321 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2322 Py_DECREF(repr);
2323 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2324 }
2325 return repr;
2326}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002327
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002328#endif /* MS_WIN32 */
2329
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330/* --- Character Mapping Codec -------------------------------------------- */
2331
2332static
2333int charmap_decoding_error(const char **source,
2334 Py_UNICODE **dest,
2335 const char *errors,
2336 const char *details)
2337{
2338 if ((errors == NULL) ||
2339 (strcmp(errors,"strict") == 0)) {
2340 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002341 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342 details);
2343 return -1;
2344 }
2345 else if (strcmp(errors,"ignore") == 0) {
2346 return 0;
2347 }
2348 else if (strcmp(errors,"replace") == 0) {
2349 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2350 (*dest)++;
2351 return 0;
2352 }
2353 else {
2354 PyErr_Format(PyExc_ValueError,
2355 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002356 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 errors);
2358 return -1;
2359 }
2360}
2361
2362PyObject *PyUnicode_DecodeCharmap(const char *s,
2363 int size,
2364 PyObject *mapping,
2365 const char *errors)
2366{
2367 PyUnicodeObject *v;
2368 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002369 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370
2371 /* Default to Latin-1 */
2372 if (mapping == NULL)
2373 return PyUnicode_DecodeLatin1(s, size, errors);
2374
2375 v = _PyUnicode_New(size);
2376 if (v == NULL)
2377 goto onError;
2378 if (size == 0)
2379 return (PyObject *)v;
2380 p = PyUnicode_AS_UNICODE(v);
2381 while (size-- > 0) {
2382 unsigned char ch = *s++;
2383 PyObject *w, *x;
2384
2385 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2386 w = PyInt_FromLong((long)ch);
2387 if (w == NULL)
2388 goto onError;
2389 x = PyObject_GetItem(mapping, w);
2390 Py_DECREF(w);
2391 if (x == NULL) {
2392 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002393 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002395 x = Py_None;
2396 Py_INCREF(x);
2397 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 }
2400
2401 /* Apply mapping */
2402 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002403 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 if (value < 0 || value > 65535) {
2405 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002406 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407 Py_DECREF(x);
2408 goto onError;
2409 }
2410 *p++ = (Py_UNICODE)value;
2411 }
2412 else if (x == Py_None) {
2413 /* undefined mapping */
2414 if (charmap_decoding_error(&s, &p, errors,
2415 "character maps to <undefined>")) {
2416 Py_DECREF(x);
2417 goto onError;
2418 }
2419 }
2420 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002421 int targetsize = PyUnicode_GET_SIZE(x);
2422
2423 if (targetsize == 1)
2424 /* 1-1 mapping */
2425 *p++ = *PyUnicode_AS_UNICODE(x);
2426
2427 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002429 if (targetsize > extrachars) {
2430 /* resize first */
2431 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2432 int needed = (targetsize - extrachars) + \
2433 (targetsize << 2);
2434 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002435 if (_PyUnicode_Resize(&v,
2436 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002437 Py_DECREF(x);
2438 goto onError;
2439 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002440 p = PyUnicode_AS_UNICODE(v) + oldpos;
2441 }
2442 Py_UNICODE_COPY(p,
2443 PyUnicode_AS_UNICODE(x),
2444 targetsize);
2445 p += targetsize;
2446 extrachars -= targetsize;
2447 }
2448 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449 }
2450 else {
2451 /* wrong return value */
2452 PyErr_SetString(PyExc_TypeError,
2453 "character mapping must return integer, None or unicode");
2454 Py_DECREF(x);
2455 goto onError;
2456 }
2457 Py_DECREF(x);
2458 }
2459 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002460 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 goto onError;
2462 return (PyObject *)v;
2463
2464 onError:
2465 Py_XDECREF(v);
2466 return NULL;
2467}
2468
2469static
2470int charmap_encoding_error(const Py_UNICODE **source,
2471 char **dest,
2472 const char *errors,
2473 const char *details)
2474{
2475 if ((errors == NULL) ||
2476 (strcmp(errors,"strict") == 0)) {
2477 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002478 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 details);
2480 return -1;
2481 }
2482 else if (strcmp(errors,"ignore") == 0) {
2483 return 0;
2484 }
2485 else if (strcmp(errors,"replace") == 0) {
2486 **dest = '?';
2487 (*dest)++;
2488 return 0;
2489 }
2490 else {
2491 PyErr_Format(PyExc_ValueError,
2492 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002493 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 errors);
2495 return -1;
2496 }
2497}
2498
2499PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2500 int size,
2501 PyObject *mapping,
2502 const char *errors)
2503{
2504 PyObject *v;
2505 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002506 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507
2508 /* Default to Latin-1 */
2509 if (mapping == NULL)
2510 return PyUnicode_EncodeLatin1(p, size, errors);
2511
2512 v = PyString_FromStringAndSize(NULL, size);
2513 if (v == NULL)
2514 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002515 if (size == 0)
2516 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 s = PyString_AS_STRING(v);
2518 while (size-- > 0) {
2519 Py_UNICODE ch = *p++;
2520 PyObject *w, *x;
2521
2522 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2523 w = PyInt_FromLong((long)ch);
2524 if (w == NULL)
2525 goto onError;
2526 x = PyObject_GetItem(mapping, w);
2527 Py_DECREF(w);
2528 if (x == NULL) {
2529 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002530 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002532 x = Py_None;
2533 Py_INCREF(x);
2534 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 }
2537
2538 /* Apply mapping */
2539 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002540 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 if (value < 0 || value > 255) {
2542 PyErr_SetString(PyExc_TypeError,
2543 "character mapping must be in range(256)");
2544 Py_DECREF(x);
2545 goto onError;
2546 }
2547 *s++ = (char)value;
2548 }
2549 else if (x == Py_None) {
2550 /* undefined mapping */
2551 if (charmap_encoding_error(&p, &s, errors,
2552 "character maps to <undefined>")) {
2553 Py_DECREF(x);
2554 goto onError;
2555 }
2556 }
2557 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002558 int targetsize = PyString_GET_SIZE(x);
2559
2560 if (targetsize == 1)
2561 /* 1-1 mapping */
2562 *s++ = *PyString_AS_STRING(x);
2563
2564 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002566 if (targetsize > extrachars) {
2567 /* resize first */
2568 int oldpos = (int)(s - PyString_AS_STRING(v));
2569 int needed = (targetsize - extrachars) + \
2570 (targetsize << 2);
2571 extrachars += needed;
2572 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002573 Py_DECREF(x);
2574 goto onError;
2575 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002576 s = PyString_AS_STRING(v) + oldpos;
2577 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002578 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002579 s += targetsize;
2580 extrachars -= targetsize;
2581 }
2582 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 }
2584 else {
2585 /* wrong return value */
2586 PyErr_SetString(PyExc_TypeError,
2587 "character mapping must return integer, None or unicode");
2588 Py_DECREF(x);
2589 goto onError;
2590 }
2591 Py_DECREF(x);
2592 }
2593 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2594 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2595 goto onError;
2596 return v;
2597
2598 onError:
2599 Py_DECREF(v);
2600 return NULL;
2601}
2602
2603PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2604 PyObject *mapping)
2605{
2606 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2607 PyErr_BadArgument();
2608 return NULL;
2609 }
2610 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2611 PyUnicode_GET_SIZE(unicode),
2612 mapping,
2613 NULL);
2614}
2615
2616static
2617int translate_error(const Py_UNICODE **source,
2618 Py_UNICODE **dest,
2619 const char *errors,
2620 const char *details)
2621{
2622 if ((errors == NULL) ||
2623 (strcmp(errors,"strict") == 0)) {
2624 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002625 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 details);
2627 return -1;
2628 }
2629 else if (strcmp(errors,"ignore") == 0) {
2630 return 0;
2631 }
2632 else if (strcmp(errors,"replace") == 0) {
2633 **dest = '?';
2634 (*dest)++;
2635 return 0;
2636 }
2637 else {
2638 PyErr_Format(PyExc_ValueError,
2639 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002640 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 errors);
2642 return -1;
2643 }
2644}
2645
2646PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2647 int size,
2648 PyObject *mapping,
2649 const char *errors)
2650{
2651 PyUnicodeObject *v;
2652 Py_UNICODE *p;
2653
2654 if (mapping == NULL) {
2655 PyErr_BadArgument();
2656 return NULL;
2657 }
2658
2659 /* Output will never be longer than input */
2660 v = _PyUnicode_New(size);
2661 if (v == NULL)
2662 goto onError;
2663 if (size == 0)
2664 goto done;
2665 p = PyUnicode_AS_UNICODE(v);
2666 while (size-- > 0) {
2667 Py_UNICODE ch = *s++;
2668 PyObject *w, *x;
2669
2670 /* Get mapping */
2671 w = PyInt_FromLong(ch);
2672 if (w == NULL)
2673 goto onError;
2674 x = PyObject_GetItem(mapping, w);
2675 Py_DECREF(w);
2676 if (x == NULL) {
2677 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2678 /* No mapping found: default to 1-1 mapping */
2679 PyErr_Clear();
2680 *p++ = ch;
2681 continue;
2682 }
2683 goto onError;
2684 }
2685
2686 /* Apply mapping */
2687 if (PyInt_Check(x))
2688 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2689 else if (x == Py_None) {
2690 /* undefined mapping */
2691 if (translate_error(&s, &p, errors,
2692 "character maps to <undefined>")) {
2693 Py_DECREF(x);
2694 goto onError;
2695 }
2696 }
2697 else if (PyUnicode_Check(x)) {
2698 if (PyUnicode_GET_SIZE(x) != 1) {
2699 /* 1-n mapping */
2700 PyErr_SetString(PyExc_NotImplementedError,
2701 "1-n mappings are currently not implemented");
2702 Py_DECREF(x);
2703 goto onError;
2704 }
2705 *p++ = *PyUnicode_AS_UNICODE(x);
2706 }
2707 else {
2708 /* wrong return value */
2709 PyErr_SetString(PyExc_TypeError,
2710 "translate mapping must return integer, None or unicode");
2711 Py_DECREF(x);
2712 goto onError;
2713 }
2714 Py_DECREF(x);
2715 }
2716 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002717 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719
2720 done:
2721 return (PyObject *)v;
2722
2723 onError:
2724 Py_XDECREF(v);
2725 return NULL;
2726}
2727
2728PyObject *PyUnicode_Translate(PyObject *str,
2729 PyObject *mapping,
2730 const char *errors)
2731{
2732 PyObject *result;
2733
2734 str = PyUnicode_FromObject(str);
2735 if (str == NULL)
2736 goto onError;
2737 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2738 PyUnicode_GET_SIZE(str),
2739 mapping,
2740 errors);
2741 Py_DECREF(str);
2742 return result;
2743
2744 onError:
2745 Py_XDECREF(str);
2746 return NULL;
2747}
2748
Guido van Rossum9e896b32000-04-05 20:11:21 +00002749/* --- Decimal Encoder ---------------------------------------------------- */
2750
2751int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2752 int length,
2753 char *output,
2754 const char *errors)
2755{
2756 Py_UNICODE *p, *end;
2757
2758 if (output == NULL) {
2759 PyErr_BadArgument();
2760 return -1;
2761 }
2762
2763 p = s;
2764 end = s + length;
2765 while (p < end) {
2766 register Py_UNICODE ch = *p++;
2767 int decimal;
2768
2769 if (Py_UNICODE_ISSPACE(ch)) {
2770 *output++ = ' ';
2771 continue;
2772 }
2773 decimal = Py_UNICODE_TODECIMAL(ch);
2774 if (decimal >= 0) {
2775 *output++ = '0' + decimal;
2776 continue;
2777 }
Guido van Rossumba477042000-04-06 18:18:10 +00002778 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002779 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002780 continue;
2781 }
2782 /* All other characters are considered invalid */
2783 if (errors == NULL || strcmp(errors, "strict") == 0) {
2784 PyErr_SetString(PyExc_ValueError,
2785 "invalid decimal Unicode string");
2786 goto onError;
2787 }
2788 else if (strcmp(errors, "ignore") == 0)
2789 continue;
2790 else if (strcmp(errors, "replace") == 0) {
2791 *output++ = '?';
2792 continue;
2793 }
2794 }
2795 /* 0-terminate the output string */
2796 *output++ = '\0';
2797 return 0;
2798
2799 onError:
2800 return -1;
2801}
2802
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803/* --- Helpers ------------------------------------------------------------ */
2804
2805static
2806int count(PyUnicodeObject *self,
2807 int start,
2808 int end,
2809 PyUnicodeObject *substring)
2810{
2811 int count = 0;
2812
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002813 if (start < 0)
2814 start += self->length;
2815 if (start < 0)
2816 start = 0;
2817 if (end > self->length)
2818 end = self->length;
2819 if (end < 0)
2820 end += self->length;
2821 if (end < 0)
2822 end = 0;
2823
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002824 if (substring->length == 0)
2825 return (end - start + 1);
2826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 end -= substring->length;
2828
2829 while (start <= end)
2830 if (Py_UNICODE_MATCH(self, start, substring)) {
2831 count++;
2832 start += substring->length;
2833 } else
2834 start++;
2835
2836 return count;
2837}
2838
2839int PyUnicode_Count(PyObject *str,
2840 PyObject *substr,
2841 int start,
2842 int end)
2843{
2844 int result;
2845
2846 str = PyUnicode_FromObject(str);
2847 if (str == NULL)
2848 return -1;
2849 substr = PyUnicode_FromObject(substr);
2850 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002851 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 return -1;
2853 }
2854
2855 result = count((PyUnicodeObject *)str,
2856 start, end,
2857 (PyUnicodeObject *)substr);
2858
2859 Py_DECREF(str);
2860 Py_DECREF(substr);
2861 return result;
2862}
2863
2864static
2865int findstring(PyUnicodeObject *self,
2866 PyUnicodeObject *substring,
2867 int start,
2868 int end,
2869 int direction)
2870{
2871 if (start < 0)
2872 start += self->length;
2873 if (start < 0)
2874 start = 0;
2875
2876 if (substring->length == 0)
2877 return start;
2878
2879 if (end > self->length)
2880 end = self->length;
2881 if (end < 0)
2882 end += self->length;
2883 if (end < 0)
2884 end = 0;
2885
2886 end -= substring->length;
2887
2888 if (direction < 0) {
2889 for (; end >= start; end--)
2890 if (Py_UNICODE_MATCH(self, end, substring))
2891 return end;
2892 } else {
2893 for (; start <= end; start++)
2894 if (Py_UNICODE_MATCH(self, start, substring))
2895 return start;
2896 }
2897
2898 return -1;
2899}
2900
2901int PyUnicode_Find(PyObject *str,
2902 PyObject *substr,
2903 int start,
2904 int end,
2905 int direction)
2906{
2907 int result;
2908
2909 str = PyUnicode_FromObject(str);
2910 if (str == NULL)
2911 return -1;
2912 substr = PyUnicode_FromObject(substr);
2913 if (substr == NULL) {
2914 Py_DECREF(substr);
2915 return -1;
2916 }
2917
2918 result = findstring((PyUnicodeObject *)str,
2919 (PyUnicodeObject *)substr,
2920 start, end, direction);
2921 Py_DECREF(str);
2922 Py_DECREF(substr);
2923 return result;
2924}
2925
2926static
2927int tailmatch(PyUnicodeObject *self,
2928 PyUnicodeObject *substring,
2929 int start,
2930 int end,
2931 int direction)
2932{
2933 if (start < 0)
2934 start += self->length;
2935 if (start < 0)
2936 start = 0;
2937
2938 if (substring->length == 0)
2939 return 1;
2940
2941 if (end > self->length)
2942 end = self->length;
2943 if (end < 0)
2944 end += self->length;
2945 if (end < 0)
2946 end = 0;
2947
2948 end -= substring->length;
2949 if (end < start)
2950 return 0;
2951
2952 if (direction > 0) {
2953 if (Py_UNICODE_MATCH(self, end, substring))
2954 return 1;
2955 } else {
2956 if (Py_UNICODE_MATCH(self, start, substring))
2957 return 1;
2958 }
2959
2960 return 0;
2961}
2962
2963int PyUnicode_Tailmatch(PyObject *str,
2964 PyObject *substr,
2965 int start,
2966 int end,
2967 int direction)
2968{
2969 int result;
2970
2971 str = PyUnicode_FromObject(str);
2972 if (str == NULL)
2973 return -1;
2974 substr = PyUnicode_FromObject(substr);
2975 if (substr == NULL) {
2976 Py_DECREF(substr);
2977 return -1;
2978 }
2979
2980 result = tailmatch((PyUnicodeObject *)str,
2981 (PyUnicodeObject *)substr,
2982 start, end, direction);
2983 Py_DECREF(str);
2984 Py_DECREF(substr);
2985 return result;
2986}
2987
2988static
2989const Py_UNICODE *findchar(const Py_UNICODE *s,
2990 int size,
2991 Py_UNICODE ch)
2992{
2993 /* like wcschr, but doesn't stop at NULL characters */
2994
2995 while (size-- > 0) {
2996 if (*s == ch)
2997 return s;
2998 s++;
2999 }
3000
3001 return NULL;
3002}
3003
3004/* Apply fixfct filter to the Unicode object self and return a
3005 reference to the modified object */
3006
3007static
3008PyObject *fixup(PyUnicodeObject *self,
3009 int (*fixfct)(PyUnicodeObject *s))
3010{
3011
3012 PyUnicodeObject *u;
3013
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003014 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 if (u == NULL)
3016 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003017
3018 Py_UNICODE_COPY(u->str, self->str, self->length);
3019
Tim Peters7a29bd52001-09-12 03:03:31 +00003020 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 /* fixfct should return TRUE if it modified the buffer. If
3022 FALSE, return a reference to the original buffer instead
3023 (to save space, not time) */
3024 Py_INCREF(self);
3025 Py_DECREF(u);
3026 return (PyObject*) self;
3027 }
3028 return (PyObject*) u;
3029}
3030
3031static
3032int fixupper(PyUnicodeObject *self)
3033{
3034 int len = self->length;
3035 Py_UNICODE *s = self->str;
3036 int status = 0;
3037
3038 while (len-- > 0) {
3039 register Py_UNICODE ch;
3040
3041 ch = Py_UNICODE_TOUPPER(*s);
3042 if (ch != *s) {
3043 status = 1;
3044 *s = ch;
3045 }
3046 s++;
3047 }
3048
3049 return status;
3050}
3051
3052static
3053int fixlower(PyUnicodeObject *self)
3054{
3055 int len = self->length;
3056 Py_UNICODE *s = self->str;
3057 int status = 0;
3058
3059 while (len-- > 0) {
3060 register Py_UNICODE ch;
3061
3062 ch = Py_UNICODE_TOLOWER(*s);
3063 if (ch != *s) {
3064 status = 1;
3065 *s = ch;
3066 }
3067 s++;
3068 }
3069
3070 return status;
3071}
3072
3073static
3074int fixswapcase(PyUnicodeObject *self)
3075{
3076 int len = self->length;
3077 Py_UNICODE *s = self->str;
3078 int status = 0;
3079
3080 while (len-- > 0) {
3081 if (Py_UNICODE_ISUPPER(*s)) {
3082 *s = Py_UNICODE_TOLOWER(*s);
3083 status = 1;
3084 } else if (Py_UNICODE_ISLOWER(*s)) {
3085 *s = Py_UNICODE_TOUPPER(*s);
3086 status = 1;
3087 }
3088 s++;
3089 }
3090
3091 return status;
3092}
3093
3094static
3095int fixcapitalize(PyUnicodeObject *self)
3096{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003097 int len = self->length;
3098 Py_UNICODE *s = self->str;
3099 int status = 0;
3100
3101 if (len == 0)
3102 return 0;
3103 if (Py_UNICODE_ISLOWER(*s)) {
3104 *s = Py_UNICODE_TOUPPER(*s);
3105 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003107 s++;
3108 while (--len > 0) {
3109 if (Py_UNICODE_ISUPPER(*s)) {
3110 *s = Py_UNICODE_TOLOWER(*s);
3111 status = 1;
3112 }
3113 s++;
3114 }
3115 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116}
3117
3118static
3119int fixtitle(PyUnicodeObject *self)
3120{
3121 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3122 register Py_UNICODE *e;
3123 int previous_is_cased;
3124
3125 /* Shortcut for single character strings */
3126 if (PyUnicode_GET_SIZE(self) == 1) {
3127 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3128 if (*p != ch) {
3129 *p = ch;
3130 return 1;
3131 }
3132 else
3133 return 0;
3134 }
3135
3136 e = p + PyUnicode_GET_SIZE(self);
3137 previous_is_cased = 0;
3138 for (; p < e; p++) {
3139 register const Py_UNICODE ch = *p;
3140
3141 if (previous_is_cased)
3142 *p = Py_UNICODE_TOLOWER(ch);
3143 else
3144 *p = Py_UNICODE_TOTITLE(ch);
3145
3146 if (Py_UNICODE_ISLOWER(ch) ||
3147 Py_UNICODE_ISUPPER(ch) ||
3148 Py_UNICODE_ISTITLE(ch))
3149 previous_is_cased = 1;
3150 else
3151 previous_is_cased = 0;
3152 }
3153 return 1;
3154}
3155
3156PyObject *PyUnicode_Join(PyObject *separator,
3157 PyObject *seq)
3158{
3159 Py_UNICODE *sep;
3160 int seplen;
3161 PyUnicodeObject *res = NULL;
3162 int reslen = 0;
3163 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 int sz = 100;
3165 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003166 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167
Tim Peters2cfe3682001-05-05 05:36:48 +00003168 it = PyObject_GetIter(seq);
3169 if (it == NULL)
3170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
3172 if (separator == NULL) {
3173 Py_UNICODE blank = ' ';
3174 sep = &blank;
3175 seplen = 1;
3176 }
3177 else {
3178 separator = PyUnicode_FromObject(separator);
3179 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181 sep = PyUnicode_AS_UNICODE(separator);
3182 seplen = PyUnicode_GET_SIZE(separator);
3183 }
3184
3185 res = _PyUnicode_New(sz);
3186 if (res == NULL)
3187 goto onError;
3188 p = PyUnicode_AS_UNICODE(res);
3189 reslen = 0;
3190
Tim Peters2cfe3682001-05-05 05:36:48 +00003191 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003193 PyObject *item = PyIter_Next(it);
3194 if (item == NULL) {
3195 if (PyErr_Occurred())
3196 goto onError;
3197 break;
3198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 if (!PyUnicode_Check(item)) {
3200 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003201 if (!PyString_Check(item)) {
3202 PyErr_Format(PyExc_TypeError,
3203 "sequence item %i: expected string or Unicode,"
3204 " %.80s found",
3205 i, item->ob_type->tp_name);
3206 Py_DECREF(item);
3207 goto onError;
3208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 v = PyUnicode_FromObject(item);
3210 Py_DECREF(item);
3211 item = v;
3212 if (item == NULL)
3213 goto onError;
3214 }
3215 itemlen = PyUnicode_GET_SIZE(item);
3216 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003217 if (_PyUnicode_Resize(&res, sz*2)) {
3218 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 sz *= 2;
3222 p = PyUnicode_AS_UNICODE(res) + reslen;
3223 }
3224 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003225 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 p += seplen;
3227 reslen += seplen;
3228 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003229 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 p += itemlen;
3231 reslen += itemlen;
3232 Py_DECREF(item);
3233 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003234 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 goto onError;
3236
3237 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003238 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 return (PyObject *)res;
3240
3241 onError:
3242 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003243 Py_XDECREF(res);
3244 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 return NULL;
3246}
3247
3248static
3249PyUnicodeObject *pad(PyUnicodeObject *self,
3250 int left,
3251 int right,
3252 Py_UNICODE fill)
3253{
3254 PyUnicodeObject *u;
3255
3256 if (left < 0)
3257 left = 0;
3258 if (right < 0)
3259 right = 0;
3260
Tim Peters7a29bd52001-09-12 03:03:31 +00003261 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_INCREF(self);
3263 return self;
3264 }
3265
3266 u = _PyUnicode_New(left + self->length + right);
3267 if (u) {
3268 if (left)
3269 Py_UNICODE_FILL(u->str, fill, left);
3270 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3271 if (right)
3272 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3273 }
3274
3275 return u;
3276}
3277
/* Append the slice data[left:right] as a new Unicode object to list.

   This is a helper for the split functions below and relies on names
   from the expansion site: a PyObject *str scratch variable, the
   target PyObject *list and an onError label that is jumped to when
   creating or appending the slice fails.

   The body is wrapped in do { } while (0) so that the macro behaves
   like a single statement, and the arguments are parenthesized so
   that arbitrary expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3288
3289static
3290PyObject *split_whitespace(PyUnicodeObject *self,
3291 PyObject *list,
3292 int maxcount)
3293{
3294 register int i;
3295 register int j;
3296 int len = self->length;
3297 PyObject *str;
3298
3299 for (i = j = 0; i < len; ) {
3300 /* find a token */
3301 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3302 i++;
3303 j = i;
3304 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3305 i++;
3306 if (j < i) {
3307 if (maxcount-- <= 0)
3308 break;
3309 SPLIT_APPEND(self->str, j, i);
3310 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3311 i++;
3312 j = i;
3313 }
3314 }
3315 if (j < len) {
3316 SPLIT_APPEND(self->str, j, len);
3317 }
3318 return list;
3319
3320 onError:
3321 Py_DECREF(list);
3322 return NULL;
3323}
3324
3325PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003326 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327{
3328 register int i;
3329 register int j;
3330 int len;
3331 PyObject *list;
3332 PyObject *str;
3333 Py_UNICODE *data;
3334
3335 string = PyUnicode_FromObject(string);
3336 if (string == NULL)
3337 return NULL;
3338 data = PyUnicode_AS_UNICODE(string);
3339 len = PyUnicode_GET_SIZE(string);
3340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 list = PyList_New(0);
3342 if (!list)
3343 goto onError;
3344
3345 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003346 int eol;
3347
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 /* Find a line and append it */
3349 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3350 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351
3352 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003353 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 if (i < len) {
3355 if (data[i] == '\r' && i + 1 < len &&
3356 data[i+1] == '\n')
3357 i += 2;
3358 else
3359 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003360 if (keepends)
3361 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 }
Guido van Rossum86662912000-04-11 15:38:46 +00003363 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 j = i;
3365 }
3366 if (j < len) {
3367 SPLIT_APPEND(data, j, len);
3368 }
3369
3370 Py_DECREF(string);
3371 return list;
3372
3373 onError:
3374 Py_DECREF(list);
3375 Py_DECREF(string);
3376 return NULL;
3377}
3378
3379static
3380PyObject *split_char(PyUnicodeObject *self,
3381 PyObject *list,
3382 Py_UNICODE ch,
3383 int maxcount)
3384{
3385 register int i;
3386 register int j;
3387 int len = self->length;
3388 PyObject *str;
3389
3390 for (i = j = 0; i < len; ) {
3391 if (self->str[i] == ch) {
3392 if (maxcount-- <= 0)
3393 break;
3394 SPLIT_APPEND(self->str, j, i);
3395 i = j = i + 1;
3396 } else
3397 i++;
3398 }
3399 if (j <= len) {
3400 SPLIT_APPEND(self->str, j, len);
3401 }
3402 return list;
3403
3404 onError:
3405 Py_DECREF(list);
3406 return NULL;
3407}
3408
3409static
3410PyObject *split_substring(PyUnicodeObject *self,
3411 PyObject *list,
3412 PyUnicodeObject *substring,
3413 int maxcount)
3414{
3415 register int i;
3416 register int j;
3417 int len = self->length;
3418 int sublen = substring->length;
3419 PyObject *str;
3420
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003421 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 if (Py_UNICODE_MATCH(self, i, substring)) {
3423 if (maxcount-- <= 0)
3424 break;
3425 SPLIT_APPEND(self->str, j, i);
3426 i = j = i + sublen;
3427 } else
3428 i++;
3429 }
3430 if (j <= len) {
3431 SPLIT_APPEND(self->str, j, len);
3432 }
3433 return list;
3434
3435 onError:
3436 Py_DECREF(list);
3437 return NULL;
3438}
3439
3440#undef SPLIT_APPEND
3441
3442static
3443PyObject *split(PyUnicodeObject *self,
3444 PyUnicodeObject *substring,
3445 int maxcount)
3446{
3447 PyObject *list;
3448
3449 if (maxcount < 0)
3450 maxcount = INT_MAX;
3451
3452 list = PyList_New(0);
3453 if (!list)
3454 return NULL;
3455
3456 if (substring == NULL)
3457 return split_whitespace(self,list,maxcount);
3458
3459 else if (substring->length == 1)
3460 return split_char(self,list,substring->str[0],maxcount);
3461
3462 else if (substring->length == 0) {
3463 Py_DECREF(list);
3464 PyErr_SetString(PyExc_ValueError, "empty separator");
3465 return NULL;
3466 }
3467 else
3468 return split_substring(self,list,substring,maxcount);
3469}
3470
3471static
3472PyObject *strip(PyUnicodeObject *self,
3473 int left,
3474 int right)
3475{
3476 Py_UNICODE *p = self->str;
3477 int start = 0;
3478 int end = self->length;
3479
3480 if (left)
3481 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3482 start++;
3483
3484 if (right)
3485 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3486 end--;
3487
Tim Peters7a29bd52001-09-12 03:03:31 +00003488 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 /* couldn't strip anything off, return original string */
3490 Py_INCREF(self);
3491 return (PyObject*) self;
3492 }
3493
3494 return (PyObject*) PyUnicode_FromUnicode(
3495 self->str + start,
3496 end - start
3497 );
3498}
3499
3500static
3501PyObject *replace(PyUnicodeObject *self,
3502 PyUnicodeObject *str1,
3503 PyUnicodeObject *str2,
3504 int maxcount)
3505{
3506 PyUnicodeObject *u;
3507
3508 if (maxcount < 0)
3509 maxcount = INT_MAX;
3510
3511 if (str1->length == 1 && str2->length == 1) {
3512 int i;
3513
3514 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003515 if (!findchar(self->str, self->length, str1->str[0]) &&
3516 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 /* nothing to replace, return original string */
3518 Py_INCREF(self);
3519 u = self;
3520 } else {
3521 Py_UNICODE u1 = str1->str[0];
3522 Py_UNICODE u2 = str2->str[0];
3523
3524 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003525 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 self->length
3527 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003528 if (u != NULL) {
3529 Py_UNICODE_COPY(u->str, self->str,
3530 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 for (i = 0; i < u->length; i++)
3532 if (u->str[i] == u1) {
3533 if (--maxcount < 0)
3534 break;
3535 u->str[i] = u2;
3536 }
3537 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539
3540 } else {
3541 int n, i;
3542 Py_UNICODE *p;
3543
3544 /* replace strings */
3545 n = count(self, 0, self->length, str1);
3546 if (n > maxcount)
3547 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003548 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 /* nothing to replace, return original string */
3550 Py_INCREF(self);
3551 u = self;
3552 } else {
3553 u = _PyUnicode_New(
3554 self->length + n * (str2->length - str1->length));
3555 if (u) {
3556 i = 0;
3557 p = u->str;
3558 while (i <= self->length - str1->length)
3559 if (Py_UNICODE_MATCH(self, i, str1)) {
3560 /* replace string segment */
3561 Py_UNICODE_COPY(p, str2->str, str2->length);
3562 p += str2->length;
3563 i += str1->length;
3564 if (--n <= 0) {
3565 /* copy remaining part */
3566 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3567 break;
3568 }
3569 } else
3570 *p++ = self->str[i++];
3571 }
3572 }
3573 }
3574
3575 return (PyObject *) u;
3576}
3577
3578/* --- Unicode Object Methods --------------------------------------------- */
3579
3580static char title__doc__[] =
3581"S.title() -> unicode\n\
3582\n\
3583Return a titlecased version of S, i.e. words start with title case\n\
3584characters, all remaining cased characters have lower case.";
3585
3586static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003587unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 return fixup(self, fixtitle);
3590}
3591
3592static char capitalize__doc__[] =
3593"S.capitalize() -> unicode\n\
3594\n\
3595Return a capitalized version of S, i.e. make the first character\n\
3596have upper case.";
3597
3598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003599unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 return fixup(self, fixcapitalize);
3602}
3603
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method: split self at whitespace, capitalize each word and
   join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int n;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                               fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3641
3642static char center__doc__[] =
3643"S.center(width) -> unicode\n\
3644\n\
3645Return S centered in a Unicode string of length width. Padding is done\n\
3646using spaces.";
3647
3648static PyObject *
3649unicode_center(PyUnicodeObject *self, PyObject *args)
3650{
3651 int marg, left;
3652 int width;
3653
3654 if (!PyArg_ParseTuple(args, "i:center", &width))
3655 return NULL;
3656
Tim Peters7a29bd52001-09-12 03:03:31 +00003657 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 Py_INCREF(self);
3659 return (PyObject*) self;
3660 }
3661
3662 marg = width - self->length;
3663 left = marg / 2 + (marg & width & 1);
3664
3665 return (PyObject*) pad(self, left, marg - left, ' ');
3666}
3667
Marc-André Lemburge5034372000-08-08 08:04:29 +00003668#if 0
3669
3670/* This code should go into some future Unicode collation support
3671 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003672 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003673
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003674/* speedy UTF-16 code point order comparison */
3675/* gleaned from: */
3676/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3677
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003678static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003679{
3680 0, 0, 0, 0, 0, 0, 0, 0,
3681 0, 0, 0, 0, 0, 0, 0, 0,
3682 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003683 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003684};
3685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686static int
3687unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3688{
3689 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003690
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 Py_UNICODE *s1 = str1->str;
3692 Py_UNICODE *s2 = str2->str;
3693
3694 len1 = str1->length;
3695 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003698 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003699
3700 c1 = *s1++;
3701 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003702
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003703 if (c1 > (1<<11) * 26)
3704 c1 += utf16Fixup[c1>>11];
3705 if (c2 > (1<<11) * 26)
3706 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003707 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003708
3709 if (c1 != c2)
3710 return (c1 < c2) ? -1 : 1;
3711
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003712 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 }
3714
3715 return (len1 < len2) ? -1 : (len1 != len2);
3716}
3717
Marc-André Lemburge5034372000-08-08 08:04:29 +00003718#else
3719
3720static int
3721unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3722{
3723 register int len1, len2;
3724
3725 Py_UNICODE *s1 = str1->str;
3726 Py_UNICODE *s2 = str2->str;
3727
3728 len1 = str1->length;
3729 len2 = str2->length;
3730
3731 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003732 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003733
Fredrik Lundh45714e92001-06-26 16:39:36 +00003734 c1 = *s1++;
3735 c2 = *s2++;
3736
3737 if (c1 != c2)
3738 return (c1 < c2) ? -1 : 1;
3739
Marc-André Lemburge5034372000-08-08 08:04:29 +00003740 len1--; len2--;
3741 }
3742
3743 return (len1 < len2) ? -1 : (len1 != len2);
3744}
3745
3746#endif
3747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748int PyUnicode_Compare(PyObject *left,
3749 PyObject *right)
3750{
3751 PyUnicodeObject *u = NULL, *v = NULL;
3752 int result;
3753
3754 /* Coerce the two arguments */
3755 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3756 if (u == NULL)
3757 goto onError;
3758 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3759 if (v == NULL)
3760 goto onError;
3761
Thomas Wouters7e474022000-07-16 12:04:32 +00003762 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 if (v == u) {
3764 Py_DECREF(u);
3765 Py_DECREF(v);
3766 return 0;
3767 }
3768
3769 result = unicode_compare(u, v);
3770
3771 Py_DECREF(u);
3772 Py_DECREF(v);
3773 return result;
3774
3775onError:
3776 Py_XDECREF(u);
3777 Py_XDECREF(v);
3778 return -1;
3779}
3780
Guido van Rossum403d68b2000-03-13 15:55:09 +00003781int PyUnicode_Contains(PyObject *container,
3782 PyObject *element)
3783{
3784 PyUnicodeObject *u = NULL, *v = NULL;
3785 int result;
3786 register const Py_UNICODE *p, *e;
3787 register Py_UNICODE ch;
3788
3789 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003790 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003791 if (v == NULL) {
3792 PyErr_SetString(PyExc_TypeError,
3793 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003794 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003795 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003796 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3797 if (u == NULL) {
3798 Py_DECREF(v);
3799 goto onError;
3800 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003801
3802 /* Check v in u */
3803 if (PyUnicode_GET_SIZE(v) != 1) {
3804 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003805 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003806 goto onError;
3807 }
3808 ch = *PyUnicode_AS_UNICODE(v);
3809 p = PyUnicode_AS_UNICODE(u);
3810 e = p + PyUnicode_GET_SIZE(u);
3811 result = 0;
3812 while (p < e) {
3813 if (*p++ == ch) {
3814 result = 1;
3815 break;
3816 }
3817 }
3818
3819 Py_DECREF(u);
3820 Py_DECREF(v);
3821 return result;
3822
3823onError:
3824 Py_XDECREF(u);
3825 Py_XDECREF(v);
3826 return -1;
3827}
3828
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829/* Concat to string or Unicode object giving a new Unicode object. */
3830
3831PyObject *PyUnicode_Concat(PyObject *left,
3832 PyObject *right)
3833{
3834 PyUnicodeObject *u = NULL, *v = NULL, *w;
3835
3836 /* Coerce the two arguments */
3837 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3838 if (u == NULL)
3839 goto onError;
3840 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3841 if (v == NULL)
3842 goto onError;
3843
3844 /* Shortcuts */
3845 if (v == unicode_empty) {
3846 Py_DECREF(v);
3847 return (PyObject *)u;
3848 }
3849 if (u == unicode_empty) {
3850 Py_DECREF(u);
3851 return (PyObject *)v;
3852 }
3853
3854 /* Concat the two Unicode strings */
3855 w = _PyUnicode_New(u->length + v->length);
3856 if (w == NULL)
3857 goto onError;
3858 Py_UNICODE_COPY(w->str, u->str, u->length);
3859 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3860
3861 Py_DECREF(u);
3862 Py_DECREF(v);
3863 return (PyObject *)w;
3864
3865onError:
3866 Py_XDECREF(u);
3867 Py_XDECREF(v);
3868 return NULL;
3869}
3870
3871static char count__doc__[] =
3872"S.count(sub[, start[, end]]) -> int\n\
3873\n\
3874Return the number of occurrences of substring sub in Unicode string\n\
3875S[start:end]. Optional arguments start and end are\n\
3876interpreted as in slice notation.";
3877
3878static PyObject *
3879unicode_count(PyUnicodeObject *self, PyObject *args)
3880{
3881 PyUnicodeObject *substring;
3882 int start = 0;
3883 int end = INT_MAX;
3884 PyObject *result;
3885
Guido van Rossumb8872e62000-05-09 14:14:27 +00003886 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3887 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 return NULL;
3889
3890 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3891 (PyObject *)substring);
3892 if (substring == NULL)
3893 return NULL;
3894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 if (start < 0)
3896 start += self->length;
3897 if (start < 0)
3898 start = 0;
3899 if (end > self->length)
3900 end = self->length;
3901 if (end < 0)
3902 end += self->length;
3903 if (end < 0)
3904 end = 0;
3905
3906 result = PyInt_FromLong((long) count(self, start, end, substring));
3907
3908 Py_DECREF(substring);
3909 return result;
3910}
3911
3912static char encode__doc__[] =
3913"S.encode([encoding[,errors]]) -> string\n\
3914\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003915Return an encoded string version of S. Default encoding is the current\n\
3916default string encoding. errors may be given to set a different error\n\
3917handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3918a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919
3920static PyObject *
3921unicode_encode(PyUnicodeObject *self, PyObject *args)
3922{
3923 char *encoding = NULL;
3924 char *errors = NULL;
3925 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3926 return NULL;
3927 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3928}
3929
3930static char expandtabs__doc__[] =
3931"S.expandtabs([tabsize]) -> unicode\n\
3932\n\
3933Return a copy of S where all tab characters are expanded using spaces.\n\
3934If tabsize is not given, a tab size of 8 characters is assumed.";
3935
3936static PyObject*
3937unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3938{
3939 Py_UNICODE *e;
3940 Py_UNICODE *p;
3941 Py_UNICODE *q;
3942 int i, j;
3943 PyUnicodeObject *u;
3944 int tabsize = 8;
3945
3946 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3947 return NULL;
3948
Thomas Wouters7e474022000-07-16 12:04:32 +00003949 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 i = j = 0;
3951 e = self->str + self->length;
3952 for (p = self->str; p < e; p++)
3953 if (*p == '\t') {
3954 if (tabsize > 0)
3955 j += tabsize - (j % tabsize);
3956 }
3957 else {
3958 j++;
3959 if (*p == '\n' || *p == '\r') {
3960 i += j;
3961 j = 0;
3962 }
3963 }
3964
3965 /* Second pass: create output string and fill it */
3966 u = _PyUnicode_New(i + j);
3967 if (!u)
3968 return NULL;
3969
3970 j = 0;
3971 q = u->str;
3972
3973 for (p = self->str; p < e; p++)
3974 if (*p == '\t') {
3975 if (tabsize > 0) {
3976 i = tabsize - (j % tabsize);
3977 j += i;
3978 while (i--)
3979 *q++ = ' ';
3980 }
3981 }
3982 else {
3983 j++;
3984 *q++ = *p;
3985 if (*p == '\n' || *p == '\r')
3986 j = 0;
3987 }
3988
3989 return (PyObject*) u;
3990}
3991
3992static char find__doc__[] =
3993"S.find(sub [,start [,end]]) -> int\n\
3994\n\
3995Return the lowest index in S where substring sub is found,\n\
3996such that sub is contained within s[start,end]. Optional\n\
3997arguments start and end are interpreted as in slice notation.\n\
3998\n\
3999Return -1 on failure.";
4000
4001static PyObject *
4002unicode_find(PyUnicodeObject *self, PyObject *args)
4003{
4004 PyUnicodeObject *substring;
4005 int start = 0;
4006 int end = INT_MAX;
4007 PyObject *result;
4008
Guido van Rossumb8872e62000-05-09 14:14:27 +00004009 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4010 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 return NULL;
4012 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4013 (PyObject *)substring);
4014 if (substring == NULL)
4015 return NULL;
4016
4017 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4018
4019 Py_DECREF(substring);
4020 return result;
4021}
4022
4023static PyObject *
4024unicode_getitem(PyUnicodeObject *self, int index)
4025{
4026 if (index < 0 || index >= self->length) {
4027 PyErr_SetString(PyExc_IndexError, "string index out of range");
4028 return NULL;
4029 }
4030
4031 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4032}
4033
4034static long
4035unicode_hash(PyUnicodeObject *self)
4036{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004037 /* Since Unicode objects compare equal to their ASCII string
4038 counterparts, they should use the individual character values
4039 as basis for their hash value. This is needed to assure that
4040 strings and Unicode objects behave in the same way as
4041 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042
Fredrik Lundhdde61642000-07-10 18:27:47 +00004043 register int len;
4044 register Py_UNICODE *p;
4045 register long x;
4046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 if (self->hash != -1)
4048 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004049 len = PyUnicode_GET_SIZE(self);
4050 p = PyUnicode_AS_UNICODE(self);
4051 x = *p << 7;
4052 while (--len >= 0)
4053 x = (1000003*x) ^ *p++;
4054 x ^= PyUnicode_GET_SIZE(self);
4055 if (x == -1)
4056 x = -2;
4057 self->hash = x;
4058 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059}
4060
4061static char index__doc__[] =
4062"S.index(sub [,start [,end]]) -> int\n\
4063\n\
4064Like S.find() but raise ValueError when the substring is not found.";
4065
4066static PyObject *
4067unicode_index(PyUnicodeObject *self, PyObject *args)
4068{
4069 int result;
4070 PyUnicodeObject *substring;
4071 int start = 0;
4072 int end = INT_MAX;
4073
Guido van Rossumb8872e62000-05-09 14:14:27 +00004074 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4075 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 return NULL;
4077
4078 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4079 (PyObject *)substring);
4080 if (substring == NULL)
4081 return NULL;
4082
4083 result = findstring(self, substring, start, end, 1);
4084
4085 Py_DECREF(substring);
4086 if (result < 0) {
4087 PyErr_SetString(PyExc_ValueError, "substring not found");
4088 return NULL;
4089 }
4090 return PyInt_FromLong(result);
4091}
4092
4093static char islower__doc__[] =
4094"S.islower() -> int\n\
4095\n\
4096Return 1 if all cased characters in S are lowercase and there is\n\
4097at least one cased character in S, 0 otherwise.";
4098
4099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004100unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101{
4102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4103 register const Py_UNICODE *e;
4104 int cased;
4105
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 /* Shortcut for single character strings */
4107 if (PyUnicode_GET_SIZE(self) == 1)
4108 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004110 /* Special case for empty strings */
4111 if (PyString_GET_SIZE(self) == 0)
4112 return PyInt_FromLong(0);
4113
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 e = p + PyUnicode_GET_SIZE(self);
4115 cased = 0;
4116 for (; p < e; p++) {
4117 register const Py_UNICODE ch = *p;
4118
4119 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4120 return PyInt_FromLong(0);
4121 else if (!cased && Py_UNICODE_ISLOWER(ch))
4122 cased = 1;
4123 }
4124 return PyInt_FromLong(cased);
4125}
4126
4127static char isupper__doc__[] =
4128"S.isupper() -> int\n\
4129\n\
4130Return 1 if all cased characters in S are uppercase and there is\n\
4131at least one cased character in S, 0 otherwise.";
4132
4133static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004134unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
4136 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4137 register const Py_UNICODE *e;
4138 int cased;
4139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 /* Shortcut for single character strings */
4141 if (PyUnicode_GET_SIZE(self) == 1)
4142 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4143
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004144 /* Special case for empty strings */
4145 if (PyString_GET_SIZE(self) == 0)
4146 return PyInt_FromLong(0);
4147
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 e = p + PyUnicode_GET_SIZE(self);
4149 cased = 0;
4150 for (; p < e; p++) {
4151 register const Py_UNICODE ch = *p;
4152
4153 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4154 return PyInt_FromLong(0);
4155 else if (!cased && Py_UNICODE_ISUPPER(ch))
4156 cased = 1;
4157 }
4158 return PyInt_FromLong(cased);
4159}
4160
4161static char istitle__doc__[] =
4162"S.istitle() -> int\n\
4163\n\
4164Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4165may only follow uncased characters and lowercase characters only cased\n\
4166ones. Return 0 otherwise.";
4167
4168static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004169unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170{
4171 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4172 register const Py_UNICODE *e;
4173 int cased, previous_is_cased;
4174
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 /* Shortcut for single character strings */
4176 if (PyUnicode_GET_SIZE(self) == 1)
4177 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4178 (Py_UNICODE_ISUPPER(*p) != 0));
4179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004180 /* Special case for empty strings */
4181 if (PyString_GET_SIZE(self) == 0)
4182 return PyInt_FromLong(0);
4183
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 e = p + PyUnicode_GET_SIZE(self);
4185 cased = 0;
4186 previous_is_cased = 0;
4187 for (; p < e; p++) {
4188 register const Py_UNICODE ch = *p;
4189
4190 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4191 if (previous_is_cased)
4192 return PyInt_FromLong(0);
4193 previous_is_cased = 1;
4194 cased = 1;
4195 }
4196 else if (Py_UNICODE_ISLOWER(ch)) {
4197 if (!previous_is_cased)
4198 return PyInt_FromLong(0);
4199 previous_is_cased = 1;
4200 cased = 1;
4201 }
4202 else
4203 previous_is_cased = 0;
4204 }
4205 return PyInt_FromLong(cased);
4206}
4207
4208static char isspace__doc__[] =
4209"S.isspace() -> int\n\
4210\n\
4211Return 1 if there are only whitespace characters in S,\n\
42120 otherwise.";
4213
4214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004215unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216{
4217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4218 register const Py_UNICODE *e;
4219
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220 /* Shortcut for single character strings */
4221 if (PyUnicode_GET_SIZE(self) == 1 &&
4222 Py_UNICODE_ISSPACE(*p))
4223 return PyInt_FromLong(1);
4224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004225 /* Special case for empty strings */
4226 if (PyString_GET_SIZE(self) == 0)
4227 return PyInt_FromLong(0);
4228
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 e = p + PyUnicode_GET_SIZE(self);
4230 for (; p < e; p++) {
4231 if (!Py_UNICODE_ISSPACE(*p))
4232 return PyInt_FromLong(0);
4233 }
4234 return PyInt_FromLong(1);
4235}
4236
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004237static char isalpha__doc__[] =
4238"S.isalpha() -> int\n\
4239\n\
4240Return 1 if all characters in S are alphabetic\n\
4241and there is at least one character in S, 0 otherwise.";
4242
4243static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004244unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004245{
4246 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4247 register const Py_UNICODE *e;
4248
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004249 /* Shortcut for single character strings */
4250 if (PyUnicode_GET_SIZE(self) == 1 &&
4251 Py_UNICODE_ISALPHA(*p))
4252 return PyInt_FromLong(1);
4253
4254 /* Special case for empty strings */
4255 if (PyString_GET_SIZE(self) == 0)
4256 return PyInt_FromLong(0);
4257
4258 e = p + PyUnicode_GET_SIZE(self);
4259 for (; p < e; p++) {
4260 if (!Py_UNICODE_ISALPHA(*p))
4261 return PyInt_FromLong(0);
4262 }
4263 return PyInt_FromLong(1);
4264}
4265
4266static char isalnum__doc__[] =
4267"S.isalnum() -> int\n\
4268\n\
4269Return 1 if all characters in S are alphanumeric\n\
4270and there is at least one character in S, 0 otherwise.";
4271
4272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004273unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274{
4275 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4276 register const Py_UNICODE *e;
4277
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004278 /* Shortcut for single character strings */
4279 if (PyUnicode_GET_SIZE(self) == 1 &&
4280 Py_UNICODE_ISALNUM(*p))
4281 return PyInt_FromLong(1);
4282
4283 /* Special case for empty strings */
4284 if (PyString_GET_SIZE(self) == 0)
4285 return PyInt_FromLong(0);
4286
4287 e = p + PyUnicode_GET_SIZE(self);
4288 for (; p < e; p++) {
4289 if (!Py_UNICODE_ISALNUM(*p))
4290 return PyInt_FromLong(0);
4291 }
4292 return PyInt_FromLong(1);
4293}
4294
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295static char isdecimal__doc__[] =
4296"S.isdecimal() -> int\n\
4297\n\
4298Return 1 if there are only decimal characters in S,\n\
42990 otherwise.";
4300
4301static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004302unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303{
4304 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4305 register const Py_UNICODE *e;
4306
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 /* Shortcut for single character strings */
4308 if (PyUnicode_GET_SIZE(self) == 1 &&
4309 Py_UNICODE_ISDECIMAL(*p))
4310 return PyInt_FromLong(1);
4311
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004312 /* Special case for empty strings */
4313 if (PyString_GET_SIZE(self) == 0)
4314 return PyInt_FromLong(0);
4315
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 e = p + PyUnicode_GET_SIZE(self);
4317 for (; p < e; p++) {
4318 if (!Py_UNICODE_ISDECIMAL(*p))
4319 return PyInt_FromLong(0);
4320 }
4321 return PyInt_FromLong(1);
4322}
4323
4324static char isdigit__doc__[] =
4325"S.isdigit() -> int\n\
4326\n\
4327Return 1 if there are only digit characters in S,\n\
43280 otherwise.";
4329
4330static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004331unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332{
4333 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4334 register const Py_UNICODE *e;
4335
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 /* Shortcut for single character strings */
4337 if (PyUnicode_GET_SIZE(self) == 1 &&
4338 Py_UNICODE_ISDIGIT(*p))
4339 return PyInt_FromLong(1);
4340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004341 /* Special case for empty strings */
4342 if (PyString_GET_SIZE(self) == 0)
4343 return PyInt_FromLong(0);
4344
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 e = p + PyUnicode_GET_SIZE(self);
4346 for (; p < e; p++) {
4347 if (!Py_UNICODE_ISDIGIT(*p))
4348 return PyInt_FromLong(0);
4349 }
4350 return PyInt_FromLong(1);
4351}
4352
4353static char isnumeric__doc__[] =
4354"S.isnumeric() -> int\n\
4355\n\
4356Return 1 if there are only numeric characters in S,\n\
43570 otherwise.";
4358
4359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004360unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361{
4362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4363 register const Py_UNICODE *e;
4364
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 /* Shortcut for single character strings */
4366 if (PyUnicode_GET_SIZE(self) == 1 &&
4367 Py_UNICODE_ISNUMERIC(*p))
4368 return PyInt_FromLong(1);
4369
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004370 /* Special case for empty strings */
4371 if (PyString_GET_SIZE(self) == 0)
4372 return PyInt_FromLong(0);
4373
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 e = p + PyUnicode_GET_SIZE(self);
4375 for (; p < e; p++) {
4376 if (!Py_UNICODE_ISNUMERIC(*p))
4377 return PyInt_FromLong(0);
4378 }
4379 return PyInt_FromLong(1);
4380}
4381
4382static char join__doc__[] =
4383"S.join(sequence) -> unicode\n\
4384\n\
4385Return a string which is the concatenation of the strings in the\n\
4386sequence. The separator between elements is S.";
4387
4388static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004389unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004391 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392}
4393
4394static int
4395unicode_length(PyUnicodeObject *self)
4396{
4397 return self->length;
4398}
4399
4400static char ljust__doc__[] =
4401"S.ljust(width) -> unicode\n\
4402\n\
4403Return S left justified in a Unicode string of length width. Padding is\n\
4404done using spaces.";
4405
4406static PyObject *
4407unicode_ljust(PyUnicodeObject *self, PyObject *args)
4408{
4409 int width;
4410 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4411 return NULL;
4412
Tim Peters7a29bd52001-09-12 03:03:31 +00004413 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 Py_INCREF(self);
4415 return (PyObject*) self;
4416 }
4417
4418 return (PyObject*) pad(self, 0, width - self->length, ' ');
4419}
4420
4421static char lower__doc__[] =
4422"S.lower() -> unicode\n\
4423\n\
4424Return a copy of the string S converted to lowercase.";
4425
4426static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004427unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 return fixup(self, fixlower);
4430}
4431
4432static char lstrip__doc__[] =
4433"S.lstrip() -> unicode\n\
4434\n\
4435Return a copy of the string S with leading whitespace removed.";
4436
4437static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004438unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 return strip(self, 1, 0);
4441}
4442
4443static PyObject*
4444unicode_repeat(PyUnicodeObject *str, int len)
4445{
4446 PyUnicodeObject *u;
4447 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004448 int nchars;
4449 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450
4451 if (len < 0)
4452 len = 0;
4453
Tim Peters7a29bd52001-09-12 03:03:31 +00004454 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 /* no repeat, return original string */
4456 Py_INCREF(str);
4457 return (PyObject*) str;
4458 }
Tim Peters8f422462000-09-09 06:13:41 +00004459
4460 /* ensure # of chars needed doesn't overflow int and # of bytes
4461 * needed doesn't overflow size_t
4462 */
4463 nchars = len * str->length;
4464 if (len && nchars / len != str->length) {
4465 PyErr_SetString(PyExc_OverflowError,
4466 "repeated string is too long");
4467 return NULL;
4468 }
4469 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4470 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4471 PyErr_SetString(PyExc_OverflowError,
4472 "repeated string is too long");
4473 return NULL;
4474 }
4475 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 if (!u)
4477 return NULL;
4478
4479 p = u->str;
4480
4481 while (len-- > 0) {
4482 Py_UNICODE_COPY(p, str->str, str->length);
4483 p += str->length;
4484 }
4485
4486 return (PyObject*) u;
4487}
4488
4489PyObject *PyUnicode_Replace(PyObject *obj,
4490 PyObject *subobj,
4491 PyObject *replobj,
4492 int maxcount)
4493{
4494 PyObject *self;
4495 PyObject *str1;
4496 PyObject *str2;
4497 PyObject *result;
4498
4499 self = PyUnicode_FromObject(obj);
4500 if (self == NULL)
4501 return NULL;
4502 str1 = PyUnicode_FromObject(subobj);
4503 if (str1 == NULL) {
4504 Py_DECREF(self);
4505 return NULL;
4506 }
4507 str2 = PyUnicode_FromObject(replobj);
4508 if (str2 == NULL) {
4509 Py_DECREF(self);
4510 Py_DECREF(str1);
4511 return NULL;
4512 }
4513 result = replace((PyUnicodeObject *)self,
4514 (PyUnicodeObject *)str1,
4515 (PyUnicodeObject *)str2,
4516 maxcount);
4517 Py_DECREF(self);
4518 Py_DECREF(str1);
4519 Py_DECREF(str2);
4520 return result;
4521}
4522
4523static char replace__doc__[] =
4524"S.replace (old, new[, maxsplit]) -> unicode\n\
4525\n\
4526Return a copy of S with all occurrences of substring\n\
4527old replaced by new. If the optional argument maxsplit is\n\
4528given, only the first maxsplit occurrences are replaced.";
4529
4530static PyObject*
4531unicode_replace(PyUnicodeObject *self, PyObject *args)
4532{
4533 PyUnicodeObject *str1;
4534 PyUnicodeObject *str2;
4535 int maxcount = -1;
4536 PyObject *result;
4537
4538 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4539 return NULL;
4540 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4541 if (str1 == NULL)
4542 return NULL;
4543 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4544 if (str2 == NULL)
4545 return NULL;
4546
4547 result = replace(self, str1, str2, maxcount);
4548
4549 Py_DECREF(str1);
4550 Py_DECREF(str2);
4551 return result;
4552}
4553
4554static
4555PyObject *unicode_repr(PyObject *unicode)
4556{
4557 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4558 PyUnicode_GET_SIZE(unicode),
4559 1);
4560}
4561
4562static char rfind__doc__[] =
4563"S.rfind(sub [,start [,end]]) -> int\n\
4564\n\
4565Return the highest index in S where substring sub is found,\n\
4566such that sub is contained within s[start,end]. Optional\n\
4567arguments start and end are interpreted as in slice notation.\n\
4568\n\
4569Return -1 on failure.";
4570
4571static PyObject *
4572unicode_rfind(PyUnicodeObject *self, PyObject *args)
4573{
4574 PyUnicodeObject *substring;
4575 int start = 0;
4576 int end = INT_MAX;
4577 PyObject *result;
4578
Guido van Rossumb8872e62000-05-09 14:14:27 +00004579 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4580 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 return NULL;
4582 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4583 (PyObject *)substring);
4584 if (substring == NULL)
4585 return NULL;
4586
4587 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4588
4589 Py_DECREF(substring);
4590 return result;
4591}
4592
4593static char rindex__doc__[] =
4594"S.rindex(sub [,start [,end]]) -> int\n\
4595\n\
4596Like S.rfind() but raise ValueError when the substring is not found.";
4597
4598static PyObject *
4599unicode_rindex(PyUnicodeObject *self, PyObject *args)
4600{
4601 int result;
4602 PyUnicodeObject *substring;
4603 int start = 0;
4604 int end = INT_MAX;
4605
Guido van Rossumb8872e62000-05-09 14:14:27 +00004606 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4607 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608 return NULL;
4609 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4610 (PyObject *)substring);
4611 if (substring == NULL)
4612 return NULL;
4613
4614 result = findstring(self, substring, start, end, -1);
4615
4616 Py_DECREF(substring);
4617 if (result < 0) {
4618 PyErr_SetString(PyExc_ValueError, "substring not found");
4619 return NULL;
4620 }
4621 return PyInt_FromLong(result);
4622}
4623
4624static char rjust__doc__[] =
4625"S.rjust(width) -> unicode\n\
4626\n\
4627Return S right justified in a Unicode string of length width. Padding is\n\
4628done using spaces.";
4629
4630static PyObject *
4631unicode_rjust(PyUnicodeObject *self, PyObject *args)
4632{
4633 int width;
4634 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4635 return NULL;
4636
Tim Peters7a29bd52001-09-12 03:03:31 +00004637 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 Py_INCREF(self);
4639 return (PyObject*) self;
4640 }
4641
4642 return (PyObject*) pad(self, width - self->length, 0, ' ');
4643}
4644
4645static char rstrip__doc__[] =
4646"S.rstrip() -> unicode\n\
4647\n\
4648Return a copy of the string S with trailing whitespace removed.";
4649
4650static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004651unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 return strip(self, 0, 1);
4654}
4655
4656static PyObject*
4657unicode_slice(PyUnicodeObject *self, int start, int end)
4658{
4659 /* standard clamping */
4660 if (start < 0)
4661 start = 0;
4662 if (end < 0)
4663 end = 0;
4664 if (end > self->length)
4665 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004666 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 /* full slice, return original string */
4668 Py_INCREF(self);
4669 return (PyObject*) self;
4670 }
4671 if (start > end)
4672 start = end;
4673 /* copy slice */
4674 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4675 end - start);
4676}
4677
4678PyObject *PyUnicode_Split(PyObject *s,
4679 PyObject *sep,
4680 int maxsplit)
4681{
4682 PyObject *result;
4683
4684 s = PyUnicode_FromObject(s);
4685 if (s == NULL)
4686 return NULL;
4687 if (sep != NULL) {
4688 sep = PyUnicode_FromObject(sep);
4689 if (sep == NULL) {
4690 Py_DECREF(s);
4691 return NULL;
4692 }
4693 }
4694
4695 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4696
4697 Py_DECREF(s);
4698 Py_XDECREF(sep);
4699 return result;
4700}
4701
4702static char split__doc__[] =
4703"S.split([sep [,maxsplit]]) -> list of strings\n\
4704\n\
4705Return a list of the words in S, using sep as the\n\
4706delimiter string. If maxsplit is given, at most maxsplit\n\
4707splits are done. If sep is not specified, any whitespace string\n\
4708is a separator.";
4709
4710static PyObject*
4711unicode_split(PyUnicodeObject *self, PyObject *args)
4712{
4713 PyObject *substring = Py_None;
4714 int maxcount = -1;
4715
4716 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4717 return NULL;
4718
4719 if (substring == Py_None)
4720 return split(self, NULL, maxcount);
4721 else if (PyUnicode_Check(substring))
4722 return split(self, (PyUnicodeObject *)substring, maxcount);
4723 else
4724 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4725}
4726
4727static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004728"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729\n\
4730Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004731Line breaks are not included in the resulting list unless keepends\n\
4732is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733
4734static PyObject*
4735unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4736{
Guido van Rossum86662912000-04-11 15:38:46 +00004737 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738
Guido van Rossum86662912000-04-11 15:38:46 +00004739 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 return NULL;
4741
Guido van Rossum86662912000-04-11 15:38:46 +00004742 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743}
4744
4745static
4746PyObject *unicode_str(PyUnicodeObject *self)
4747{
Fred Drakee4315f52000-05-09 19:53:39 +00004748 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749}
4750
4751static char strip__doc__[] =
4752"S.strip() -> unicode\n\
4753\n\
4754Return a copy of S with leading and trailing whitespace removed.";
4755
4756static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004757unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 return strip(self, 1, 1);
4760}
4761
4762static char swapcase__doc__[] =
4763"S.swapcase() -> unicode\n\
4764\n\
4765Return a copy of S with uppercase characters converted to lowercase\n\
4766and vice versa.";
4767
4768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004769unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 return fixup(self, fixswapcase);
4772}
4773
4774static char translate__doc__[] =
4775"S.translate(table) -> unicode\n\
4776\n\
4777Return a copy of the string S, where all characters have been mapped\n\
4778through the given translation table, which must be a mapping of\n\
4779Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4780are left untouched. Characters mapped to None are deleted.";
4781
4782static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004783unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 return PyUnicode_TranslateCharmap(self->str,
4786 self->length,
4787 table,
4788 "ignore");
4789}
4790
4791static char upper__doc__[] =
4792"S.upper() -> unicode\n\
4793\n\
4794Return a copy of S converted to uppercase.";
4795
4796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004797unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 return fixup(self, fixupper);
4800}
4801
4802#if 0
4803static char zfill__doc__[] =
4804"S.zfill(width) -> unicode\n\
4805\n\
4806Pad a numeric string x with zeros on the left, to fill a field\n\
4807of the specified width. The string x is never truncated.";
4808
4809static PyObject *
4810unicode_zfill(PyUnicodeObject *self, PyObject *args)
4811{
4812 int fill;
4813 PyUnicodeObject *u;
4814
4815 int width;
4816 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4817 return NULL;
4818
4819 if (self->length >= width) {
4820 Py_INCREF(self);
4821 return (PyObject*) self;
4822 }
4823
4824 fill = width - self->length;
4825
4826 u = pad(self, fill, 0, '0');
4827
4828 if (u->str[fill] == '+' || u->str[fill] == '-') {
4829 /* move sign to beginning of string */
4830 u->str[0] = u->str[fill];
4831 u->str[fill] = '0';
4832 }
4833
4834 return (PyObject*) u;
4835}
4836#endif
4837
4838#if 0
4839static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004840unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 return PyInt_FromLong(unicode_freelist_size);
4843}
4844#endif
4845
4846static char startswith__doc__[] =
4847"S.startswith(prefix[, start[, end]]) -> int\n\
4848\n\
4849Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4850optional start, test S beginning at that position. With optional end, stop\n\
4851comparing S at that position.";
4852
4853static PyObject *
4854unicode_startswith(PyUnicodeObject *self,
4855 PyObject *args)
4856{
4857 PyUnicodeObject *substring;
4858 int start = 0;
4859 int end = INT_MAX;
4860 PyObject *result;
4861
Guido van Rossumb8872e62000-05-09 14:14:27 +00004862 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4863 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 return NULL;
4865 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4866 (PyObject *)substring);
4867 if (substring == NULL)
4868 return NULL;
4869
4870 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4871
4872 Py_DECREF(substring);
4873 return result;
4874}
4875
4876
4877static char endswith__doc__[] =
4878"S.endswith(suffix[, start[, end]]) -> int\n\
4879\n\
4880Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4881optional start, test S beginning at that position. With optional end, stop\n\
4882comparing S at that position.";
4883
4884static PyObject *
4885unicode_endswith(PyUnicodeObject *self,
4886 PyObject *args)
4887{
4888 PyUnicodeObject *substring;
4889 int start = 0;
4890 int end = INT_MAX;
4891 PyObject *result;
4892
Guido van Rossumb8872e62000-05-09 14:14:27 +00004893 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4894 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 return NULL;
4896 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4897 (PyObject *)substring);
4898 if (substring == NULL)
4899 return NULL;
4900
4901 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4902
4903 Py_DECREF(substring);
4904 return result;
4905}
4906
4907
4908static PyMethodDef unicode_methods[] = {
4909
4910 /* Order is according to common usage: often used methods should
4911 appear first, since lookup is done sequentially. */
4912
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004913 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4914 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4915 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4916 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4917 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4918 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4919 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4920 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4921 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4922 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4923 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4924 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4925 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4926 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4927/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4928 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4929 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4930 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4931 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4932 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4933 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4934 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4935 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4936 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4937 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4938 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4939 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4940 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4941 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4942 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4943 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4944 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4945 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4946 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4947 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004949 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4950 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951#endif
4952
4953#if 0
4954 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004955 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956#endif
4957
4958 {NULL, NULL}
4959};
4960
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961static PySequenceMethods unicode_as_sequence = {
4962 (inquiry) unicode_length, /* sq_length */
4963 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4964 (intargfunc) unicode_repeat, /* sq_repeat */
4965 (intargfunc) unicode_getitem, /* sq_item */
4966 (intintargfunc) unicode_slice, /* sq_slice */
4967 0, /* sq_ass_item */
4968 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004969 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970};
4971
4972static int
4973unicode_buffer_getreadbuf(PyUnicodeObject *self,
4974 int index,
4975 const void **ptr)
4976{
4977 if (index != 0) {
4978 PyErr_SetString(PyExc_SystemError,
4979 "accessing non-existent unicode segment");
4980 return -1;
4981 }
4982 *ptr = (void *) self->str;
4983 return PyUnicode_GET_DATA_SIZE(self);
4984}
4985
4986static int
4987unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4988 const void **ptr)
4989{
4990 PyErr_SetString(PyExc_TypeError,
4991 "cannot use unicode as modifyable buffer");
4992 return -1;
4993}
4994
4995static int
4996unicode_buffer_getsegcount(PyUnicodeObject *self,
4997 int *lenp)
4998{
4999 if (lenp)
5000 *lenp = PyUnicode_GET_DATA_SIZE(self);
5001 return 1;
5002}
5003
5004static int
5005unicode_buffer_getcharbuf(PyUnicodeObject *self,
5006 int index,
5007 const void **ptr)
5008{
5009 PyObject *str;
5010
5011 if (index != 0) {
5012 PyErr_SetString(PyExc_SystemError,
5013 "accessing non-existent unicode segment");
5014 return -1;
5015 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005016 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 if (str == NULL)
5018 return -1;
5019 *ptr = (void *) PyString_AS_STRING(str);
5020 return PyString_GET_SIZE(str);
5021}
5022
5023/* Helpers for PyUnicode_Format() */
5024
5025static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005026getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027{
5028 int argidx = *p_argidx;
5029 if (argidx < arglen) {
5030 (*p_argidx)++;
5031 if (arglen < 0)
5032 return args;
5033 else
5034 return PyTuple_GetItem(args, argidx);
5035 }
5036 PyErr_SetString(PyExc_TypeError,
5037 "not enough arguments for format string");
5038 return NULL;
5039}
5040
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5046
5047static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049{
5050 register int i;
5051 int len;
5052 va_list va;
5053 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055
5056 /* First, format the string as char array, then expand to Py_UNICODE
5057 array. */
5058 charbuffer = (char *)buffer;
5059 len = vsprintf(charbuffer, format, va);
5060 for (i = len - 1; i >= 0; i--)
5061 buffer[i] = (Py_UNICODE) charbuffer[i];
5062
5063 va_end(va);
5064 return len;
5065}
5066
5067static int
5068formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005069 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 int flags,
5071 int prec,
5072 int type,
5073 PyObject *v)
5074{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005075 /* fmt = '%#.' + `prec` + `type`
5076 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 char fmt[20];
5078 double x;
5079
5080 x = PyFloat_AsDouble(v);
5081 if (x == -1.0 && PyErr_Occurred())
5082 return -1;
5083 if (prec < 0)
5084 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5086 type = 'g';
5087 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005088 /* worst case length calc to ensure no buffer overrun:
5089 fmt = %#.<prec>g
5090 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5091 for any double rep.)
5092 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5093 If prec=0 the effective precision is 1 (the leading digit is
5094 always given), therefore increase by one to 10+prec. */
5095 if (buflen <= (size_t)10 + (size_t)prec) {
5096 PyErr_SetString(PyExc_OverflowError,
5097 "formatted float is too long (precision too long?)");
5098 return -1;
5099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 return usprintf(buf, fmt, x);
5101}
5102
Tim Peters38fd5b62000-09-21 05:43:11 +00005103static PyObject*
5104formatlong(PyObject *val, int flags, int prec, int type)
5105{
5106 char *buf;
5107 int i, len;
5108 PyObject *str; /* temporary string object. */
5109 PyUnicodeObject *result;
5110
5111 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5112 if (!str)
5113 return NULL;
5114 result = _PyUnicode_New(len);
5115 for (i = 0; i < len; i++)
5116 result->str[i] = buf[i];
5117 result->str[len] = 0;
5118 Py_DECREF(str);
5119 return (PyObject*)result;
5120}
5121
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122static int
5123formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005124 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 int flags,
5126 int prec,
5127 int type,
5128 PyObject *v)
5129{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005130 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005131 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5132 + 1 + 1 = 24*/
5133 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005135 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136
5137 x = PyInt_AsLong(v);
5138 if (x == -1 && PyErr_Occurred())
5139 return -1;
5140 if (prec < 0)
5141 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005142 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5143 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5144 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5145 PyErr_SetString(PyExc_OverflowError,
5146 "formatted integer is too long (precision too long?)");
5147 return -1;
5148 }
Tim Petersfff53252001-04-12 18:38:48 +00005149 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5150 * but we want it (for consistency with other %#x conversions, and
5151 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005152 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5153 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5154 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005155 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005156 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5157 /* Only way to know what the platform does is to try it. */
5158 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5159 if (fmt[1] != (char)type) {
5160 /* Supply our own leading 0x/0X -- needed under std C */
5161 use_native_c_format = 0;
5162 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5163 }
5164 }
5165 if (use_native_c_format)
5166 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 return usprintf(buf, fmt, x);
5168}
5169
5170static int
5171formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005172 size_t buflen,
5173 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005175 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005176 if (PyUnicode_Check(v)) {
5177 if (PyUnicode_GET_SIZE(v) != 1)
5178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005182 else if (PyString_Check(v)) {
5183 if (PyString_GET_SIZE(v) != 1)
5184 goto onError;
5185 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187
5188 else {
5189 /* Integer input truncated to a character */
5190 long x;
5191 x = PyInt_AsLong(v);
5192 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 buf[0] = (char) x;
5195 }
5196 buf[1] = '\0';
5197 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005198
5199 onError:
5200 PyErr_SetString(PyExc_TypeError,
5201 "%c requires int or char");
5202 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203}
5204
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in Py_UNICODE units, of the scratch buffer
   in which floats, ints and chars are formatted.  XXX This is a magic
   number.  The formatting routines do their own bounds checking
   against it; a better solution may be to malloc a buffer of the
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
5214
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215PyObject *PyUnicode_Format(PyObject *format,
5216 PyObject *args)
5217{
5218 Py_UNICODE *fmt, *res;
5219 int fmtcnt, rescnt, reslen, arglen, argidx;
5220 int args_owned = 0;
5221 PyUnicodeObject *result = NULL;
5222 PyObject *dict = NULL;
5223 PyObject *uformat;
5224
5225 if (format == NULL || args == NULL) {
5226 PyErr_BadInternalCall();
5227 return NULL;
5228 }
5229 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005230 if (uformat == NULL)
5231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 fmt = PyUnicode_AS_UNICODE(uformat);
5233 fmtcnt = PyUnicode_GET_SIZE(uformat);
5234
5235 reslen = rescnt = fmtcnt + 100;
5236 result = _PyUnicode_New(reslen);
5237 if (result == NULL)
5238 goto onError;
5239 res = PyUnicode_AS_UNICODE(result);
5240
5241 if (PyTuple_Check(args)) {
5242 arglen = PyTuple_Size(args);
5243 argidx = 0;
5244 }
5245 else {
5246 arglen = -1;
5247 argidx = -2;
5248 }
5249 if (args->ob_type->tp_as_mapping)
5250 dict = args;
5251
5252 while (--fmtcnt >= 0) {
5253 if (*fmt != '%') {
5254 if (--rescnt < 0) {
5255 rescnt = fmtcnt + 100;
5256 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005257 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 return NULL;
5259 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5260 --rescnt;
5261 }
5262 *res++ = *fmt++;
5263 }
5264 else {
5265 /* Got a format specifier */
5266 int flags = 0;
5267 int width = -1;
5268 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 Py_UNICODE c = '\0';
5270 Py_UNICODE fill;
5271 PyObject *v = NULL;
5272 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005273 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 Py_UNICODE sign;
5275 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005276 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
5278 fmt++;
5279 if (*fmt == '(') {
5280 Py_UNICODE *keystart;
5281 int keylen;
5282 PyObject *key;
5283 int pcount = 1;
5284
5285 if (dict == NULL) {
5286 PyErr_SetString(PyExc_TypeError,
5287 "format requires a mapping");
5288 goto onError;
5289 }
5290 ++fmt;
5291 --fmtcnt;
5292 keystart = fmt;
5293 /* Skip over balanced parentheses */
5294 while (pcount > 0 && --fmtcnt >= 0) {
5295 if (*fmt == ')')
5296 --pcount;
5297 else if (*fmt == '(')
5298 ++pcount;
5299 fmt++;
5300 }
5301 keylen = fmt - keystart - 1;
5302 if (fmtcnt < 0 || pcount > 0) {
5303 PyErr_SetString(PyExc_ValueError,
5304 "incomplete format key");
5305 goto onError;
5306 }
Fred Drakee4315f52000-05-09 19:53:39 +00005307 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 then looked up since Python uses strings to hold
5309 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005310 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 key = PyUnicode_EncodeUTF8(keystart,
5312 keylen,
5313 NULL);
5314 if (key == NULL)
5315 goto onError;
5316 if (args_owned) {
5317 Py_DECREF(args);
5318 args_owned = 0;
5319 }
5320 args = PyObject_GetItem(dict, key);
5321 Py_DECREF(key);
5322 if (args == NULL) {
5323 goto onError;
5324 }
5325 args_owned = 1;
5326 arglen = -1;
5327 argidx = -2;
5328 }
5329 while (--fmtcnt >= 0) {
5330 switch (c = *fmt++) {
5331 case '-': flags |= F_LJUST; continue;
5332 case '+': flags |= F_SIGN; continue;
5333 case ' ': flags |= F_BLANK; continue;
5334 case '#': flags |= F_ALT; continue;
5335 case '0': flags |= F_ZERO; continue;
5336 }
5337 break;
5338 }
5339 if (c == '*') {
5340 v = getnextarg(args, arglen, &argidx);
5341 if (v == NULL)
5342 goto onError;
5343 if (!PyInt_Check(v)) {
5344 PyErr_SetString(PyExc_TypeError,
5345 "* wants int");
5346 goto onError;
5347 }
5348 width = PyInt_AsLong(v);
5349 if (width < 0) {
5350 flags |= F_LJUST;
5351 width = -width;
5352 }
5353 if (--fmtcnt >= 0)
5354 c = *fmt++;
5355 }
5356 else if (c >= '0' && c <= '9') {
5357 width = c - '0';
5358 while (--fmtcnt >= 0) {
5359 c = *fmt++;
5360 if (c < '0' || c > '9')
5361 break;
5362 if ((width*10) / 10 != width) {
5363 PyErr_SetString(PyExc_ValueError,
5364 "width too big");
5365 goto onError;
5366 }
5367 width = width*10 + (c - '0');
5368 }
5369 }
5370 if (c == '.') {
5371 prec = 0;
5372 if (--fmtcnt >= 0)
5373 c = *fmt++;
5374 if (c == '*') {
5375 v = getnextarg(args, arglen, &argidx);
5376 if (v == NULL)
5377 goto onError;
5378 if (!PyInt_Check(v)) {
5379 PyErr_SetString(PyExc_TypeError,
5380 "* wants int");
5381 goto onError;
5382 }
5383 prec = PyInt_AsLong(v);
5384 if (prec < 0)
5385 prec = 0;
5386 if (--fmtcnt >= 0)
5387 c = *fmt++;
5388 }
5389 else if (c >= '0' && c <= '9') {
5390 prec = c - '0';
5391 while (--fmtcnt >= 0) {
5392 c = Py_CHARMASK(*fmt++);
5393 if (c < '0' || c > '9')
5394 break;
5395 if ((prec*10) / 10 != prec) {
5396 PyErr_SetString(PyExc_ValueError,
5397 "prec too big");
5398 goto onError;
5399 }
5400 prec = prec*10 + (c - '0');
5401 }
5402 }
5403 } /* prec */
5404 if (fmtcnt >= 0) {
5405 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 if (--fmtcnt >= 0)
5407 c = *fmt++;
5408 }
5409 }
5410 if (fmtcnt < 0) {
5411 PyErr_SetString(PyExc_ValueError,
5412 "incomplete format");
5413 goto onError;
5414 }
5415 if (c != '%') {
5416 v = getnextarg(args, arglen, &argidx);
5417 if (v == NULL)
5418 goto onError;
5419 }
5420 sign = 0;
5421 fill = ' ';
5422 switch (c) {
5423
5424 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005425 pbuf = formatbuf;
5426 /* presume that buffer length is at least 1 */
5427 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 len = 1;
5429 break;
5430
5431 case 's':
5432 case 'r':
5433 if (PyUnicode_Check(v) && c == 's') {
5434 temp = v;
5435 Py_INCREF(temp);
5436 }
5437 else {
5438 PyObject *unicode;
5439 if (c == 's')
5440 temp = PyObject_Str(v);
5441 else
5442 temp = PyObject_Repr(v);
5443 if (temp == NULL)
5444 goto onError;
5445 if (!PyString_Check(temp)) {
5446 /* XXX Note: this should never happen, since
5447 PyObject_Repr() and PyObject_Str() assure
5448 this */
5449 Py_DECREF(temp);
5450 PyErr_SetString(PyExc_TypeError,
5451 "%s argument has non-string str()");
5452 goto onError;
5453 }
Fred Drakee4315f52000-05-09 19:53:39 +00005454 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005456 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 "strict");
5458 Py_DECREF(temp);
5459 temp = unicode;
5460 if (temp == NULL)
5461 goto onError;
5462 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005463 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 len = PyUnicode_GET_SIZE(temp);
5465 if (prec >= 0 && len > prec)
5466 len = prec;
5467 break;
5468
5469 case 'i':
5470 case 'd':
5471 case 'u':
5472 case 'o':
5473 case 'x':
5474 case 'X':
5475 if (c == 'i')
5476 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005477 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005478 temp = formatlong(v, flags, prec, c);
5479 if (!temp)
5480 goto onError;
5481 pbuf = PyUnicode_AS_UNICODE(temp);
5482 len = PyUnicode_GET_SIZE(temp);
5483 /* unbounded ints can always produce
5484 a sign character! */
5485 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005487 else {
5488 pbuf = formatbuf;
5489 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5490 flags, prec, c, v);
5491 if (len < 0)
5492 goto onError;
5493 /* only d conversion is signed */
5494 sign = c == 'd';
5495 }
5496 if (flags & F_ZERO)
5497 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 break;
5499
5500 case 'e':
5501 case 'E':
5502 case 'f':
5503 case 'g':
5504 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005505 pbuf = formatbuf;
5506 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5507 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 if (len < 0)
5509 goto onError;
5510 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005511 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 fill = '0';
5513 break;
5514
5515 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005516 pbuf = formatbuf;
5517 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 if (len < 0)
5519 goto onError;
5520 break;
5521
5522 default:
5523 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005524 "unsupported format character '%c' (0x%x) "
5525 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005526 (31<=c && c<=126) ? c : '?',
5527 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 goto onError;
5529 }
5530 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005531 if (*pbuf == '-' || *pbuf == '+') {
5532 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 len--;
5534 }
5535 else if (flags & F_SIGN)
5536 sign = '+';
5537 else if (flags & F_BLANK)
5538 sign = ' ';
5539 else
5540 sign = 0;
5541 }
5542 if (width < len)
5543 width = len;
5544 if (rescnt < width + (sign != 0)) {
5545 reslen -= rescnt;
5546 rescnt = width + fmtcnt + 100;
5547 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005548 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 return NULL;
5550 res = PyUnicode_AS_UNICODE(result)
5551 + reslen - rescnt;
5552 }
5553 if (sign) {
5554 if (fill != ' ')
5555 *res++ = sign;
5556 rescnt--;
5557 if (width > len)
5558 width--;
5559 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005560 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5561 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005562 assert(pbuf[1] == c);
5563 if (fill != ' ') {
5564 *res++ = *pbuf++;
5565 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005566 }
Tim Petersfff53252001-04-12 18:38:48 +00005567 rescnt -= 2;
5568 width -= 2;
5569 if (width < 0)
5570 width = 0;
5571 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 if (width > len && !(flags & F_LJUST)) {
5574 do {
5575 --rescnt;
5576 *res++ = fill;
5577 } while (--width > len);
5578 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005579 if (fill == ' ') {
5580 if (sign)
5581 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005582 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005583 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005584 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005585 *res++ = *pbuf++;
5586 *res++ = *pbuf++;
5587 }
5588 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005589 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 res += len;
5591 rescnt -= len;
5592 while (--width >= len) {
5593 --rescnt;
5594 *res++ = ' ';
5595 }
5596 if (dict && (argidx < arglen) && c != '%') {
5597 PyErr_SetString(PyExc_TypeError,
5598 "not all arguments converted");
5599 goto onError;
5600 }
5601 Py_XDECREF(temp);
5602 } /* '%' */
5603 } /* until end */
5604 if (argidx < arglen && !dict) {
5605 PyErr_SetString(PyExc_TypeError,
5606 "not all arguments converted");
5607 goto onError;
5608 }
5609
5610 if (args_owned) {
5611 Py_DECREF(args);
5612 }
5613 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005614 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005615 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 return (PyObject *)result;
5617
5618 onError:
5619 Py_XDECREF(result);
5620 Py_DECREF(uformat);
5621 if (args_owned) {
5622 Py_DECREF(args);
5623 }
5624 return NULL;
5625}
5626
5627static PyBufferProcs unicode_as_buffer = {
5628 (getreadbufferproc) unicode_buffer_getreadbuf,
5629 (getwritebufferproc) unicode_buffer_getwritebuf,
5630 (getsegcountproc) unicode_buffer_getsegcount,
5631 (getcharbufferproc) unicode_buffer_getcharbuf,
5632};
5633
Guido van Rossume023fe02001-08-30 03:12:59 +00005634staticforward PyObject *
5635unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5636
Tim Peters6d6c1a32001-08-02 04:15:00 +00005637static PyObject *
5638unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5639{
5640 PyObject *x = NULL;
5641 static char *kwlist[] = {"string", "encoding", "errors", 0};
5642 char *encoding = NULL;
5643 char *errors = NULL;
5644
Guido van Rossume023fe02001-08-30 03:12:59 +00005645 if (type != &PyUnicode_Type)
5646 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005647 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5648 kwlist, &x, &encoding, &errors))
5649 return NULL;
5650 if (x == NULL)
5651 return (PyObject *)_PyUnicode_New(0);
5652 return PyUnicode_FromEncodedObject(x, encoding, errors);
5653}
5654
Guido van Rossume023fe02001-08-30 03:12:59 +00005655static PyObject *
5656unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5657{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005658 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005659 int n;
5660
5661 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5662 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5663 if (tmp == NULL)
5664 return NULL;
5665 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005666 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5667 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005668 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005669 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5670 if (pnew->str == NULL) {
5671 _Py_ForgetReference((PyObject *)pnew);
5672 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005673 return NULL;
5674 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005675 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5676 pnew->length = n;
5677 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005678 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005679 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005680}
5681
/* Docstring of the unicode type (tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689PyTypeObject PyUnicode_Type = {
5690 PyObject_HEAD_INIT(&PyType_Type)
5691 0, /* ob_size */
5692 "unicode", /* tp_name */
5693 sizeof(PyUnicodeObject), /* tp_size */
5694 0, /* tp_itemsize */
5695 /* Slots */
5696 (destructor)_PyUnicode_Free, /* tp_dealloc */
5697 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005698 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 0, /* tp_setattr */
5700 (cmpfunc) unicode_compare, /* tp_compare */
5701 (reprfunc) unicode_repr, /* tp_repr */
5702 0, /* tp_as_number */
5703 &unicode_as_sequence, /* tp_as_sequence */
5704 0, /* tp_as_mapping */
5705 (hashfunc) unicode_hash, /* tp_hash*/
5706 0, /* tp_call*/
5707 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005708 PyObject_GenericGetAttr, /* tp_getattro */
5709 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005711 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005712 unicode_doc, /* tp_doc */
5713 0, /* tp_traverse */
5714 0, /* tp_clear */
5715 0, /* tp_richcompare */
5716 0, /* tp_weaklistoffset */
5717 0, /* tp_iter */
5718 0, /* tp_iternext */
5719 unicode_methods, /* tp_methods */
5720 0, /* tp_members */
5721 0, /* tp_getset */
5722 0, /* tp_base */
5723 0, /* tp_dict */
5724 0, /* tp_descr_get */
5725 0, /* tp_descr_set */
5726 0, /* tp_dictoffset */
5727 0, /* tp_init */
5728 0, /* tp_alloc */
5729 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730};
5731
5732/* Initialize the Unicode implementation */
5733
Thomas Wouters78890102000-07-22 19:25:51 +00005734void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005736 int i;
5737
Fred Drakee4315f52000-05-09 19:53:39 +00005738 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005739 unicode_freelist = NULL;
5740 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005742 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005743 for (i = 0; i < 256; i++)
5744 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745}
5746
5747/* Finalize the Unicode implementation */
5748
5749void
Thomas Wouters78890102000-07-22 19:25:51 +00005750_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005752 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005753 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005755 Py_XDECREF(unicode_empty);
5756 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005757
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005758 for (i = 0; i < 256; i++) {
5759 if (unicode_latin1[i]) {
5760 Py_DECREF(unicode_latin1[i]);
5761 unicode_latin1[i] = NULL;
5762 }
5763 }
5764
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005765 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 PyUnicodeObject *v = u;
5767 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005768 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005769 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005770 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005771 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005773 unicode_freelist = NULL;
5774 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775}