blob: a252587e721eaf6567d4d72b155f245e8c95d86a [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of deallocated Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list, which saves a malloc() when
   the object is reused.  Longer buffers are released immediately.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum9475a232001-10-05 20:51:39 +0000229 if (!PyUnicode_CheckExact(unicode)) {
230 unicode->ob_type->tp_free((PyObject *)unicode);
231 return;
232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000234 /* Keep-Alive optimization */
235 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str = NULL;
238 unicode->length = 0;
239 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 if (unicode->defenc) {
241 Py_DECREF(unicode->defenc);
242 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 }
244 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 *(PyUnicodeObject **)unicode = unicode_freelist;
246 unicode_freelist = unicode;
247 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 }
249 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000251 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 }
254}
255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256int PyUnicode_Resize(PyObject **unicode,
257 int length)
258{
259 register PyUnicodeObject *v;
260
261 /* Argument checks */
262 if (unicode == NULL) {
263 PyErr_BadInternalCall();
264 return -1;
265 }
266 v = (PyUnicodeObject *)*unicode;
267 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
268 PyErr_BadInternalCall();
269 return -1;
270 }
271
272 /* Resizing unicode_empty and single character objects is not
273 possible since these are being shared. We simply return a fresh
274 copy with the same Unicode content. */
275 if (v->length != length &&
276 (v == unicode_empty || v->length == 1)) {
277 PyUnicodeObject *w = _PyUnicode_New(length);
278 if (w == NULL)
279 return -1;
280 Py_UNICODE_COPY(w->str, v->str,
281 length < v->length ? length : v->length);
282 *unicode = (PyObject *)w;
283 return 0;
284 }
285
286 /* Note that we don't have to modify *unicode for unshared Unicode
287 objects, since we can modify them in-place. */
288 return unicode_resize(v, length);
289}
290
291/* Internal API for use in unicodeobject.c only ! */
292#define _PyUnicode_Resize(unicodevar, length) \
293 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
294
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
296 int size)
297{
298 PyUnicodeObject *unicode;
299
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000300 /* If the Unicode data is known at construction time, we can apply
301 some optimizations which share commonly used objects. */
302 if (u != NULL) {
303
304 /* Optimization for empty strings */
305 if (size == 0 && unicode_empty != NULL) {
306 Py_INCREF(unicode_empty);
307 return (PyObject *)unicode_empty;
308 }
309
310 /* Single character Unicode objects in the Latin-1 range are
311 shared when using this constructor */
312 if (size == 1 && *u < 256) {
313 unicode = unicode_latin1[*u];
314 if (!unicode) {
315 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 if (!unicode)
317 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000318 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000319 unicode_latin1[*u] = unicode;
320 }
321 Py_INCREF(unicode);
322 return (PyObject *)unicode;
323 }
324 }
325
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode = _PyUnicode_New(size);
327 if (!unicode)
328 return NULL;
329
330 /* Copy the Unicode data into the new object */
331 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333
334 return (PyObject *)unicode;
335}
336
337#ifdef HAVE_WCHAR_H
338
339PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
340 int size)
341{
342 PyUnicodeObject *unicode;
343
344 if (w == NULL) {
345 PyErr_BadInternalCall();
346 return NULL;
347 }
348
349 unicode = _PyUnicode_New(size);
350 if (!unicode)
351 return NULL;
352
353 /* Copy the wchar_t data into the new object */
354#ifdef HAVE_USABLE_WCHAR_T
355 memcpy(unicode->str, w, size * sizeof(wchar_t));
356#else
357 {
358 register Py_UNICODE *u;
359 register int i;
360 u = PyUnicode_AS_UNICODE(unicode);
361 for (i = size; i >= 0; i--)
362 *u++ = *w++;
363 }
364#endif
365
366 return (PyObject *)unicode;
367}
368
369int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
370 register wchar_t *w,
371 int size)
372{
373 if (unicode == NULL) {
374 PyErr_BadInternalCall();
375 return -1;
376 }
377 if (size > PyUnicode_GET_SIZE(unicode))
378 size = PyUnicode_GET_SIZE(unicode);
379#ifdef HAVE_USABLE_WCHAR_T
380 memcpy(w, unicode->str, size * sizeof(wchar_t));
381#else
382 {
383 register Py_UNICODE *u;
384 register int i;
385 u = PyUnicode_AS_UNICODE(unicode);
386 for (i = size; i >= 0; i--)
387 *w++ = *u++;
388 }
389#endif
390
391 return size;
392}
393
394#endif
395
396PyObject *PyUnicode_FromObject(register PyObject *obj)
397{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000398 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
399}
400
401PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
402 const char *encoding,
403 const char *errors)
404{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000405 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 int owned = 0;
408 PyObject *v;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000409 int reclevel;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410
411 if (obj == NULL) {
412 PyErr_BadInternalCall();
413 return NULL;
414 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000415
416 /* Coerce object */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000417 for (reclevel = 0; reclevel < 2; reclevel++) {
418
419 if (PyUnicode_Check(obj)) {
420 if (encoding) {
421 PyErr_SetString(PyExc_TypeError,
422 "decoding Unicode is not supported");
423 goto onError;
424 }
425 if (PyUnicode_CheckExact(obj)) {
426 Py_INCREF(obj);
427 v = obj;
428 }
429 else {
430 /* For a subclass of unicode, return a true unicode object
431 with the same string value. */
432 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
433 PyUnicode_GET_SIZE(obj));
434 }
435 goto done;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000436 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000437 else if (PyString_Check(obj)) {
438 s = PyString_AS_STRING(obj);
439 len = PyString_GET_SIZE(obj);
440 break;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000441 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000442 else {
443 PyObject *w;
444
445 /* Try char buffer interface */
446 if (PyObject_AsCharBuffer(obj, &s, &len))
447 PyErr_Clear();
448 else
449 break;
450
451 /* Mimic the behaviour of str(object) if everything else
452 fails (see PyObject_Str()); this also covers instances
453 which implement __str__. */
454 if (obj->ob_type->tp_str == NULL)
455 w = PyObject_Repr(obj);
456 else
457 w = (*obj->ob_type->tp_str)(obj);
458 if (w == NULL)
459 goto onError;
460 if (owned) {
461 Py_DECREF(obj);
462 }
463 obj = w;
464 owned = 1;
Tim Peters78e0fc72001-09-11 03:07:38 +0000465 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000466 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468 if (s == NULL) {
469 PyErr_Format(PyExc_TypeError,
470 "coercing to Unicode: __str__ recursion limit exceeded "
471 "(last type: %.80s)",
472 obj->ob_type->tp_name);
473 goto onError;
474 }
475
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000476 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477 if (len == 0) {
478 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000479 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 else
482 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000483
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000484 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000485 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000487 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000488 return v;
489
490 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000491 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000492 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000493 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495}
496
497PyObject *PyUnicode_Decode(const char *s,
498 int size,
499 const char *encoding,
500 const char *errors)
501{
502 PyObject *buffer = NULL, *unicode;
503
Fred Drakee4315f52000-05-09 19:53:39 +0000504 if (encoding == NULL)
505 encoding = PyUnicode_GetDefaultEncoding();
506
507 /* Shortcuts for common default encodings */
508 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000510 else if (strcmp(encoding, "latin-1") == 0)
511 return PyUnicode_DecodeLatin1(s, size, errors);
512 else if (strcmp(encoding, "ascii") == 0)
513 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514
515 /* Decode via the codec registry */
516 buffer = PyBuffer_FromMemory((void *)s, size);
517 if (buffer == NULL)
518 goto onError;
519 unicode = PyCodec_Decode(buffer, encoding, errors);
520 if (unicode == NULL)
521 goto onError;
522 if (!PyUnicode_Check(unicode)) {
523 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000524 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525 unicode->ob_type->tp_name);
526 Py_DECREF(unicode);
527 goto onError;
528 }
529 Py_DECREF(buffer);
530 return unicode;
531
532 onError:
533 Py_XDECREF(buffer);
534 return NULL;
535}
536
537PyObject *PyUnicode_Encode(const Py_UNICODE *s,
538 int size,
539 const char *encoding,
540 const char *errors)
541{
542 PyObject *v, *unicode;
543
544 unicode = PyUnicode_FromUnicode(s, size);
545 if (unicode == NULL)
546 return NULL;
547 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
548 Py_DECREF(unicode);
549 return v;
550}
551
552PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
553 const char *encoding,
554 const char *errors)
555{
556 PyObject *v;
557
558 if (!PyUnicode_Check(unicode)) {
559 PyErr_BadArgument();
560 goto onError;
561 }
Fred Drakee4315f52000-05-09 19:53:39 +0000562
563 if (encoding == NULL)
564 encoding = PyUnicode_GetDefaultEncoding();
565
566 /* Shortcuts for common default encodings */
567 if (errors == NULL) {
568 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000569 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000570 else if (strcmp(encoding, "latin-1") == 0)
571 return PyUnicode_AsLatin1String(unicode);
572 else if (strcmp(encoding, "ascii") == 0)
573 return PyUnicode_AsASCIIString(unicode);
574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575
576 /* Encode via the codec registry */
577 v = PyCodec_Encode(unicode, encoding, errors);
578 if (v == NULL)
579 goto onError;
580 /* XXX Should we really enforce this ? */
581 if (!PyString_Check(v)) {
582 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000583 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 v->ob_type->tp_name);
585 Py_DECREF(v);
586 goto onError;
587 }
588 return v;
589
590 onError:
591 return NULL;
592}
593
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000594PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
595 const char *errors)
596{
597 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
598
599 if (v)
600 return v;
601 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
602 if (v && errors == NULL)
603 ((PyUnicodeObject *)unicode)->defenc = v;
604 return v;
605}
606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
608{
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_BadArgument();
611 goto onError;
612 }
613 return PyUnicode_AS_UNICODE(unicode);
614
615 onError:
616 return NULL;
617}
618
619int PyUnicode_GetSize(PyObject *unicode)
620{
621 if (!PyUnicode_Check(unicode)) {
622 PyErr_BadArgument();
623 goto onError;
624 }
625 return PyUnicode_GET_SIZE(unicode);
626
627 onError:
628 return -1;
629}
630
Thomas Wouters78890102000-07-22 19:25:51 +0000631const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000632{
633 return unicode_default_encoding;
634}
635
636int PyUnicode_SetDefaultEncoding(const char *encoding)
637{
638 PyObject *v;
639
640 /* Make sure the encoding is valid. As side effect, this also
641 loads the encoding into the codec registry cache. */
642 v = _PyCodec_Lookup(encoding);
643 if (v == NULL)
644 goto onError;
645 Py_DECREF(v);
646 strncpy(unicode_default_encoding,
647 encoding,
648 sizeof(unicode_default_encoding));
649 return 0;
650
651 onError:
652 return -1;
653}
654
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000655/* --- UTF-7 Codec -------------------------------------------------------- */
656
657/* see RFC2152 for details */
658
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Each entry tells whether the character is special, i.e. cannot be
   encoded directly:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
*/
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
677
/* True if character c must be base64-encoded: it is outside ASCII or
   classified as special (1) in utf7_special, or it is whitespace (2)
   or a Set O character (3) and the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 character. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  isalnum() is
   only defined for values representable as unsigned char (or EOF),
   and c may be any Py_UNICODE value, so the range is checked before
   isalnum() is called. */
#define B64CHAR(c) \
    ((c) == '+' || (c) == '/' || \
     ((unsigned long)(c) < 128UL && isalnum((int)(c))))

/* Map a base64 character back to its six bit value; the result is
   only meaningful for characters accepted by B64CHAR(). */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Write as many complete base64 characters as the bit accumulator ch
   holds (bits is the number of valid low bits), advancing out and
   decreasing bits accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64((ch) >> ((bits)-6)); \
        bits -= 6; \
    }

/* Extract as many complete 16-bit units as the bit accumulator ch
   holds and store them through out.  A unit in the range
   0xDC00-0xDFFF is reported as an error: errmsg is set and control
   jumps to the utf7Error label of the enclosing function, and the
   unit following it is dropped.  The enclosing function must
   therefore provide both errmsg and utf7Error. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \
712
713static
714int utf7_decoding_error(Py_UNICODE **dest,
715 const char *errors,
716 const char *details)
717{
718 if ((errors == NULL) ||
719 (strcmp(errors,"strict") == 0)) {
720 PyErr_Format(PyExc_UnicodeError,
721 "UTF-7 decoding error: %.400s",
722 details);
723 return -1;
724 }
725 else if (strcmp(errors,"ignore") == 0) {
726 return 0;
727 }
728 else if (strcmp(errors,"replace") == 0) {
729 if (dest != NULL) {
730 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
731 (*dest)++;
732 }
733 return 0;
734 }
735 else {
736 PyErr_Format(PyExc_ValueError,
737 "UTF-7 decoding error; unknown error handling code: %.400s",
738 errors);
739 return -1;
740 }
741}
742
743PyObject *PyUnicode_DecodeUTF7(const char *s,
744 int size,
745 const char *errors)
746{
747 const char *e;
748 PyUnicodeObject *unicode;
749 Py_UNICODE *p;
750 const char *errmsg = "";
751 int inShift = 0;
752 unsigned int bitsleft = 0;
753 unsigned long charsleft = 0;
754 int surrogate = 0;
755
756 unicode = _PyUnicode_New(size);
757 if (!unicode)
758 return NULL;
759 if (size == 0)
760 return (PyObject *)unicode;
761
762 p = unicode->str;
763 e = s + size;
764
765 while (s < e) {
766 Py_UNICODE ch = *s;
767
768 if (inShift) {
769 if ((ch == '-') || !B64CHAR(ch)) {
770 inShift = 0;
771 s++;
772
773 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
774 if (bitsleft >= 6) {
775 /* The shift sequence has a partial character in it. If
776 bitsleft < 6 then we could just classify it as padding
777 but that is not the case here */
778
779 errmsg = "partial character in shift sequence";
780 goto utf7Error;
781 }
782 /* According to RFC2152 the remaining bits should be zero. We
783 choose to signal an error/insert a replacement character
784 here so indicate the potential of a misencoded character. */
785
786 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
787 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
788 errmsg = "non-zero padding bits in shift sequence";
789 goto utf7Error;
790 }
791
792 if (ch == '-') {
793 if ((s < e) && (*(s) == '-')) {
794 *p++ = '-';
795 inShift = 1;
796 }
797 } else if (SPECIAL(ch,0,0)) {
798 errmsg = "unexpected special character";
799 goto utf7Error;
800 } else {
801 *p++ = ch;
802 }
803 } else {
804 charsleft = (charsleft << 6) | UB64(ch);
805 bitsleft += 6;
806 s++;
807 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
808 }
809 }
810 else if ( ch == '+' ) {
811 s++;
812 if (s < e && *s == '-') {
813 s++;
814 *p++ = '+';
815 } else
816 {
817 inShift = 1;
818 bitsleft = 0;
819 }
820 }
821 else if (SPECIAL(ch,0,0)) {
822 errmsg = "unexpected special character";
823 s++;
824 goto utf7Error;
825 }
826 else {
827 *p++ = ch;
828 s++;
829 }
830 continue;
831 utf7Error:
832 if (utf7_decoding_error(&p, errors, errmsg))
833 goto onError;
834 }
835
836 if (inShift) {
837 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
838 goto onError;
839 }
840
841 if (_PyUnicode_Resize(&unicode, p - unicode->str))
842 goto onError;
843
844 return (PyObject *)unicode;
845
846onError:
847 Py_DECREF(unicode);
848 return NULL;
849}
850
851
852PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
853 int size,
854 int encodeSetO,
855 int encodeWhiteSpace,
856 const char *errors)
857{
858 PyObject *v;
859 /* It might be possible to tighten this worst case */
860 unsigned int cbAllocated = 5 * size;
861 int inShift = 0;
862 int i = 0;
863 unsigned int bitsleft = 0;
864 unsigned long charsleft = 0;
865 char * out;
866 char * start;
867
868 if (size == 0)
869 return PyString_FromStringAndSize(NULL, 0);
870
871 v = PyString_FromStringAndSize(NULL, cbAllocated);
872 if (v == NULL)
873 return NULL;
874
875 start = out = PyString_AS_STRING(v);
876 for (;i < size; ++i) {
877 Py_UNICODE ch = s[i];
878
879 if (!inShift) {
880 if (ch == '+') {
881 *out++ = '+';
882 *out++ = '-';
883 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
884 charsleft = ch;
885 bitsleft = 16;
886 *out++ = '+';
887 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
888 inShift = bitsleft > 0;
889 } else {
890 *out++ = (char) ch;
891 }
892 } else {
893 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
894 *out++ = B64(charsleft << (6-bitsleft));
895 charsleft = 0;
896 bitsleft = 0;
897 /* Characters not in the BASE64 set implicitly unshift the sequence
898 so no '-' is required, except if the character is itself a '-' */
899 if (B64CHAR(ch) || ch == '-') {
900 *out++ = '-';
901 }
902 inShift = 0;
903 *out++ = (char) ch;
904 } else {
905 bitsleft += 16;
906 charsleft = (charsleft << 16) | ch;
907 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
908
909 /* If the next character is special then we dont' need to terminate
910 the shift sequence. If the next character is not a BASE64 character
911 or '-' then the shift sequence will be terminated implicitly and we
912 don't have to insert a '-'. */
913
914 if (bitsleft == 0) {
915 if (i + 1 < size) {
916 Py_UNICODE ch2 = s[i+1];
917
918 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
919
920 } else if (B64CHAR(ch2) || ch2 == '-') {
921 *out++ = '-';
922 inShift = 0;
923 } else {
924 inShift = 0;
925 }
926
927 }
928 else {
929 *out++ = '-';
930 inShift = 0;
931 }
932 }
933 }
934 }
935 }
936 if (bitsleft) {
937 *out++= B64(charsleft << (6-bitsleft) );
938 *out++ = '-';
939 }
940
941 if (_PyString_Resize(&v, out - start)) {
942 Py_DECREF(v);
943 return NULL;
944 }
945 return v;
946}
947
948#undef SPECIAL
949#undef B64
950#undef B64CHAR
951#undef UB64
952#undef ENCODE
953#undef DECODE
954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- UTF-8 Codec -------------------------------------------------------- */
956
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means the byte is an illegal prefix.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
978
979static
980int utf8_decoding_error(const char **source,
981 Py_UNICODE **dest,
982 const char *errors,
983 const char *details)
984{
985 if ((errors == NULL) ||
986 (strcmp(errors,"strict") == 0)) {
987 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000988 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 details);
990 return -1;
991 }
992 else if (strcmp(errors,"ignore") == 0) {
993 (*source)++;
994 return 0;
995 }
996 else if (strcmp(errors,"replace") == 0) {
997 (*source)++;
998 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
999 (*dest)++;
1000 return 0;
1001 }
1002 else {
1003 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001004 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 errors);
1006 return -1;
1007 }
1008}
1009
Guido van Rossumd57fd912000-03-10 22:53:23 +00001010PyObject *PyUnicode_DecodeUTF8(const char *s,
1011 int size,
1012 const char *errors)
1013{
1014 int n;
1015 const char *e;
1016 PyUnicodeObject *unicode;
1017 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001018 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019
1020 /* Note: size will always be longer than the resulting Unicode
1021 character count */
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 /* Unpack UTF-8 encoded data */
1029 p = unicode->str;
1030 e = s + size;
1031
1032 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001033 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034
1035 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001036 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 s++;
1038 continue;
1039 }
1040
1041 n = utf8_code_length[ch];
1042
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001043 if (s + n > e) {
1044 errmsg = "unexpected end of data";
1045 goto utf8Error;
1046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047
1048 switch (n) {
1049
1050 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001051 errmsg = "unexpected code byte";
1052 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
1054 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 errmsg = "internal error";
1056 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057
1058 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001059 if ((s[1] & 0xc0) != 0x80) {
1060 errmsg = "invalid data";
1061 goto utf8Error;
1062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001064 if (ch < 0x80) {
1065 errmsg = "illegal encoding";
1066 goto utf8Error;
1067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001069 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070 break;
1071
1072 case 3:
1073 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001074 (s[2] & 0xc0) != 0x80) {
1075 errmsg = "invalid data";
1076 goto utf8Error;
1077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001079 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1080 errmsg = "illegal encoding";
1081 goto utf8Error;
1082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001084 *p++ = (Py_UNICODE)ch;
1085 break;
1086
1087 case 4:
1088 if ((s[1] & 0xc0) != 0x80 ||
1089 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001090 (s[3] & 0xc0) != 0x80) {
1091 errmsg = "invalid data";
1092 goto utf8Error;
1093 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001094 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1095 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1096 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001099 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001100 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001101 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001102 errmsg = "illegal encoding";
1103 goto utf8Error;
1104 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001105#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001106 *p++ = (Py_UNICODE)ch;
1107#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001108 /* compute and append the two surrogates: */
1109
1110 /* translate from 10000..10FFFF to 0..FFFF */
1111 ch -= 0x10000;
1112
1113 /* high surrogate = top 10 bits added to D800 */
1114 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1115
1116 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001117 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001118#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 break;
1120
1121 default:
1122 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 errmsg = "unsupported Unicode code range";
1124 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 }
1126 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001127 continue;
1128
1129 utf8Error:
1130 if (utf8_decoding_error(&s, &p, errors, errmsg))
1131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 }
1133
1134 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001135 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 goto onError;
1137
1138 return (PyObject *)unicode;
1139
1140onError:
1141 Py_DECREF(unicode);
1142 return NULL;
1143}
1144
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding problem:
   NULL/"strict" raises UnicodeError with `details`, "ignore" does
   nothing, "replace" appends '?' through *dest, and any other policy
   name raises ValueError.  Returns 0 to continue, -1 on error. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors != NULL && strcmp(errors, "strict") != 0) {
        if (strcmp(errors, "ignore") == 0)
            return 0;
        if (strcmp(errors, "replace") == 0) {
            *(*dest)++ = '?';
            return 0;
        }
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }

    PyErr_Format(PyExc_UnicodeError,
                 "UTF-8 encoding error: %.400s",
                 details);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178
1179PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1180 int size,
1181 const char *errors)
1182{
1183 PyObject *v;
1184 char *p;
1185 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001186 Py_UCS4 ch2;
1187 unsigned int cbAllocated = 3 * size;
1188 unsigned int cbWritten = 0;
1189 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if (v == NULL)
1193 return NULL;
1194 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001195 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196
1197 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001198 while (i < size) {
1199 Py_UCS4 ch = s[i++];
1200 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001202 cbWritten++;
1203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 else if (ch < 0x0800) {
1205 *p++ = 0xc0 | (ch >> 6);
1206 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001207 cbWritten += 2;
1208 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001209 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001210 /* Check for high surrogate */
1211 if (0xD800 <= ch && ch <= 0xDBFF) {
1212 if (i != size) {
1213 ch2 = s[i];
1214 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1215
1216 if (cbWritten >= (cbAllocated - 4)) {
1217 /* Provide enough room for some more
1218 surrogates */
1219 cbAllocated += 4*10;
1220 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001222 }
1223
1224 /* combine the two values */
1225 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1226
1227 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001228 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001229 i++;
1230 cbWritten += 4;
1231 }
1232 }
1233 }
1234 else {
1235 *p++ = (char)(0xe0 | (ch >> 12));
1236 cbWritten += 3;
1237 }
1238 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1239 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001240 } else {
1241 *p++ = 0xf0 | (ch>>18);
1242 *p++ = 0x80 | ((ch>>12) & 0x3f);
1243 *p++ = 0x80 | ((ch>>6) & 0x3f);
1244 *p++ = 0x80 | (ch & 0x3f);
1245 cbWritten += 4;
1246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 }
1248 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001249 if (_PyString_Resize(&v, p - q))
1250 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 return v;
1252
1253 onError:
1254 Py_DECREF(v);
1255 return NULL;
1256}
1257
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1259{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 if (!PyUnicode_Check(unicode)) {
1261 PyErr_BadArgument();
1262 return NULL;
1263 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001264 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1265 PyUnicode_GET_SIZE(unicode),
1266 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267}
1268
1269/* --- UTF-16 Codec ------------------------------------------------------- */
1270
1271static
Tim Peters772747b2001-08-09 22:21:55 +00001272int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 const char *errors,
1274 const char *details)
1275{
1276 if ((errors == NULL) ||
1277 (strcmp(errors,"strict") == 0)) {
1278 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001279 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 details);
1281 return -1;
1282 }
1283 else if (strcmp(errors,"ignore") == 0) {
1284 return 0;
1285 }
1286 else if (strcmp(errors,"replace") == 0) {
1287 if (dest) {
1288 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1289 (*dest)++;
1290 }
1291 return 0;
1292 }
1293 else {
1294 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001295 "UTF-16 decoding error; "
1296 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 errors);
1298 return -1;
1299 }
1300}
1301
Tim Peters772747b2001-08-09 22:21:55 +00001302PyObject *
1303PyUnicode_DecodeUTF16(const char *s,
1304 int size,
1305 const char *errors,
1306 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307{
1308 PyUnicodeObject *unicode;
1309 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001310 const unsigned char *q, *e;
1311 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001313 /* Offsets from q for retrieving byte pairs in the right order. */
1314#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1315 int ihi = 1, ilo = 0;
1316#else
1317 int ihi = 0, ilo = 1;
1318#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001321 if (size & 1) {
1322 if (utf16_decoding_error(NULL, errors, "truncated data"))
1323 return NULL;
1324 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 }
1326
1327 /* Note: size will always be longer than the resulting Unicode
1328 character count */
1329 unicode = _PyUnicode_New(size);
1330 if (!unicode)
1331 return NULL;
1332 if (size == 0)
1333 return (PyObject *)unicode;
1334
1335 /* Unpack UTF-16 encoded data */
1336 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001337 q = (unsigned char *)s;
1338 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339
1340 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001341 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001343 /* Check for BOM marks (U+FEFF) in the input and adjust current
1344 byte order setting accordingly. In native mode, the leading BOM
1345 mark is skipped, in all other modes, it is copied to the output
1346 stream as-is (giving a ZWNBSP character). */
1347 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001348 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001349#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001350 if (bom == 0xFEFF) {
1351 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001352 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001353 }
1354 else if (bom == 0xFFFE) {
1355 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001356 bo = 1;
1357 }
1358#else
Tim Peters772747b2001-08-09 22:21:55 +00001359 if (bom == 0xFEFF) {
1360 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001361 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001362 }
1363 else if (bom == 0xFFFE) {
1364 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001365 bo = -1;
1366 }
1367#endif
1368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369
Tim Peters772747b2001-08-09 22:21:55 +00001370 if (bo == -1) {
1371 /* force LE */
1372 ihi = 1;
1373 ilo = 0;
1374 }
1375 else if (bo == 1) {
1376 /* force BE */
1377 ihi = 0;
1378 ilo = 1;
1379 }
1380
1381 while (q < e) {
1382 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1383 q += 2;
1384
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 if (ch < 0xD800 || ch > 0xDFFF) {
1386 *p++ = ch;
1387 continue;
1388 }
1389
1390 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001391 if (q >= e) {
1392 errmsg = "unexpected end of data";
1393 goto utf16Error;
1394 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001395 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001396 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1397 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001398 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001399#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001400 *p++ = ch;
1401 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001402#else
1403 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001404#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001405 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001406 }
1407 else {
1408 errmsg = "illegal UTF-16 surrogate";
1409 goto utf16Error;
1410 }
1411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001413 errmsg = "illegal encoding";
1414 /* Fall through to report the error */
1415
1416 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001417 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419 }
1420
1421 if (byteorder)
1422 *byteorder = bo;
1423
1424 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001425 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 goto onError;
1427
1428 return (PyObject *)unicode;
1429
1430onError:
1431 Py_DECREF(unicode);
1432 return NULL;
1433}
1434
Tim Peters772747b2001-08-09 22:21:55 +00001435PyObject *
1436PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1437 int size,
1438 const char *errors,
1439 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440{
1441 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001442 unsigned char *p;
1443 int i, pairs;
1444 /* Offsets from p for storing byte pairs in the right order. */
1445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1446 int ihi = 1, ilo = 0;
1447#else
1448 int ihi = 0, ilo = 1;
1449#endif
1450
1451#define STORECHAR(CH) \
1452 do { \
1453 p[ihi] = ((CH) >> 8) & 0xff; \
1454 p[ilo] = (CH) & 0xff; \
1455 p += 2; \
1456 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001458 for (i = pairs = 0; i < size; i++)
1459 if (s[i] >= 0x10000)
1460 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001462 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 if (v == NULL)
1464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465
Tim Peters772747b2001-08-09 22:21:55 +00001466 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001468 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001469 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001470 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001471
1472 if (byteorder == -1) {
1473 /* force LE */
1474 ihi = 1;
1475 ilo = 0;
1476 }
1477 else if (byteorder == 1) {
1478 /* force BE */
1479 ihi = 0;
1480 ilo = 1;
1481 }
1482
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001483 while (size-- > 0) {
1484 Py_UNICODE ch = *s++;
1485 Py_UNICODE ch2 = 0;
1486 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001487 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1488 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 }
Tim Peters772747b2001-08-09 22:21:55 +00001490 STORECHAR(ch);
1491 if (ch2)
1492 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001495#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496}
1497
1498PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1499{
1500 if (!PyUnicode_Check(unicode)) {
1501 PyErr_BadArgument();
1502 return NULL;
1503 }
1504 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 NULL,
1507 0);
1508}
1509
1510/* --- Unicode Escape Codec ----------------------------------------------- */
1511
1512static
1513int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001514 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 const char *errors,
1516 const char *details)
1517{
1518 if ((errors == NULL) ||
1519 (strcmp(errors,"strict") == 0)) {
1520 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001521 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 details);
1523 return -1;
1524 }
1525 else if (strcmp(errors,"ignore") == 0) {
1526 return 0;
1527 }
1528 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001529 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 return 0;
1531 }
1532 else {
1533 PyErr_Format(PyExc_ValueError,
1534 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001535 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536 errors);
1537 return -1;
1538 }
1539}
1540
Fredrik Lundh06d12682001-01-24 07:59:11 +00001541static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001542
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1544 int size,
1545 const char *errors)
1546{
1547 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001548 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001550 char* message;
1551 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1552
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 /* Escaped strings will always be longer than the resulting
1554 Unicode string, so we start with size here and then reduce the
1555 length after conversion to the true value. */
1556 v = _PyUnicode_New(size);
1557 if (v == NULL)
1558 goto onError;
1559 if (size == 0)
1560 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001561
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 p = buf = PyUnicode_AS_UNICODE(v);
1563 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001564
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 while (s < end) {
1566 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001567 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569
1570 /* Non-escape characters are interpreted as Unicode ordinals */
1571 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001572 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573 continue;
1574 }
1575
1576 /* \ - Escapes */
1577 s++;
1578 switch (*s++) {
1579
1580 /* \x escapes */
1581 case '\n': break;
1582 case '\\': *p++ = '\\'; break;
1583 case '\'': *p++ = '\''; break;
1584 case '\"': *p++ = '\"'; break;
1585 case 'b': *p++ = '\b'; break;
1586 case 'f': *p++ = '\014'; break; /* FF */
1587 case 't': *p++ = '\t'; break;
1588 case 'n': *p++ = '\n'; break;
1589 case 'r': *p++ = '\r'; break;
1590 case 'v': *p++ = '\013'; break; /* VT */
1591 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1592
1593 /* \OOO (octal) escapes */
1594 case '0': case '1': case '2': case '3':
1595 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001596 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001598 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001600 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001602 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 break;
1604
Fredrik Lundhccc74732001-02-18 22:13:49 +00001605 /* hex escapes */
1606 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001608 digits = 2;
1609 message = "truncated \\xXX escape";
1610 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611
Fredrik Lundhccc74732001-02-18 22:13:49 +00001612 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001614 digits = 4;
1615 message = "truncated \\uXXXX escape";
1616 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001619 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001620 digits = 8;
1621 message = "truncated \\UXXXXXXXX escape";
1622 hexescape:
1623 chr = 0;
1624 for (i = 0; i < digits; i++) {
1625 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001626 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001627 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001628 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001629 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001630 i++;
1631 break;
1632 }
1633 chr = (chr<<4) & ~0xF;
1634 if (c >= '0' && c <= '9')
1635 chr += c - '0';
1636 else if (c >= 'a' && c <= 'f')
1637 chr += 10 + c - 'a';
1638 else
1639 chr += 10 + c - 'A';
1640 }
1641 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001642 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001643 /* when we get here, chr is a 32-bit unicode character */
1644 if (chr <= 0xffff)
1645 /* UCS-2 character */
1646 *p++ = (Py_UNICODE) chr;
1647 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001648 /* UCS-4 character. Either store directly, or as
1649 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001650#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001651 *p++ = chr;
1652#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001653 chr -= 0x10000L;
1654 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001655 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001656#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 } else {
1658 if (unicodeescape_decoding_error(
1659 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001661 )
1662 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001663 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001664 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001665 break;
1666
1667 /* \N{name} */
1668 case 'N':
1669 message = "malformed \\N character escape";
1670 if (ucnhash_CAPI == NULL) {
1671 /* load the unicode data module */
1672 PyObject *m, *v;
1673 m = PyImport_ImportModule("unicodedata");
1674 if (m == NULL)
1675 goto ucnhashError;
1676 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1677 Py_DECREF(m);
1678 if (v == NULL)
1679 goto ucnhashError;
1680 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1681 Py_DECREF(v);
1682 if (ucnhash_CAPI == NULL)
1683 goto ucnhashError;
1684 }
1685 if (*s == '{') {
1686 const char *start = s+1;
1687 /* look for the closing brace */
1688 while (*s != '}' && s < end)
1689 s++;
1690 if (s > start && s < end && *s == '}') {
1691 /* found a name. look it up in the unicode database */
1692 message = "unknown Unicode character name";
1693 s++;
1694 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1695 goto store;
1696 }
1697 }
1698 if (unicodeescape_decoding_error(&s, &x, errors, message))
1699 goto onError;
1700 *p++ = x;
1701 break;
1702
1703 default:
1704 *p++ = '\\';
1705 *p++ = (unsigned char)s[-1];
1706 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 }
1708 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001709 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001710 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 return (PyObject *)v;
1712
Fredrik Lundhccc74732001-02-18 22:13:49 +00001713ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001714 PyErr_SetString(
1715 PyExc_UnicodeError,
1716 "\\N escapes not supported (can't load unicodedata module)"
1717 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001718 return NULL;
1719
Fredrik Lundhccc74732001-02-18 22:13:49 +00001720onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 Py_XDECREF(v);
1722 return NULL;
1723}
1724
1725/* Return a Unicode-Escape string version of the Unicode object.
1726
1727 If quotes is true, the string is enclosed in u"" or u'' quotes as
1728 appropriate.
1729
1730*/
1731
Barry Warsaw51ac5802000-03-20 16:36:48 +00001732static const Py_UNICODE *findchar(const Py_UNICODE *s,
1733 int size,
1734 Py_UNICODE ch);
1735
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736static
1737PyObject *unicodeescape_string(const Py_UNICODE *s,
1738 int size,
1739 int quotes)
1740{
1741 PyObject *repr;
1742 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001744 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745
1746 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1747 if (repr == NULL)
1748 return NULL;
1749
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001750 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751
1752 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 *p++ = 'u';
1754 *p++ = (findchar(s, size, '\'') &&
1755 !findchar(s, size, '"')) ? '"' : '\'';
1756 }
1757 while (size-- > 0) {
1758 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001759
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001761 if (quotes &&
1762 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 *p++ = '\\';
1764 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001765 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001767
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001768#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001769 /* Map 21-bit characters to '\U00xxxxxx' */
1770 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001771 int offset = p - PyString_AS_STRING(repr);
1772
1773 /* Resize the string if necessary */
1774 if (offset + 12 > PyString_GET_SIZE(repr)) {
1775 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1776 goto onError;
1777 p = PyString_AS_STRING(repr) + offset;
1778 }
1779
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001780 *p++ = '\\';
1781 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001782 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1783 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1784 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1785 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1786 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1787 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1788 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001789 *p++ = hexdigit[ch & 0x0000000F];
1790 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001791 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001792#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001793 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1794 else if (ch >= 0xD800 && ch < 0xDC00) {
1795 Py_UNICODE ch2;
1796 Py_UCS4 ucs;
1797
1798 ch2 = *s++;
1799 size--;
1800 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1801 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1802 *p++ = '\\';
1803 *p++ = 'U';
1804 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1805 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1806 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1807 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1808 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1809 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1810 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1811 *p++ = hexdigit[ucs & 0x0000000F];
1812 continue;
1813 }
1814 /* Fall through: isolated surrogates are copied as-is */
1815 s--;
1816 size++;
1817 }
1818
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001820 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 *p++ = '\\';
1822 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001823 *p++ = hexdigit[(ch >> 12) & 0x000F];
1824 *p++ = hexdigit[(ch >> 8) & 0x000F];
1825 *p++ = hexdigit[(ch >> 4) & 0x000F];
1826 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001828
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001829 /* Map special whitespace to '\t', \n', '\r' */
1830 else if (ch == '\t') {
1831 *p++ = '\\';
1832 *p++ = 't';
1833 }
1834 else if (ch == '\n') {
1835 *p++ = '\\';
1836 *p++ = 'n';
1837 }
1838 else if (ch == '\r') {
1839 *p++ = '\\';
1840 *p++ = 'r';
1841 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001842
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001843 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 else if (ch < ' ' || ch >= 128) {
1845 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001846 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001847 *p++ = hexdigit[(ch >> 4) & 0x000F];
1848 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001850
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 /* Copy everything else as-is */
1852 else
1853 *p++ = (char) ch;
1854 }
1855 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001856 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857
1858 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001859 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861
1862 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001863
1864 onError:
1865 Py_DECREF(repr);
1866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867}
1868
1869PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1870 int size)
1871{
1872 return unicodeescape_string(s, size, 0);
1873}
1874
1875PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1876{
1877 if (!PyUnicode_Check(unicode)) {
1878 PyErr_BadArgument();
1879 return NULL;
1880 }
1881 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1882 PyUnicode_GET_SIZE(unicode));
1883}
1884
1885/* --- Raw Unicode Escape Codec ------------------------------------------- */
1886
1887PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1888 int size,
1889 const char *errors)
1890{
1891 PyUnicodeObject *v;
1892 Py_UNICODE *p, *buf;
1893 const char *end;
1894 const char *bs;
1895
1896 /* Escaped strings will always be longer than the resulting
1897 Unicode string, so we start with size here and then reduce the
1898 length after conversion to the true value. */
1899 v = _PyUnicode_New(size);
1900 if (v == NULL)
1901 goto onError;
1902 if (size == 0)
1903 return (PyObject *)v;
1904 p = buf = PyUnicode_AS_UNICODE(v);
1905 end = s + size;
1906 while (s < end) {
1907 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001908 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 int i;
1910
1911 /* Non-escape characters are interpreted as Unicode ordinals */
1912 if (*s != '\\') {
1913 *p++ = (unsigned char)*s++;
1914 continue;
1915 }
1916
1917 /* \u-escapes are only interpreted iff the number of leading
1918 backslashes if odd */
1919 bs = s;
1920 for (;s < end;) {
1921 if (*s != '\\')
1922 break;
1923 *p++ = (unsigned char)*s++;
1924 }
1925 if (((s - bs) & 1) == 0 ||
1926 s >= end ||
1927 *s != 'u') {
1928 continue;
1929 }
1930 p--;
1931 s++;
1932
1933 /* \uXXXX with 4 hex digits */
1934 for (x = 0, i = 0; i < 4; i++) {
1935 c = (unsigned char)s[i];
1936 if (!isxdigit(c)) {
1937 if (unicodeescape_decoding_error(&s, &x, errors,
1938 "truncated \\uXXXX"))
1939 goto onError;
1940 i++;
1941 break;
1942 }
1943 x = (x<<4) & ~0xF;
1944 if (c >= '0' && c <= '9')
1945 x += c - '0';
1946 else if (c >= 'a' && c <= 'f')
1947 x += 10 + c - 'a';
1948 else
1949 x += 10 + c - 'A';
1950 }
1951 s += i;
1952 *p++ = x;
1953 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001954 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 return (PyObject *)v;
1957
1958 onError:
1959 Py_XDECREF(v);
1960 return NULL;
1961}
1962
1963PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1964 int size)
1965{
1966 PyObject *repr;
1967 char *p;
1968 char *q;
1969
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001970 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971
1972 repr = PyString_FromStringAndSize(NULL, 6 * size);
1973 if (repr == NULL)
1974 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001975 if (size == 0)
1976 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
1978 p = q = PyString_AS_STRING(repr);
1979 while (size-- > 0) {
1980 Py_UNICODE ch = *s++;
1981 /* Map 16-bit characters to '\uxxxx' */
1982 if (ch >= 256) {
1983 *p++ = '\\';
1984 *p++ = 'u';
1985 *p++ = hexdigit[(ch >> 12) & 0xf];
1986 *p++ = hexdigit[(ch >> 8) & 0xf];
1987 *p++ = hexdigit[(ch >> 4) & 0xf];
1988 *p++ = hexdigit[ch & 15];
1989 }
1990 /* Copy everything else as-is */
1991 else
1992 *p++ = (char) ch;
1993 }
1994 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001995 if (_PyString_Resize(&repr, p - q))
1996 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
1998 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001999
2000 onError:
2001 Py_DECREF(repr);
2002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003}
2004
2005PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2006{
2007 if (!PyUnicode_Check(unicode)) {
2008 PyErr_BadArgument();
2009 return NULL;
2010 }
2011 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2012 PyUnicode_GET_SIZE(unicode));
2013}
2014
2015/* --- Latin-1 Codec ------------------------------------------------------ */
2016
2017PyObject *PyUnicode_DecodeLatin1(const char *s,
2018 int size,
2019 const char *errors)
2020{
2021 PyUnicodeObject *v;
2022 Py_UNICODE *p;
2023
2024 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002025 if (size == 1 && *(unsigned char*)s < 256) {
2026 Py_UNICODE r = *(unsigned char*)s;
2027 return PyUnicode_FromUnicode(&r, 1);
2028 }
2029
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 v = _PyUnicode_New(size);
2031 if (v == NULL)
2032 goto onError;
2033 if (size == 0)
2034 return (PyObject *)v;
2035 p = PyUnicode_AS_UNICODE(v);
2036 while (size-- > 0)
2037 *p++ = (unsigned char)*s++;
2038 return (PyObject *)v;
2039
2040 onError:
2041 Py_XDECREF(v);
2042 return NULL;
2043}
2044
2045static
2046int latin1_encoding_error(const Py_UNICODE **source,
2047 char **dest,
2048 const char *errors,
2049 const char *details)
2050{
2051 if ((errors == NULL) ||
2052 (strcmp(errors,"strict") == 0)) {
2053 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002054 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 details);
2056 return -1;
2057 }
2058 else if (strcmp(errors,"ignore") == 0) {
2059 return 0;
2060 }
2061 else if (strcmp(errors,"replace") == 0) {
2062 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002063 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return 0;
2065 }
2066 else {
2067 PyErr_Format(PyExc_ValueError,
2068 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002069 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 errors);
2071 return -1;
2072 }
2073}
2074
2075PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2076 int size,
2077 const char *errors)
2078{
2079 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002080 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002081
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 repr = PyString_FromStringAndSize(NULL, size);
2083 if (repr == NULL)
2084 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002085 if (size == 0)
2086 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087
2088 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002089 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 while (size-- > 0) {
2091 Py_UNICODE ch = *p++;
2092 if (ch >= 256) {
2093 if (latin1_encoding_error(&p, &s, errors,
2094 "ordinal not in range(256)"))
2095 goto onError;
2096 }
2097 else
2098 *s++ = (char)ch;
2099 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002100 /* Resize if error handling skipped some characters */
2101 if (s - start < PyString_GET_SIZE(repr))
2102 if (_PyString_Resize(&repr, s - start))
2103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 return repr;
2105
2106 onError:
2107 Py_DECREF(repr);
2108 return NULL;
2109}
2110
2111PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2112{
2113 if (!PyUnicode_Check(unicode)) {
2114 PyErr_BadArgument();
2115 return NULL;
2116 }
2117 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2118 PyUnicode_GET_SIZE(unicode),
2119 NULL);
2120}
2121
2122/* --- 7-bit ASCII Codec -------------------------------------------------- */
2123
2124static
2125int ascii_decoding_error(const char **source,
2126 Py_UNICODE **dest,
2127 const char *errors,
2128 const char *details)
2129{
2130 if ((errors == NULL) ||
2131 (strcmp(errors,"strict") == 0)) {
2132 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002133 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 details);
2135 return -1;
2136 }
2137 else if (strcmp(errors,"ignore") == 0) {
2138 return 0;
2139 }
2140 else if (strcmp(errors,"replace") == 0) {
2141 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2142 (*dest)++;
2143 return 0;
2144 }
2145 else {
2146 PyErr_Format(PyExc_ValueError,
2147 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002148 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149 errors);
2150 return -1;
2151 }
2152}
2153
2154PyObject *PyUnicode_DecodeASCII(const char *s,
2155 int size,
2156 const char *errors)
2157{
2158 PyUnicodeObject *v;
2159 Py_UNICODE *p;
2160
2161 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002162 if (size == 1 && *(unsigned char*)s < 128) {
2163 Py_UNICODE r = *(unsigned char*)s;
2164 return PyUnicode_FromUnicode(&r, 1);
2165 }
2166
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 v = _PyUnicode_New(size);
2168 if (v == NULL)
2169 goto onError;
2170 if (size == 0)
2171 return (PyObject *)v;
2172 p = PyUnicode_AS_UNICODE(v);
2173 while (size-- > 0) {
2174 register unsigned char c;
2175
2176 c = (unsigned char)*s++;
2177 if (c < 128)
2178 *p++ = c;
2179 else if (ascii_decoding_error(&s, &p, errors,
2180 "ordinal not in range(128)"))
2181 goto onError;
2182 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002183 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002184 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002185 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return (PyObject *)v;
2187
2188 onError:
2189 Py_XDECREF(v);
2190 return NULL;
2191}
2192
2193static
2194int ascii_encoding_error(const Py_UNICODE **source,
2195 char **dest,
2196 const char *errors,
2197 const char *details)
2198{
2199 if ((errors == NULL) ||
2200 (strcmp(errors,"strict") == 0)) {
2201 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002202 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 details);
2204 return -1;
2205 }
2206 else if (strcmp(errors,"ignore") == 0) {
2207 return 0;
2208 }
2209 else if (strcmp(errors,"replace") == 0) {
2210 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002211 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 return 0;
2213 }
2214 else {
2215 PyErr_Format(PyExc_ValueError,
2216 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002217 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218 errors);
2219 return -1;
2220 }
2221}
2222
2223PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2224 int size,
2225 const char *errors)
2226{
2227 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002228 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002229
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 repr = PyString_FromStringAndSize(NULL, size);
2231 if (repr == NULL)
2232 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002233 if (size == 0)
2234 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235
2236 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002237 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 while (size-- > 0) {
2239 Py_UNICODE ch = *p++;
2240 if (ch >= 128) {
2241 if (ascii_encoding_error(&p, &s, errors,
2242 "ordinal not in range(128)"))
2243 goto onError;
2244 }
2245 else
2246 *s++ = (char)ch;
2247 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002248 /* Resize if error handling skipped some characters */
2249 if (s - start < PyString_GET_SIZE(repr))
2250 if (_PyString_Resize(&repr, s - start))
2251 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 return repr;
2253
2254 onError:
2255 Py_DECREF(repr);
2256 return NULL;
2257}
2258
2259PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2260{
2261 if (!PyUnicode_Check(unicode)) {
2262 PyErr_BadArgument();
2263 return NULL;
2264 }
2265 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2266 PyUnicode_GET_SIZE(unicode),
2267 NULL);
2268}
2269
Fredrik Lundh30831632001-06-26 15:11:00 +00002270#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002271
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002272/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002273
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002274PyObject *PyUnicode_DecodeMBCS(const char *s,
2275 int size,
2276 const char *errors)
2277{
2278 PyUnicodeObject *v;
2279 Py_UNICODE *p;
2280
2281 /* First get the size of the result */
2282 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002283 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002284 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2285
2286 v = _PyUnicode_New(usize);
2287 if (v == NULL)
2288 return NULL;
2289 if (usize == 0)
2290 return (PyObject *)v;
2291 p = PyUnicode_AS_UNICODE(v);
2292 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2293 Py_DECREF(v);
2294 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2295 }
2296
2297 return (PyObject *)v;
2298}
2299
2300PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2301 int size,
2302 const char *errors)
2303{
2304 PyObject *repr;
2305 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002306 DWORD mbcssize;
2307
2308 /* If there are no characters, bail now! */
2309 if (size==0)
2310 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002311
2312 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002313 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002314 if (mbcssize==0)
2315 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2316
2317 repr = PyString_FromStringAndSize(NULL, mbcssize);
2318 if (repr == NULL)
2319 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002320 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002321 return repr;
2322
2323 /* Do the conversion */
2324 s = PyString_AS_STRING(repr);
2325 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2326 Py_DECREF(repr);
2327 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2328 }
2329 return repr;
2330}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002331
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002332#endif /* MS_WIN32 */
2333
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334/* --- Character Mapping Codec -------------------------------------------- */
2335
2336static
2337int charmap_decoding_error(const char **source,
2338 Py_UNICODE **dest,
2339 const char *errors,
2340 const char *details)
2341{
2342 if ((errors == NULL) ||
2343 (strcmp(errors,"strict") == 0)) {
2344 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002345 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 details);
2347 return -1;
2348 }
2349 else if (strcmp(errors,"ignore") == 0) {
2350 return 0;
2351 }
2352 else if (strcmp(errors,"replace") == 0) {
2353 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2354 (*dest)++;
2355 return 0;
2356 }
2357 else {
2358 PyErr_Format(PyExc_ValueError,
2359 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002360 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361 errors);
2362 return -1;
2363 }
2364}
2365
2366PyObject *PyUnicode_DecodeCharmap(const char *s,
2367 int size,
2368 PyObject *mapping,
2369 const char *errors)
2370{
2371 PyUnicodeObject *v;
2372 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002373 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
2375 /* Default to Latin-1 */
2376 if (mapping == NULL)
2377 return PyUnicode_DecodeLatin1(s, size, errors);
2378
2379 v = _PyUnicode_New(size);
2380 if (v == NULL)
2381 goto onError;
2382 if (size == 0)
2383 return (PyObject *)v;
2384 p = PyUnicode_AS_UNICODE(v);
2385 while (size-- > 0) {
2386 unsigned char ch = *s++;
2387 PyObject *w, *x;
2388
2389 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2390 w = PyInt_FromLong((long)ch);
2391 if (w == NULL)
2392 goto onError;
2393 x = PyObject_GetItem(mapping, w);
2394 Py_DECREF(w);
2395 if (x == NULL) {
2396 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002397 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002399 x = Py_None;
2400 Py_INCREF(x);
2401 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002402 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
2404
2405 /* Apply mapping */
2406 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002407 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 if (value < 0 || value > 65535) {
2409 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002410 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 Py_DECREF(x);
2412 goto onError;
2413 }
2414 *p++ = (Py_UNICODE)value;
2415 }
2416 else if (x == Py_None) {
2417 /* undefined mapping */
2418 if (charmap_decoding_error(&s, &p, errors,
2419 "character maps to <undefined>")) {
2420 Py_DECREF(x);
2421 goto onError;
2422 }
2423 }
2424 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002425 int targetsize = PyUnicode_GET_SIZE(x);
2426
2427 if (targetsize == 1)
2428 /* 1-1 mapping */
2429 *p++ = *PyUnicode_AS_UNICODE(x);
2430
2431 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002433 if (targetsize > extrachars) {
2434 /* resize first */
2435 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2436 int needed = (targetsize - extrachars) + \
2437 (targetsize << 2);
2438 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002439 if (_PyUnicode_Resize(&v,
2440 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002441 Py_DECREF(x);
2442 goto onError;
2443 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002444 p = PyUnicode_AS_UNICODE(v) + oldpos;
2445 }
2446 Py_UNICODE_COPY(p,
2447 PyUnicode_AS_UNICODE(x),
2448 targetsize);
2449 p += targetsize;
2450 extrachars -= targetsize;
2451 }
2452 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 }
2454 else {
2455 /* wrong return value */
2456 PyErr_SetString(PyExc_TypeError,
2457 "character mapping must return integer, None or unicode");
2458 Py_DECREF(x);
2459 goto onError;
2460 }
2461 Py_DECREF(x);
2462 }
2463 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002464 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 goto onError;
2466 return (PyObject *)v;
2467
2468 onError:
2469 Py_XDECREF(v);
2470 return NULL;
2471}
2472
2473static
2474int charmap_encoding_error(const Py_UNICODE **source,
2475 char **dest,
2476 const char *errors,
2477 const char *details)
2478{
2479 if ((errors == NULL) ||
2480 (strcmp(errors,"strict") == 0)) {
2481 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002482 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 details);
2484 return -1;
2485 }
2486 else if (strcmp(errors,"ignore") == 0) {
2487 return 0;
2488 }
2489 else if (strcmp(errors,"replace") == 0) {
2490 **dest = '?';
2491 (*dest)++;
2492 return 0;
2493 }
2494 else {
2495 PyErr_Format(PyExc_ValueError,
2496 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002497 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 errors);
2499 return -1;
2500 }
2501}
2502
2503PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2504 int size,
2505 PyObject *mapping,
2506 const char *errors)
2507{
2508 PyObject *v;
2509 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002510 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511
2512 /* Default to Latin-1 */
2513 if (mapping == NULL)
2514 return PyUnicode_EncodeLatin1(p, size, errors);
2515
2516 v = PyString_FromStringAndSize(NULL, size);
2517 if (v == NULL)
2518 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002519 if (size == 0)
2520 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 s = PyString_AS_STRING(v);
2522 while (size-- > 0) {
2523 Py_UNICODE ch = *p++;
2524 PyObject *w, *x;
2525
2526 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2527 w = PyInt_FromLong((long)ch);
2528 if (w == NULL)
2529 goto onError;
2530 x = PyObject_GetItem(mapping, w);
2531 Py_DECREF(w);
2532 if (x == NULL) {
2533 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002534 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002536 x = Py_None;
2537 Py_INCREF(x);
2538 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002539 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 }
2541
2542 /* Apply mapping */
2543 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002544 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 if (value < 0 || value > 255) {
2546 PyErr_SetString(PyExc_TypeError,
2547 "character mapping must be in range(256)");
2548 Py_DECREF(x);
2549 goto onError;
2550 }
2551 *s++ = (char)value;
2552 }
2553 else if (x == Py_None) {
2554 /* undefined mapping */
2555 if (charmap_encoding_error(&p, &s, errors,
2556 "character maps to <undefined>")) {
2557 Py_DECREF(x);
2558 goto onError;
2559 }
2560 }
2561 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002562 int targetsize = PyString_GET_SIZE(x);
2563
2564 if (targetsize == 1)
2565 /* 1-1 mapping */
2566 *s++ = *PyString_AS_STRING(x);
2567
2568 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002570 if (targetsize > extrachars) {
2571 /* resize first */
2572 int oldpos = (int)(s - PyString_AS_STRING(v));
2573 int needed = (targetsize - extrachars) + \
2574 (targetsize << 2);
2575 extrachars += needed;
2576 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002577 Py_DECREF(x);
2578 goto onError;
2579 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002580 s = PyString_AS_STRING(v) + oldpos;
2581 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002582 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002583 s += targetsize;
2584 extrachars -= targetsize;
2585 }
2586 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 }
2588 else {
2589 /* wrong return value */
2590 PyErr_SetString(PyExc_TypeError,
2591 "character mapping must return integer, None or unicode");
2592 Py_DECREF(x);
2593 goto onError;
2594 }
2595 Py_DECREF(x);
2596 }
2597 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2598 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2599 goto onError;
2600 return v;
2601
2602 onError:
2603 Py_DECREF(v);
2604 return NULL;
2605}
2606
2607PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2608 PyObject *mapping)
2609{
2610 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2611 PyErr_BadArgument();
2612 return NULL;
2613 }
2614 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2615 PyUnicode_GET_SIZE(unicode),
2616 mapping,
2617 NULL);
2618}
2619
2620static
2621int translate_error(const Py_UNICODE **source,
2622 Py_UNICODE **dest,
2623 const char *errors,
2624 const char *details)
2625{
2626 if ((errors == NULL) ||
2627 (strcmp(errors,"strict") == 0)) {
2628 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002629 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 details);
2631 return -1;
2632 }
2633 else if (strcmp(errors,"ignore") == 0) {
2634 return 0;
2635 }
2636 else if (strcmp(errors,"replace") == 0) {
2637 **dest = '?';
2638 (*dest)++;
2639 return 0;
2640 }
2641 else {
2642 PyErr_Format(PyExc_ValueError,
2643 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002644 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 errors);
2646 return -1;
2647 }
2648}
2649
2650PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2651 int size,
2652 PyObject *mapping,
2653 const char *errors)
2654{
2655 PyUnicodeObject *v;
2656 Py_UNICODE *p;
2657
2658 if (mapping == NULL) {
2659 PyErr_BadArgument();
2660 return NULL;
2661 }
2662
2663 /* Output will never be longer than input */
2664 v = _PyUnicode_New(size);
2665 if (v == NULL)
2666 goto onError;
2667 if (size == 0)
2668 goto done;
2669 p = PyUnicode_AS_UNICODE(v);
2670 while (size-- > 0) {
2671 Py_UNICODE ch = *s++;
2672 PyObject *w, *x;
2673
2674 /* Get mapping */
2675 w = PyInt_FromLong(ch);
2676 if (w == NULL)
2677 goto onError;
2678 x = PyObject_GetItem(mapping, w);
2679 Py_DECREF(w);
2680 if (x == NULL) {
2681 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2682 /* No mapping found: default to 1-1 mapping */
2683 PyErr_Clear();
2684 *p++ = ch;
2685 continue;
2686 }
2687 goto onError;
2688 }
2689
2690 /* Apply mapping */
2691 if (PyInt_Check(x))
2692 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2693 else if (x == Py_None) {
2694 /* undefined mapping */
2695 if (translate_error(&s, &p, errors,
2696 "character maps to <undefined>")) {
2697 Py_DECREF(x);
2698 goto onError;
2699 }
2700 }
2701 else if (PyUnicode_Check(x)) {
2702 if (PyUnicode_GET_SIZE(x) != 1) {
2703 /* 1-n mapping */
2704 PyErr_SetString(PyExc_NotImplementedError,
2705 "1-n mappings are currently not implemented");
2706 Py_DECREF(x);
2707 goto onError;
2708 }
2709 *p++ = *PyUnicode_AS_UNICODE(x);
2710 }
2711 else {
2712 /* wrong return value */
2713 PyErr_SetString(PyExc_TypeError,
2714 "translate mapping must return integer, None or unicode");
2715 Py_DECREF(x);
2716 goto onError;
2717 }
2718 Py_DECREF(x);
2719 }
2720 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002721 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723
2724 done:
2725 return (PyObject *)v;
2726
2727 onError:
2728 Py_XDECREF(v);
2729 return NULL;
2730}
2731
2732PyObject *PyUnicode_Translate(PyObject *str,
2733 PyObject *mapping,
2734 const char *errors)
2735{
2736 PyObject *result;
2737
2738 str = PyUnicode_FromObject(str);
2739 if (str == NULL)
2740 goto onError;
2741 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2742 PyUnicode_GET_SIZE(str),
2743 mapping,
2744 errors);
2745 Py_DECREF(str);
2746 return result;
2747
2748 onError:
2749 Py_XDECREF(str);
2750 return NULL;
2751}
2752
Guido van Rossum9e896b32000-04-05 20:11:21 +00002753/* --- Decimal Encoder ---------------------------------------------------- */
2754
2755int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2756 int length,
2757 char *output,
2758 const char *errors)
2759{
2760 Py_UNICODE *p, *end;
2761
2762 if (output == NULL) {
2763 PyErr_BadArgument();
2764 return -1;
2765 }
2766
2767 p = s;
2768 end = s + length;
2769 while (p < end) {
2770 register Py_UNICODE ch = *p++;
2771 int decimal;
2772
2773 if (Py_UNICODE_ISSPACE(ch)) {
2774 *output++ = ' ';
2775 continue;
2776 }
2777 decimal = Py_UNICODE_TODECIMAL(ch);
2778 if (decimal >= 0) {
2779 *output++ = '0' + decimal;
2780 continue;
2781 }
Guido van Rossumba477042000-04-06 18:18:10 +00002782 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002783 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002784 continue;
2785 }
2786 /* All other characters are considered invalid */
2787 if (errors == NULL || strcmp(errors, "strict") == 0) {
2788 PyErr_SetString(PyExc_ValueError,
2789 "invalid decimal Unicode string");
2790 goto onError;
2791 }
2792 else if (strcmp(errors, "ignore") == 0)
2793 continue;
2794 else if (strcmp(errors, "replace") == 0) {
2795 *output++ = '?';
2796 continue;
2797 }
2798 }
2799 /* 0-terminate the output string */
2800 *output++ = '\0';
2801 return 0;
2802
2803 onError:
2804 return -1;
2805}
2806
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807/* --- Helpers ------------------------------------------------------------ */
2808
2809static
2810int count(PyUnicodeObject *self,
2811 int start,
2812 int end,
2813 PyUnicodeObject *substring)
2814{
2815 int count = 0;
2816
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002817 if (start < 0)
2818 start += self->length;
2819 if (start < 0)
2820 start = 0;
2821 if (end > self->length)
2822 end = self->length;
2823 if (end < 0)
2824 end += self->length;
2825 if (end < 0)
2826 end = 0;
2827
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002828 if (substring->length == 0)
2829 return (end - start + 1);
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 end -= substring->length;
2832
2833 while (start <= end)
2834 if (Py_UNICODE_MATCH(self, start, substring)) {
2835 count++;
2836 start += substring->length;
2837 } else
2838 start++;
2839
2840 return count;
2841}
2842
2843int PyUnicode_Count(PyObject *str,
2844 PyObject *substr,
2845 int start,
2846 int end)
2847{
2848 int result;
2849
2850 str = PyUnicode_FromObject(str);
2851 if (str == NULL)
2852 return -1;
2853 substr = PyUnicode_FromObject(substr);
2854 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002855 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 return -1;
2857 }
2858
2859 result = count((PyUnicodeObject *)str,
2860 start, end,
2861 (PyUnicodeObject *)substr);
2862
2863 Py_DECREF(str);
2864 Py_DECREF(substr);
2865 return result;
2866}
2867
2868static
2869int findstring(PyUnicodeObject *self,
2870 PyUnicodeObject *substring,
2871 int start,
2872 int end,
2873 int direction)
2874{
2875 if (start < 0)
2876 start += self->length;
2877 if (start < 0)
2878 start = 0;
2879
2880 if (substring->length == 0)
2881 return start;
2882
2883 if (end > self->length)
2884 end = self->length;
2885 if (end < 0)
2886 end += self->length;
2887 if (end < 0)
2888 end = 0;
2889
2890 end -= substring->length;
2891
2892 if (direction < 0) {
2893 for (; end >= start; end--)
2894 if (Py_UNICODE_MATCH(self, end, substring))
2895 return end;
2896 } else {
2897 for (; start <= end; start++)
2898 if (Py_UNICODE_MATCH(self, start, substring))
2899 return start;
2900 }
2901
2902 return -1;
2903}
2904
2905int PyUnicode_Find(PyObject *str,
2906 PyObject *substr,
2907 int start,
2908 int end,
2909 int direction)
2910{
2911 int result;
2912
2913 str = PyUnicode_FromObject(str);
2914 if (str == NULL)
2915 return -1;
2916 substr = PyUnicode_FromObject(substr);
2917 if (substr == NULL) {
2918 Py_DECREF(substr);
2919 return -1;
2920 }
2921
2922 result = findstring((PyUnicodeObject *)str,
2923 (PyUnicodeObject *)substr,
2924 start, end, direction);
2925 Py_DECREF(str);
2926 Py_DECREF(substr);
2927 return result;
2928}
2929
2930static
2931int tailmatch(PyUnicodeObject *self,
2932 PyUnicodeObject *substring,
2933 int start,
2934 int end,
2935 int direction)
2936{
2937 if (start < 0)
2938 start += self->length;
2939 if (start < 0)
2940 start = 0;
2941
2942 if (substring->length == 0)
2943 return 1;
2944
2945 if (end > self->length)
2946 end = self->length;
2947 if (end < 0)
2948 end += self->length;
2949 if (end < 0)
2950 end = 0;
2951
2952 end -= substring->length;
2953 if (end < start)
2954 return 0;
2955
2956 if (direction > 0) {
2957 if (Py_UNICODE_MATCH(self, end, substring))
2958 return 1;
2959 } else {
2960 if (Py_UNICODE_MATCH(self, start, substring))
2961 return 1;
2962 }
2963
2964 return 0;
2965}
2966
2967int PyUnicode_Tailmatch(PyObject *str,
2968 PyObject *substr,
2969 int start,
2970 int end,
2971 int direction)
2972{
2973 int result;
2974
2975 str = PyUnicode_FromObject(str);
2976 if (str == NULL)
2977 return -1;
2978 substr = PyUnicode_FromObject(substr);
2979 if (substr == NULL) {
2980 Py_DECREF(substr);
2981 return -1;
2982 }
2983
2984 result = tailmatch((PyUnicodeObject *)str,
2985 (PyUnicodeObject *)substr,
2986 start, end, direction);
2987 Py_DECREF(str);
2988 Py_DECREF(substr);
2989 return result;
2990}
2991
2992static
2993const Py_UNICODE *findchar(const Py_UNICODE *s,
2994 int size,
2995 Py_UNICODE ch)
2996{
2997 /* like wcschr, but doesn't stop at NULL characters */
2998
2999 while (size-- > 0) {
3000 if (*s == ch)
3001 return s;
3002 s++;
3003 }
3004
3005 return NULL;
3006}
3007
3008/* Apply fixfct filter to the Unicode object self and return a
3009 reference to the modified object */
3010
3011static
3012PyObject *fixup(PyUnicodeObject *self,
3013 int (*fixfct)(PyUnicodeObject *s))
3014{
3015
3016 PyUnicodeObject *u;
3017
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003018 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 if (u == NULL)
3020 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003021
3022 Py_UNICODE_COPY(u->str, self->str, self->length);
3023
Tim Peters7a29bd52001-09-12 03:03:31 +00003024 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 /* fixfct should return TRUE if it modified the buffer. If
3026 FALSE, return a reference to the original buffer instead
3027 (to save space, not time) */
3028 Py_INCREF(self);
3029 Py_DECREF(u);
3030 return (PyObject*) self;
3031 }
3032 return (PyObject*) u;
3033}
3034
3035static
3036int fixupper(PyUnicodeObject *self)
3037{
3038 int len = self->length;
3039 Py_UNICODE *s = self->str;
3040 int status = 0;
3041
3042 while (len-- > 0) {
3043 register Py_UNICODE ch;
3044
3045 ch = Py_UNICODE_TOUPPER(*s);
3046 if (ch != *s) {
3047 status = 1;
3048 *s = ch;
3049 }
3050 s++;
3051 }
3052
3053 return status;
3054}
3055
3056static
3057int fixlower(PyUnicodeObject *self)
3058{
3059 int len = self->length;
3060 Py_UNICODE *s = self->str;
3061 int status = 0;
3062
3063 while (len-- > 0) {
3064 register Py_UNICODE ch;
3065
3066 ch = Py_UNICODE_TOLOWER(*s);
3067 if (ch != *s) {
3068 status = 1;
3069 *s = ch;
3070 }
3071 s++;
3072 }
3073
3074 return status;
3075}
3076
3077static
3078int fixswapcase(PyUnicodeObject *self)
3079{
3080 int len = self->length;
3081 Py_UNICODE *s = self->str;
3082 int status = 0;
3083
3084 while (len-- > 0) {
3085 if (Py_UNICODE_ISUPPER(*s)) {
3086 *s = Py_UNICODE_TOLOWER(*s);
3087 status = 1;
3088 } else if (Py_UNICODE_ISLOWER(*s)) {
3089 *s = Py_UNICODE_TOUPPER(*s);
3090 status = 1;
3091 }
3092 s++;
3093 }
3094
3095 return status;
3096}
3097
3098static
3099int fixcapitalize(PyUnicodeObject *self)
3100{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003101 int len = self->length;
3102 Py_UNICODE *s = self->str;
3103 int status = 0;
3104
3105 if (len == 0)
3106 return 0;
3107 if (Py_UNICODE_ISLOWER(*s)) {
3108 *s = Py_UNICODE_TOUPPER(*s);
3109 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003111 s++;
3112 while (--len > 0) {
3113 if (Py_UNICODE_ISUPPER(*s)) {
3114 *s = Py_UNICODE_TOLOWER(*s);
3115 status = 1;
3116 }
3117 s++;
3118 }
3119 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120}
3121
3122static
3123int fixtitle(PyUnicodeObject *self)
3124{
3125 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3126 register Py_UNICODE *e;
3127 int previous_is_cased;
3128
3129 /* Shortcut for single character strings */
3130 if (PyUnicode_GET_SIZE(self) == 1) {
3131 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3132 if (*p != ch) {
3133 *p = ch;
3134 return 1;
3135 }
3136 else
3137 return 0;
3138 }
3139
3140 e = p + PyUnicode_GET_SIZE(self);
3141 previous_is_cased = 0;
3142 for (; p < e; p++) {
3143 register const Py_UNICODE ch = *p;
3144
3145 if (previous_is_cased)
3146 *p = Py_UNICODE_TOLOWER(ch);
3147 else
3148 *p = Py_UNICODE_TOTITLE(ch);
3149
3150 if (Py_UNICODE_ISLOWER(ch) ||
3151 Py_UNICODE_ISUPPER(ch) ||
3152 Py_UNICODE_ISTITLE(ch))
3153 previous_is_cased = 1;
3154 else
3155 previous_is_cased = 0;
3156 }
3157 return 1;
3158}
3159
3160PyObject *PyUnicode_Join(PyObject *separator,
3161 PyObject *seq)
3162{
3163 Py_UNICODE *sep;
3164 int seplen;
3165 PyUnicodeObject *res = NULL;
3166 int reslen = 0;
3167 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 int sz = 100;
3169 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003170 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
Tim Peters2cfe3682001-05-05 05:36:48 +00003172 it = PyObject_GetIter(seq);
3173 if (it == NULL)
3174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
3176 if (separator == NULL) {
3177 Py_UNICODE blank = ' ';
3178 sep = &blank;
3179 seplen = 1;
3180 }
3181 else {
3182 separator = PyUnicode_FromObject(separator);
3183 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003184 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 sep = PyUnicode_AS_UNICODE(separator);
3186 seplen = PyUnicode_GET_SIZE(separator);
3187 }
3188
3189 res = _PyUnicode_New(sz);
3190 if (res == NULL)
3191 goto onError;
3192 p = PyUnicode_AS_UNICODE(res);
3193 reslen = 0;
3194
Tim Peters2cfe3682001-05-05 05:36:48 +00003195 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003197 PyObject *item = PyIter_Next(it);
3198 if (item == NULL) {
3199 if (PyErr_Occurred())
3200 goto onError;
3201 break;
3202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 if (!PyUnicode_Check(item)) {
3204 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003205 if (!PyString_Check(item)) {
3206 PyErr_Format(PyExc_TypeError,
3207 "sequence item %i: expected string or Unicode,"
3208 " %.80s found",
3209 i, item->ob_type->tp_name);
3210 Py_DECREF(item);
3211 goto onError;
3212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 v = PyUnicode_FromObject(item);
3214 Py_DECREF(item);
3215 item = v;
3216 if (item == NULL)
3217 goto onError;
3218 }
3219 itemlen = PyUnicode_GET_SIZE(item);
3220 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003221 if (_PyUnicode_Resize(&res, sz*2)) {
3222 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 sz *= 2;
3226 p = PyUnicode_AS_UNICODE(res) + reslen;
3227 }
3228 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003229 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 p += seplen;
3231 reslen += seplen;
3232 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003233 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 p += itemlen;
3235 reslen += itemlen;
3236 Py_DECREF(item);
3237 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003238 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 goto onError;
3240
3241 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003242 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 return (PyObject *)res;
3244
3245 onError:
3246 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003247 Py_XDECREF(res);
3248 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 return NULL;
3250}
3251
3252static
3253PyUnicodeObject *pad(PyUnicodeObject *self,
3254 int left,
3255 int right,
3256 Py_UNICODE fill)
3257{
3258 PyUnicodeObject *u;
3259
3260 if (left < 0)
3261 left = 0;
3262 if (right < 0)
3263 right = 0;
3264
Tim Peters7a29bd52001-09-12 03:03:31 +00003265 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 Py_INCREF(self);
3267 return self;
3268 }
3269
3270 u = _PyUnicode_New(left + self->length + right);
3271 if (u) {
3272 if (left)
3273 Py_UNICODE_FILL(u->str, fill, left);
3274 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3275 if (right)
3276 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3277 }
3278
3279 return u;
3280}
3281
/* Append the slice data[left:right] to list as a new Unicode object.

   The expanding function has to provide a PyObject *str scratch
   variable, a list object named list and an onError label; the label
   is jumped to if creating or appending the slice fails.

   The body is wrapped in do { } while (0) so that the macro behaves
   like a single statement; use it as SPLIT_APPEND(...);  Note that
   left is evaluated twice. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3292
3293static
3294PyObject *split_whitespace(PyUnicodeObject *self,
3295 PyObject *list,
3296 int maxcount)
3297{
3298 register int i;
3299 register int j;
3300 int len = self->length;
3301 PyObject *str;
3302
3303 for (i = j = 0; i < len; ) {
3304 /* find a token */
3305 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3306 i++;
3307 j = i;
3308 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3309 i++;
3310 if (j < i) {
3311 if (maxcount-- <= 0)
3312 break;
3313 SPLIT_APPEND(self->str, j, i);
3314 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3315 i++;
3316 j = i;
3317 }
3318 }
3319 if (j < len) {
3320 SPLIT_APPEND(self->str, j, len);
3321 }
3322 return list;
3323
3324 onError:
3325 Py_DECREF(list);
3326 return NULL;
3327}
3328
3329PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003330 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331{
3332 register int i;
3333 register int j;
3334 int len;
3335 PyObject *list;
3336 PyObject *str;
3337 Py_UNICODE *data;
3338
3339 string = PyUnicode_FromObject(string);
3340 if (string == NULL)
3341 return NULL;
3342 data = PyUnicode_AS_UNICODE(string);
3343 len = PyUnicode_GET_SIZE(string);
3344
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 list = PyList_New(0);
3346 if (!list)
3347 goto onError;
3348
3349 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003350 int eol;
3351
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 /* Find a line and append it */
3353 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3354 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355
3356 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003357 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 if (i < len) {
3359 if (data[i] == '\r' && i + 1 < len &&
3360 data[i+1] == '\n')
3361 i += 2;
3362 else
3363 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003364 if (keepends)
3365 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
Guido van Rossum86662912000-04-11 15:38:46 +00003367 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 j = i;
3369 }
3370 if (j < len) {
3371 SPLIT_APPEND(data, j, len);
3372 }
3373
3374 Py_DECREF(string);
3375 return list;
3376
3377 onError:
3378 Py_DECREF(list);
3379 Py_DECREF(string);
3380 return NULL;
3381}
3382
3383static
3384PyObject *split_char(PyUnicodeObject *self,
3385 PyObject *list,
3386 Py_UNICODE ch,
3387 int maxcount)
3388{
3389 register int i;
3390 register int j;
3391 int len = self->length;
3392 PyObject *str;
3393
3394 for (i = j = 0; i < len; ) {
3395 if (self->str[i] == ch) {
3396 if (maxcount-- <= 0)
3397 break;
3398 SPLIT_APPEND(self->str, j, i);
3399 i = j = i + 1;
3400 } else
3401 i++;
3402 }
3403 if (j <= len) {
3404 SPLIT_APPEND(self->str, j, len);
3405 }
3406 return list;
3407
3408 onError:
3409 Py_DECREF(list);
3410 return NULL;
3411}
3412
3413static
3414PyObject *split_substring(PyUnicodeObject *self,
3415 PyObject *list,
3416 PyUnicodeObject *substring,
3417 int maxcount)
3418{
3419 register int i;
3420 register int j;
3421 int len = self->length;
3422 int sublen = substring->length;
3423 PyObject *str;
3424
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003425 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 if (Py_UNICODE_MATCH(self, i, substring)) {
3427 if (maxcount-- <= 0)
3428 break;
3429 SPLIT_APPEND(self->str, j, i);
3430 i = j = i + sublen;
3431 } else
3432 i++;
3433 }
3434 if (j <= len) {
3435 SPLIT_APPEND(self->str, j, len);
3436 }
3437 return list;
3438
3439 onError:
3440 Py_DECREF(list);
3441 return NULL;
3442}
3443
3444#undef SPLIT_APPEND
3445
3446static
3447PyObject *split(PyUnicodeObject *self,
3448 PyUnicodeObject *substring,
3449 int maxcount)
3450{
3451 PyObject *list;
3452
3453 if (maxcount < 0)
3454 maxcount = INT_MAX;
3455
3456 list = PyList_New(0);
3457 if (!list)
3458 return NULL;
3459
3460 if (substring == NULL)
3461 return split_whitespace(self,list,maxcount);
3462
3463 else if (substring->length == 1)
3464 return split_char(self,list,substring->str[0],maxcount);
3465
3466 else if (substring->length == 0) {
3467 Py_DECREF(list);
3468 PyErr_SetString(PyExc_ValueError, "empty separator");
3469 return NULL;
3470 }
3471 else
3472 return split_substring(self,list,substring,maxcount);
3473}
3474
3475static
3476PyObject *strip(PyUnicodeObject *self,
3477 int left,
3478 int right)
3479{
3480 Py_UNICODE *p = self->str;
3481 int start = 0;
3482 int end = self->length;
3483
3484 if (left)
3485 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3486 start++;
3487
3488 if (right)
3489 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3490 end--;
3491
Tim Peters7a29bd52001-09-12 03:03:31 +00003492 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 /* couldn't strip anything off, return original string */
3494 Py_INCREF(self);
3495 return (PyObject*) self;
3496 }
3497
3498 return (PyObject*) PyUnicode_FromUnicode(
3499 self->str + start,
3500 end - start
3501 );
3502}
3503
3504static
3505PyObject *replace(PyUnicodeObject *self,
3506 PyUnicodeObject *str1,
3507 PyUnicodeObject *str2,
3508 int maxcount)
3509{
3510 PyUnicodeObject *u;
3511
3512 if (maxcount < 0)
3513 maxcount = INT_MAX;
3514
3515 if (str1->length == 1 && str2->length == 1) {
3516 int i;
3517
3518 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003519 if (!findchar(self->str, self->length, str1->str[0]) &&
3520 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 /* nothing to replace, return original string */
3522 Py_INCREF(self);
3523 u = self;
3524 } else {
3525 Py_UNICODE u1 = str1->str[0];
3526 Py_UNICODE u2 = str2->str[0];
3527
3528 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003529 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 self->length
3531 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003532 if (u != NULL) {
3533 Py_UNICODE_COPY(u->str, self->str,
3534 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 for (i = 0; i < u->length; i++)
3536 if (u->str[i] == u1) {
3537 if (--maxcount < 0)
3538 break;
3539 u->str[i] = u2;
3540 }
3541 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543
3544 } else {
3545 int n, i;
3546 Py_UNICODE *p;
3547
3548 /* replace strings */
3549 n = count(self, 0, self->length, str1);
3550 if (n > maxcount)
3551 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003552 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 /* nothing to replace, return original string */
3554 Py_INCREF(self);
3555 u = self;
3556 } else {
3557 u = _PyUnicode_New(
3558 self->length + n * (str2->length - str1->length));
3559 if (u) {
3560 i = 0;
3561 p = u->str;
3562 while (i <= self->length - str1->length)
3563 if (Py_UNICODE_MATCH(self, i, str1)) {
3564 /* replace string segment */
3565 Py_UNICODE_COPY(p, str2->str, str2->length);
3566 p += str2->length;
3567 i += str1->length;
3568 if (--n <= 0) {
3569 /* copy remaining part */
3570 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3571 break;
3572 }
3573 } else
3574 *p++ = self->str[i++];
3575 }
3576 }
3577 }
3578
3579 return (PyObject *) u;
3580}
3581
3582/* --- Unicode Object Methods --------------------------------------------- */
3583
3584static char title__doc__[] =
3585"S.title() -> unicode\n\
3586\n\
3587Return a titlecased version of S, i.e. words start with title case\n\
3588characters, all remaining cased characters have lower case.";
3589
3590static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003591unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 return fixup(self, fixtitle);
3594}
3595
3596static char capitalize__doc__[] =
3597"S.capitalize() -> unicode\n\
3598\n\
3599Return a capitalized version of S, i.e. make the first character\n\
3600have upper case.";
3601
3602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003603unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 return fixup(self, fixcapitalize);
3606}
3607
3608#if 0
3609static char capwords__doc__[] =
3610"S.capwords() -> unicode\n\
3611\n\
3612Apply .capitalize() to all words in S and return the result with\n\
3613normalized whitespace (all whitespace strings are replaced by ' ').";
3614
3615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003616unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617{
3618 PyObject *list;
3619 PyObject *item;
3620 int i;
3621
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 /* Split into words */
3623 list = split(self, NULL, -1);
3624 if (!list)
3625 return NULL;
3626
3627 /* Capitalize each word */
3628 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3629 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3630 fixcapitalize);
3631 if (item == NULL)
3632 goto onError;
3633 Py_DECREF(PyList_GET_ITEM(list, i));
3634 PyList_SET_ITEM(list, i, item);
3635 }
3636
3637 /* Join the words to form a new string */
3638 item = PyUnicode_Join(NULL, list);
3639
3640onError:
3641 Py_DECREF(list);
3642 return (PyObject *)item;
3643}
3644#endif
3645
3646static char center__doc__[] =
3647"S.center(width) -> unicode\n\
3648\n\
3649Return S centered in a Unicode string of length width. Padding is done\n\
3650using spaces.";
3651
3652static PyObject *
3653unicode_center(PyUnicodeObject *self, PyObject *args)
3654{
3655 int marg, left;
3656 int width;
3657
3658 if (!PyArg_ParseTuple(args, "i:center", &width))
3659 return NULL;
3660
Tim Peters7a29bd52001-09-12 03:03:31 +00003661 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 Py_INCREF(self);
3663 return (PyObject*) self;
3664 }
3665
3666 marg = width - self->length;
3667 left = marg / 2 + (marg & width & 1);
3668
3669 return (PyObject*) pad(self, left, marg - left, ' ');
3670}
3671
Marc-André Lemburge5034372000-08-08 08:04:29 +00003672#if 0
3673
3674/* This code should go into some future Unicode collation support
3675 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003676 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003677
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003678/* speedy UTF-16 code point order comparison */
3679/* gleaned from: */
3680/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3681
/* Offsets used by the UTF-16 code point order comparison below; the
   table is indexed with the upper bits of a code unit (c >> 11).
   Entries 0..26 are 0, entry 27 is 0x2000 and entries 28..31 are
   -0x800. */
static short utf16Fixup[32] =
{
    0,       0,       0,       0,
    0,       0,       0,       0,
    0,       0,       0,       0,
    0,       0,       0,       0,
    0,       0,       0,       0,
    0,       0,       0,       0,
    0,       0,       0,       0x2000,
    -0x800,  -0x800,  -0x800,  -0x800
};
3689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690static int
3691unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3692{
3693 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 Py_UNICODE *s1 = str1->str;
3696 Py_UNICODE *s2 = str2->str;
3697
3698 len1 = str1->length;
3699 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003700
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003702 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003703
3704 c1 = *s1++;
3705 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003706
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003707 if (c1 > (1<<11) * 26)
3708 c1 += utf16Fixup[c1>>11];
3709 if (c2 > (1<<11) * 26)
3710 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003711 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003712
3713 if (c1 != c2)
3714 return (c1 < c2) ? -1 : 1;
3715
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003716 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 }
3718
3719 return (len1 < len2) ? -1 : (len1 != len2);
3720}
3721
Marc-André Lemburge5034372000-08-08 08:04:29 +00003722#else
3723
3724static int
3725unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3726{
3727 register int len1, len2;
3728
3729 Py_UNICODE *s1 = str1->str;
3730 Py_UNICODE *s2 = str2->str;
3731
3732 len1 = str1->length;
3733 len2 = str2->length;
3734
3735 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003736 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003737
Fredrik Lundh45714e92001-06-26 16:39:36 +00003738 c1 = *s1++;
3739 c2 = *s2++;
3740
3741 if (c1 != c2)
3742 return (c1 < c2) ? -1 : 1;
3743
Marc-André Lemburge5034372000-08-08 08:04:29 +00003744 len1--; len2--;
3745 }
3746
3747 return (len1 < len2) ? -1 : (len1 != len2);
3748}
3749
3750#endif
3751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752int PyUnicode_Compare(PyObject *left,
3753 PyObject *right)
3754{
3755 PyUnicodeObject *u = NULL, *v = NULL;
3756 int result;
3757
3758 /* Coerce the two arguments */
3759 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3760 if (u == NULL)
3761 goto onError;
3762 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3763 if (v == NULL)
3764 goto onError;
3765
Thomas Wouters7e474022000-07-16 12:04:32 +00003766 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 if (v == u) {
3768 Py_DECREF(u);
3769 Py_DECREF(v);
3770 return 0;
3771 }
3772
3773 result = unicode_compare(u, v);
3774
3775 Py_DECREF(u);
3776 Py_DECREF(v);
3777 return result;
3778
3779onError:
3780 Py_XDECREF(u);
3781 Py_XDECREF(v);
3782 return -1;
3783}
3784
Guido van Rossum403d68b2000-03-13 15:55:09 +00003785int PyUnicode_Contains(PyObject *container,
3786 PyObject *element)
3787{
3788 PyUnicodeObject *u = NULL, *v = NULL;
3789 int result;
3790 register const Py_UNICODE *p, *e;
3791 register Py_UNICODE ch;
3792
3793 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003794 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003795 if (v == NULL) {
3796 PyErr_SetString(PyExc_TypeError,
3797 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003798 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003799 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003800 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3801 if (u == NULL) {
3802 Py_DECREF(v);
3803 goto onError;
3804 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003805
3806 /* Check v in u */
3807 if (PyUnicode_GET_SIZE(v) != 1) {
3808 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003809 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003810 goto onError;
3811 }
3812 ch = *PyUnicode_AS_UNICODE(v);
3813 p = PyUnicode_AS_UNICODE(u);
3814 e = p + PyUnicode_GET_SIZE(u);
3815 result = 0;
3816 while (p < e) {
3817 if (*p++ == ch) {
3818 result = 1;
3819 break;
3820 }
3821 }
3822
3823 Py_DECREF(u);
3824 Py_DECREF(v);
3825 return result;
3826
3827onError:
3828 Py_XDECREF(u);
3829 Py_XDECREF(v);
3830 return -1;
3831}
3832
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833/* Concat to string or Unicode object giving a new Unicode object. */
3834
3835PyObject *PyUnicode_Concat(PyObject *left,
3836 PyObject *right)
3837{
3838 PyUnicodeObject *u = NULL, *v = NULL, *w;
3839
3840 /* Coerce the two arguments */
3841 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3842 if (u == NULL)
3843 goto onError;
3844 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3845 if (v == NULL)
3846 goto onError;
3847
3848 /* Shortcuts */
3849 if (v == unicode_empty) {
3850 Py_DECREF(v);
3851 return (PyObject *)u;
3852 }
3853 if (u == unicode_empty) {
3854 Py_DECREF(u);
3855 return (PyObject *)v;
3856 }
3857
3858 /* Concat the two Unicode strings */
3859 w = _PyUnicode_New(u->length + v->length);
3860 if (w == NULL)
3861 goto onError;
3862 Py_UNICODE_COPY(w->str, u->str, u->length);
3863 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3864
3865 Py_DECREF(u);
3866 Py_DECREF(v);
3867 return (PyObject *)w;
3868
3869onError:
3870 Py_XDECREF(u);
3871 Py_XDECREF(v);
3872 return NULL;
3873}
3874
3875static char count__doc__[] =
3876"S.count(sub[, start[, end]]) -> int\n\
3877\n\
3878Return the number of occurrences of substring sub in Unicode string\n\
3879S[start:end]. Optional arguments start and end are\n\
3880interpreted as in slice notation.";
3881
3882static PyObject *
3883unicode_count(PyUnicodeObject *self, PyObject *args)
3884{
3885 PyUnicodeObject *substring;
3886 int start = 0;
3887 int end = INT_MAX;
3888 PyObject *result;
3889
Guido van Rossumb8872e62000-05-09 14:14:27 +00003890 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3891 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 return NULL;
3893
3894 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3895 (PyObject *)substring);
3896 if (substring == NULL)
3897 return NULL;
3898
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 if (start < 0)
3900 start += self->length;
3901 if (start < 0)
3902 start = 0;
3903 if (end > self->length)
3904 end = self->length;
3905 if (end < 0)
3906 end += self->length;
3907 if (end < 0)
3908 end = 0;
3909
3910 result = PyInt_FromLong((long) count(self, start, end, substring));
3911
3912 Py_DECREF(substring);
3913 return result;
3914}
3915
3916static char encode__doc__[] =
3917"S.encode([encoding[,errors]]) -> string\n\
3918\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003919Return an encoded string version of S. Default encoding is the current\n\
3920default string encoding. errors may be given to set a different error\n\
3921handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3922a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923
3924static PyObject *
3925unicode_encode(PyUnicodeObject *self, PyObject *args)
3926{
3927 char *encoding = NULL;
3928 char *errors = NULL;
3929 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3930 return NULL;
3931 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3932}
3933
3934static char expandtabs__doc__[] =
3935"S.expandtabs([tabsize]) -> unicode\n\
3936\n\
3937Return a copy of S where all tab characters are expanded using spaces.\n\
3938If tabsize is not given, a tab size of 8 characters is assumed.";
3939
3940static PyObject*
3941unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3942{
3943 Py_UNICODE *e;
3944 Py_UNICODE *p;
3945 Py_UNICODE *q;
3946 int i, j;
3947 PyUnicodeObject *u;
3948 int tabsize = 8;
3949
3950 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3951 return NULL;
3952
Thomas Wouters7e474022000-07-16 12:04:32 +00003953 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 i = j = 0;
3955 e = self->str + self->length;
3956 for (p = self->str; p < e; p++)
3957 if (*p == '\t') {
3958 if (tabsize > 0)
3959 j += tabsize - (j % tabsize);
3960 }
3961 else {
3962 j++;
3963 if (*p == '\n' || *p == '\r') {
3964 i += j;
3965 j = 0;
3966 }
3967 }
3968
3969 /* Second pass: create output string and fill it */
3970 u = _PyUnicode_New(i + j);
3971 if (!u)
3972 return NULL;
3973
3974 j = 0;
3975 q = u->str;
3976
3977 for (p = self->str; p < e; p++)
3978 if (*p == '\t') {
3979 if (tabsize > 0) {
3980 i = tabsize - (j % tabsize);
3981 j += i;
3982 while (i--)
3983 *q++ = ' ';
3984 }
3985 }
3986 else {
3987 j++;
3988 *q++ = *p;
3989 if (*p == '\n' || *p == '\r')
3990 j = 0;
3991 }
3992
3993 return (PyObject*) u;
3994}
3995
3996static char find__doc__[] =
3997"S.find(sub [,start [,end]]) -> int\n\
3998\n\
3999Return the lowest index in S where substring sub is found,\n\
4000such that sub is contained within s[start,end]. Optional\n\
4001arguments start and end are interpreted as in slice notation.\n\
4002\n\
4003Return -1 on failure.";
4004
4005static PyObject *
4006unicode_find(PyUnicodeObject *self, PyObject *args)
4007{
4008 PyUnicodeObject *substring;
4009 int start = 0;
4010 int end = INT_MAX;
4011 PyObject *result;
4012
Guido van Rossumb8872e62000-05-09 14:14:27 +00004013 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4014 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 return NULL;
4016 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4017 (PyObject *)substring);
4018 if (substring == NULL)
4019 return NULL;
4020
4021 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4022
4023 Py_DECREF(substring);
4024 return result;
4025}
4026
4027static PyObject *
4028unicode_getitem(PyUnicodeObject *self, int index)
4029{
4030 if (index < 0 || index >= self->length) {
4031 PyErr_SetString(PyExc_IndexError, "string index out of range");
4032 return NULL;
4033 }
4034
4035 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4036}
4037
4038static long
4039unicode_hash(PyUnicodeObject *self)
4040{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004041 /* Since Unicode objects compare equal to their ASCII string
4042 counterparts, they should use the individual character values
4043 as basis for their hash value. This is needed to assure that
4044 strings and Unicode objects behave in the same way as
4045 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046
Fredrik Lundhdde61642000-07-10 18:27:47 +00004047 register int len;
4048 register Py_UNICODE *p;
4049 register long x;
4050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 if (self->hash != -1)
4052 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004053 len = PyUnicode_GET_SIZE(self);
4054 p = PyUnicode_AS_UNICODE(self);
4055 x = *p << 7;
4056 while (--len >= 0)
4057 x = (1000003*x) ^ *p++;
4058 x ^= PyUnicode_GET_SIZE(self);
4059 if (x == -1)
4060 x = -2;
4061 self->hash = x;
4062 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063}
4064
4065static char index__doc__[] =
4066"S.index(sub [,start [,end]]) -> int\n\
4067\n\
4068Like S.find() but raise ValueError when the substring is not found.";
4069
4070static PyObject *
4071unicode_index(PyUnicodeObject *self, PyObject *args)
4072{
4073 int result;
4074 PyUnicodeObject *substring;
4075 int start = 0;
4076 int end = INT_MAX;
4077
Guido van Rossumb8872e62000-05-09 14:14:27 +00004078 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4079 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 return NULL;
4081
4082 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4083 (PyObject *)substring);
4084 if (substring == NULL)
4085 return NULL;
4086
4087 result = findstring(self, substring, start, end, 1);
4088
4089 Py_DECREF(substring);
4090 if (result < 0) {
4091 PyErr_SetString(PyExc_ValueError, "substring not found");
4092 return NULL;
4093 }
4094 return PyInt_FromLong(result);
4095}
4096
4097static char islower__doc__[] =
4098"S.islower() -> int\n\
4099\n\
4100Return 1 if all cased characters in S are lowercase and there is\n\
4101at least one cased character in S, 0 otherwise.";
4102
4103static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004104unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105{
4106 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4107 register const Py_UNICODE *e;
4108 int cased;
4109
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 /* Shortcut for single character strings */
4111 if (PyUnicode_GET_SIZE(self) == 1)
4112 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4113
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004114 /* Special case for empty strings */
4115 if (PyString_GET_SIZE(self) == 0)
4116 return PyInt_FromLong(0);
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 e = p + PyUnicode_GET_SIZE(self);
4119 cased = 0;
4120 for (; p < e; p++) {
4121 register const Py_UNICODE ch = *p;
4122
4123 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4124 return PyInt_FromLong(0);
4125 else if (!cased && Py_UNICODE_ISLOWER(ch))
4126 cased = 1;
4127 }
4128 return PyInt_FromLong(cased);
4129}
4130
4131static char isupper__doc__[] =
4132"S.isupper() -> int\n\
4133\n\
4134Return 1 if all cased characters in S are uppercase and there is\n\
4135at least one cased character in S, 0 otherwise.";
4136
4137static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004138unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139{
4140 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4141 register const Py_UNICODE *e;
4142 int cased;
4143
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 /* Shortcut for single character strings */
4145 if (PyUnicode_GET_SIZE(self) == 1)
4146 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4147
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004148 /* Special case for empty strings */
4149 if (PyString_GET_SIZE(self) == 0)
4150 return PyInt_FromLong(0);
4151
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 e = p + PyUnicode_GET_SIZE(self);
4153 cased = 0;
4154 for (; p < e; p++) {
4155 register const Py_UNICODE ch = *p;
4156
4157 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4158 return PyInt_FromLong(0);
4159 else if (!cased && Py_UNICODE_ISUPPER(ch))
4160 cased = 1;
4161 }
4162 return PyInt_FromLong(cased);
4163}
4164
4165static char istitle__doc__[] =
4166"S.istitle() -> int\n\
4167\n\
4168Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4169may only follow uncased characters and lowercase characters only cased\n\
4170ones. Return 0 otherwise.";
4171
4172static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004173unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174{
4175 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4176 register const Py_UNICODE *e;
4177 int cased, previous_is_cased;
4178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 /* Shortcut for single character strings */
4180 if (PyUnicode_GET_SIZE(self) == 1)
4181 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4182 (Py_UNICODE_ISUPPER(*p) != 0));
4183
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004184 /* Special case for empty strings */
4185 if (PyString_GET_SIZE(self) == 0)
4186 return PyInt_FromLong(0);
4187
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 e = p + PyUnicode_GET_SIZE(self);
4189 cased = 0;
4190 previous_is_cased = 0;
4191 for (; p < e; p++) {
4192 register const Py_UNICODE ch = *p;
4193
4194 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4195 if (previous_is_cased)
4196 return PyInt_FromLong(0);
4197 previous_is_cased = 1;
4198 cased = 1;
4199 }
4200 else if (Py_UNICODE_ISLOWER(ch)) {
4201 if (!previous_is_cased)
4202 return PyInt_FromLong(0);
4203 previous_is_cased = 1;
4204 cased = 1;
4205 }
4206 else
4207 previous_is_cased = 0;
4208 }
4209 return PyInt_FromLong(cased);
4210}
4211
4212static char isspace__doc__[] =
4213"S.isspace() -> int\n\
4214\n\
4215Return 1 if there are only whitespace characters in S,\n\
42160 otherwise.";
4217
4218static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004219unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220{
4221 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4222 register const Py_UNICODE *e;
4223
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 /* Shortcut for single character strings */
4225 if (PyUnicode_GET_SIZE(self) == 1 &&
4226 Py_UNICODE_ISSPACE(*p))
4227 return PyInt_FromLong(1);
4228
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004229 /* Special case for empty strings */
4230 if (PyString_GET_SIZE(self) == 0)
4231 return PyInt_FromLong(0);
4232
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 e = p + PyUnicode_GET_SIZE(self);
4234 for (; p < e; p++) {
4235 if (!Py_UNICODE_ISSPACE(*p))
4236 return PyInt_FromLong(0);
4237 }
4238 return PyInt_FromLong(1);
4239}
4240
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004241static char isalpha__doc__[] =
4242"S.isalpha() -> int\n\
4243\n\
4244Return 1 if all characters in S are alphabetic\n\
4245and there is at least one character in S, 0 otherwise.";
4246
4247static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004248unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004249{
4250 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4251 register const Py_UNICODE *e;
4252
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004253 /* Shortcut for single character strings */
4254 if (PyUnicode_GET_SIZE(self) == 1 &&
4255 Py_UNICODE_ISALPHA(*p))
4256 return PyInt_FromLong(1);
4257
4258 /* Special case for empty strings */
4259 if (PyString_GET_SIZE(self) == 0)
4260 return PyInt_FromLong(0);
4261
4262 e = p + PyUnicode_GET_SIZE(self);
4263 for (; p < e; p++) {
4264 if (!Py_UNICODE_ISALPHA(*p))
4265 return PyInt_FromLong(0);
4266 }
4267 return PyInt_FromLong(1);
4268}
4269
4270static char isalnum__doc__[] =
4271"S.isalnum() -> int\n\
4272\n\
4273Return 1 if all characters in S are alphanumeric\n\
4274and there is at least one character in S, 0 otherwise.";
4275
4276static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004277unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004278{
4279 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4280 register const Py_UNICODE *e;
4281
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004282 /* Shortcut for single character strings */
4283 if (PyUnicode_GET_SIZE(self) == 1 &&
4284 Py_UNICODE_ISALNUM(*p))
4285 return PyInt_FromLong(1);
4286
4287 /* Special case for empty strings */
4288 if (PyString_GET_SIZE(self) == 0)
4289 return PyInt_FromLong(0);
4290
4291 e = p + PyUnicode_GET_SIZE(self);
4292 for (; p < e; p++) {
4293 if (!Py_UNICODE_ISALNUM(*p))
4294 return PyInt_FromLong(0);
4295 }
4296 return PyInt_FromLong(1);
4297}
4298
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299static char isdecimal__doc__[] =
4300"S.isdecimal() -> int\n\
4301\n\
4302Return 1 if there are only decimal characters in S,\n\
43030 otherwise.";
4304
4305static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004306unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307{
4308 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4309 register const Py_UNICODE *e;
4310
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 /* Shortcut for single character strings */
4312 if (PyUnicode_GET_SIZE(self) == 1 &&
4313 Py_UNICODE_ISDECIMAL(*p))
4314 return PyInt_FromLong(1);
4315
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004316 /* Special case for empty strings */
4317 if (PyString_GET_SIZE(self) == 0)
4318 return PyInt_FromLong(0);
4319
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 e = p + PyUnicode_GET_SIZE(self);
4321 for (; p < e; p++) {
4322 if (!Py_UNICODE_ISDECIMAL(*p))
4323 return PyInt_FromLong(0);
4324 }
4325 return PyInt_FromLong(1);
4326}
4327
4328static char isdigit__doc__[] =
4329"S.isdigit() -> int\n\
4330\n\
4331Return 1 if there are only digit characters in S,\n\
43320 otherwise.";
4333
4334static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004335unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336{
4337 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4338 register const Py_UNICODE *e;
4339
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 /* Shortcut for single character strings */
4341 if (PyUnicode_GET_SIZE(self) == 1 &&
4342 Py_UNICODE_ISDIGIT(*p))
4343 return PyInt_FromLong(1);
4344
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004345 /* Special case for empty strings */
4346 if (PyString_GET_SIZE(self) == 0)
4347 return PyInt_FromLong(0);
4348
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 e = p + PyUnicode_GET_SIZE(self);
4350 for (; p < e; p++) {
4351 if (!Py_UNICODE_ISDIGIT(*p))
4352 return PyInt_FromLong(0);
4353 }
4354 return PyInt_FromLong(1);
4355}
4356
4357static char isnumeric__doc__[] =
4358"S.isnumeric() -> int\n\
4359\n\
4360Return 1 if there are only numeric characters in S,\n\
43610 otherwise.";
4362
4363static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004364unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365{
4366 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4367 register const Py_UNICODE *e;
4368
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 /* Shortcut for single character strings */
4370 if (PyUnicode_GET_SIZE(self) == 1 &&
4371 Py_UNICODE_ISNUMERIC(*p))
4372 return PyInt_FromLong(1);
4373
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004374 /* Special case for empty strings */
4375 if (PyString_GET_SIZE(self) == 0)
4376 return PyInt_FromLong(0);
4377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 e = p + PyUnicode_GET_SIZE(self);
4379 for (; p < e; p++) {
4380 if (!Py_UNICODE_ISNUMERIC(*p))
4381 return PyInt_FromLong(0);
4382 }
4383 return PyInt_FromLong(1);
4384}
4385
4386static char join__doc__[] =
4387"S.join(sequence) -> unicode\n\
4388\n\
4389Return a string which is the concatenation of the strings in the\n\
4390sequence. The separator between elements is S.";
4391
4392static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004393unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004395 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396}
4397
4398static int
4399unicode_length(PyUnicodeObject *self)
4400{
4401 return self->length;
4402}
4403
4404static char ljust__doc__[] =
4405"S.ljust(width) -> unicode\n\
4406\n\
4407Return S left justified in a Unicode string of length width. Padding is\n\
4408done using spaces.";
4409
4410static PyObject *
4411unicode_ljust(PyUnicodeObject *self, PyObject *args)
4412{
4413 int width;
4414 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4415 return NULL;
4416
Tim Peters7a29bd52001-09-12 03:03:31 +00004417 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 Py_INCREF(self);
4419 return (PyObject*) self;
4420 }
4421
4422 return (PyObject*) pad(self, 0, width - self->length, ' ');
4423}
4424
4425static char lower__doc__[] =
4426"S.lower() -> unicode\n\
4427\n\
4428Return a copy of the string S converted to lowercase.";
4429
4430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004431unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 return fixup(self, fixlower);
4434}
4435
4436static char lstrip__doc__[] =
4437"S.lstrip() -> unicode\n\
4438\n\
4439Return a copy of the string S with leading whitespace removed.";
4440
4441static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004442unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 return strip(self, 1, 0);
4445}
4446
4447static PyObject*
4448unicode_repeat(PyUnicodeObject *str, int len)
4449{
4450 PyUnicodeObject *u;
4451 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004452 int nchars;
4453 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
4455 if (len < 0)
4456 len = 0;
4457
Tim Peters7a29bd52001-09-12 03:03:31 +00004458 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 /* no repeat, return original string */
4460 Py_INCREF(str);
4461 return (PyObject*) str;
4462 }
Tim Peters8f422462000-09-09 06:13:41 +00004463
4464 /* ensure # of chars needed doesn't overflow int and # of bytes
4465 * needed doesn't overflow size_t
4466 */
4467 nchars = len * str->length;
4468 if (len && nchars / len != str->length) {
4469 PyErr_SetString(PyExc_OverflowError,
4470 "repeated string is too long");
4471 return NULL;
4472 }
4473 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4474 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4475 PyErr_SetString(PyExc_OverflowError,
4476 "repeated string is too long");
4477 return NULL;
4478 }
4479 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (!u)
4481 return NULL;
4482
4483 p = u->str;
4484
4485 while (len-- > 0) {
4486 Py_UNICODE_COPY(p, str->str, str->length);
4487 p += str->length;
4488 }
4489
4490 return (PyObject*) u;
4491}
4492
4493PyObject *PyUnicode_Replace(PyObject *obj,
4494 PyObject *subobj,
4495 PyObject *replobj,
4496 int maxcount)
4497{
4498 PyObject *self;
4499 PyObject *str1;
4500 PyObject *str2;
4501 PyObject *result;
4502
4503 self = PyUnicode_FromObject(obj);
4504 if (self == NULL)
4505 return NULL;
4506 str1 = PyUnicode_FromObject(subobj);
4507 if (str1 == NULL) {
4508 Py_DECREF(self);
4509 return NULL;
4510 }
4511 str2 = PyUnicode_FromObject(replobj);
4512 if (str2 == NULL) {
4513 Py_DECREF(self);
4514 Py_DECREF(str1);
4515 return NULL;
4516 }
4517 result = replace((PyUnicodeObject *)self,
4518 (PyUnicodeObject *)str1,
4519 (PyUnicodeObject *)str2,
4520 maxcount);
4521 Py_DECREF(self);
4522 Py_DECREF(str1);
4523 Py_DECREF(str2);
4524 return result;
4525}
4526
4527static char replace__doc__[] =
4528"S.replace (old, new[, maxsplit]) -> unicode\n\
4529\n\
4530Return a copy of S with all occurrences of substring\n\
4531old replaced by new. If the optional argument maxsplit is\n\
4532given, only the first maxsplit occurrences are replaced.";
4533
4534static PyObject*
4535unicode_replace(PyUnicodeObject *self, PyObject *args)
4536{
4537 PyUnicodeObject *str1;
4538 PyUnicodeObject *str2;
4539 int maxcount = -1;
4540 PyObject *result;
4541
4542 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4543 return NULL;
4544 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4545 if (str1 == NULL)
4546 return NULL;
4547 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4548 if (str2 == NULL)
4549 return NULL;
4550
4551 result = replace(self, str1, str2, maxcount);
4552
4553 Py_DECREF(str1);
4554 Py_DECREF(str2);
4555 return result;
4556}
4557
4558static
4559PyObject *unicode_repr(PyObject *unicode)
4560{
4561 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4562 PyUnicode_GET_SIZE(unicode),
4563 1);
4564}
4565
4566static char rfind__doc__[] =
4567"S.rfind(sub [,start [,end]]) -> int\n\
4568\n\
4569Return the highest index in S where substring sub is found,\n\
4570such that sub is contained within s[start,end]. Optional\n\
4571arguments start and end are interpreted as in slice notation.\n\
4572\n\
4573Return -1 on failure.";
4574
4575static PyObject *
4576unicode_rfind(PyUnicodeObject *self, PyObject *args)
4577{
4578 PyUnicodeObject *substring;
4579 int start = 0;
4580 int end = INT_MAX;
4581 PyObject *result;
4582
Guido van Rossumb8872e62000-05-09 14:14:27 +00004583 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4584 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 return NULL;
4586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4587 (PyObject *)substring);
4588 if (substring == NULL)
4589 return NULL;
4590
4591 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4592
4593 Py_DECREF(substring);
4594 return result;
4595}
4596
4597static char rindex__doc__[] =
4598"S.rindex(sub [,start [,end]]) -> int\n\
4599\n\
4600Like S.rfind() but raise ValueError when the substring is not found.";
4601
4602static PyObject *
4603unicode_rindex(PyUnicodeObject *self, PyObject *args)
4604{
4605 int result;
4606 PyUnicodeObject *substring;
4607 int start = 0;
4608 int end = INT_MAX;
4609
Guido van Rossumb8872e62000-05-09 14:14:27 +00004610 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4611 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 return NULL;
4613 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4614 (PyObject *)substring);
4615 if (substring == NULL)
4616 return NULL;
4617
4618 result = findstring(self, substring, start, end, -1);
4619
4620 Py_DECREF(substring);
4621 if (result < 0) {
4622 PyErr_SetString(PyExc_ValueError, "substring not found");
4623 return NULL;
4624 }
4625 return PyInt_FromLong(result);
4626}
4627
4628static char rjust__doc__[] =
4629"S.rjust(width) -> unicode\n\
4630\n\
4631Return S right justified in a Unicode string of length width. Padding is\n\
4632done using spaces.";
4633
4634static PyObject *
4635unicode_rjust(PyUnicodeObject *self, PyObject *args)
4636{
4637 int width;
4638 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4639 return NULL;
4640
Tim Peters7a29bd52001-09-12 03:03:31 +00004641 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 Py_INCREF(self);
4643 return (PyObject*) self;
4644 }
4645
4646 return (PyObject*) pad(self, width - self->length, 0, ' ');
4647}
4648
4649static char rstrip__doc__[] =
4650"S.rstrip() -> unicode\n\
4651\n\
4652Return a copy of the string S with trailing whitespace removed.";
4653
4654static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004655unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 return strip(self, 0, 1);
4658}
4659
4660static PyObject*
4661unicode_slice(PyUnicodeObject *self, int start, int end)
4662{
4663 /* standard clamping */
4664 if (start < 0)
4665 start = 0;
4666 if (end < 0)
4667 end = 0;
4668 if (end > self->length)
4669 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004670 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 /* full slice, return original string */
4672 Py_INCREF(self);
4673 return (PyObject*) self;
4674 }
4675 if (start > end)
4676 start = end;
4677 /* copy slice */
4678 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4679 end - start);
4680}
4681
4682PyObject *PyUnicode_Split(PyObject *s,
4683 PyObject *sep,
4684 int maxsplit)
4685{
4686 PyObject *result;
4687
4688 s = PyUnicode_FromObject(s);
4689 if (s == NULL)
4690 return NULL;
4691 if (sep != NULL) {
4692 sep = PyUnicode_FromObject(sep);
4693 if (sep == NULL) {
4694 Py_DECREF(s);
4695 return NULL;
4696 }
4697 }
4698
4699 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4700
4701 Py_DECREF(s);
4702 Py_XDECREF(sep);
4703 return result;
4704}
4705
4706static char split__doc__[] =
4707"S.split([sep [,maxsplit]]) -> list of strings\n\
4708\n\
4709Return a list of the words in S, using sep as the\n\
4710delimiter string. If maxsplit is given, at most maxsplit\n\
4711splits are done. If sep is not specified, any whitespace string\n\
4712is a separator.";
4713
4714static PyObject*
4715unicode_split(PyUnicodeObject *self, PyObject *args)
4716{
4717 PyObject *substring = Py_None;
4718 int maxcount = -1;
4719
4720 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4721 return NULL;
4722
4723 if (substring == Py_None)
4724 return split(self, NULL, maxcount);
4725 else if (PyUnicode_Check(substring))
4726 return split(self, (PyUnicodeObject *)substring, maxcount);
4727 else
4728 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4729}
4730
4731static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004732"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733\n\
4734Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004735Line breaks are not included in the resulting list unless keepends\n\
4736is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737
4738static PyObject*
4739unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4740{
Guido van Rossum86662912000-04-11 15:38:46 +00004741 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
Guido van Rossum86662912000-04-11 15:38:46 +00004743 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 return NULL;
4745
Guido van Rossum86662912000-04-11 15:38:46 +00004746 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747}
4748
4749static
4750PyObject *unicode_str(PyUnicodeObject *self)
4751{
Fred Drakee4315f52000-05-09 19:53:39 +00004752 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753}
4754
4755static char strip__doc__[] =
4756"S.strip() -> unicode\n\
4757\n\
4758Return a copy of S with leading and trailing whitespace removed.";
4759
4760static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004761unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 return strip(self, 1, 1);
4764}
4765
4766static char swapcase__doc__[] =
4767"S.swapcase() -> unicode\n\
4768\n\
4769Return a copy of S with uppercase characters converted to lowercase\n\
4770and vice versa.";
4771
4772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004773unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 return fixup(self, fixswapcase);
4776}
4777
4778static char translate__doc__[] =
4779"S.translate(table) -> unicode\n\
4780\n\
4781Return a copy of the string S, where all characters have been mapped\n\
4782through the given translation table, which must be a mapping of\n\
4783Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4784are left untouched. Characters mapped to None are deleted.";
4785
4786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004787unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 return PyUnicode_TranslateCharmap(self->str,
4790 self->length,
4791 table,
4792 "ignore");
4793}
4794
4795static char upper__doc__[] =
4796"S.upper() -> unicode\n\
4797\n\
4798Return a copy of S converted to uppercase.";
4799
4800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004801unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 return fixup(self, fixupper);
4804}
4805
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill() (currently compiled out).  Left-pads S with '0'
   characters up to `width` using pad(); if the original text started
   with '+' or '-', the sign is moved in front of the zeros.  Returns
   S itself (new reference) when it is already wide enough, and NULL
   if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad()'s result is dereferenced below, so bail out if it failed
       (assumes pad() returns NULL on failure -- confirm against its
       definition) */
    if (u == NULL)
        return NULL;

    /* index `fill` is where the first original character now sits */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4841
#if 0
/* Debugging aid (currently compiled out): expose the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
4849
4850static char startswith__doc__[] =
4851"S.startswith(prefix[, start[, end]]) -> int\n\
4852\n\
4853Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4854optional start, test S beginning at that position. With optional end, stop\n\
4855comparing S at that position.";
4856
4857static PyObject *
4858unicode_startswith(PyUnicodeObject *self,
4859 PyObject *args)
4860{
4861 PyUnicodeObject *substring;
4862 int start = 0;
4863 int end = INT_MAX;
4864 PyObject *result;
4865
Guido van Rossumb8872e62000-05-09 14:14:27 +00004866 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4867 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 return NULL;
4869 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4870 (PyObject *)substring);
4871 if (substring == NULL)
4872 return NULL;
4873
4874 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4875
4876 Py_DECREF(substring);
4877 return result;
4878}
4879
4880
4881static char endswith__doc__[] =
4882"S.endswith(suffix[, start[, end]]) -> int\n\
4883\n\
4884Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4885optional start, test S beginning at that position. With optional end, stop\n\
4886comparing S at that position.";
4887
4888static PyObject *
4889unicode_endswith(PyUnicodeObject *self,
4890 PyObject *args)
4891{
4892 PyUnicodeObject *substring;
4893 int start = 0;
4894 int end = INT_MAX;
4895 PyObject *result;
4896
Guido van Rossumb8872e62000-05-09 14:14:27 +00004897 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4898 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 return NULL;
4900 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4901 (PyObject *)substring);
4902 if (substring == NULL)
4903 return NULL;
4904
4905 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4906
4907 Py_DECREF(substring);
4908 return result;
4909}
4910
4911
4912static PyMethodDef unicode_methods[] = {
4913
4914 /* Order is according to common usage: often used methods should
4915 appear first, since lookup is done sequentially. */
4916
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004917 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4918 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4919 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4920 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4921 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4922 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4923 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4924 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4925 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4926 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4927 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4928 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4929 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4930 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4931/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4932 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4933 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4934 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4935 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4936 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4937 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4938 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4939 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4940 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4941 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4942 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4943 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4944 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4945 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4946 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4947 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4948 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4949 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4950 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4951 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004953 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4954 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955#endif
4956
4957#if 0
4958 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004959 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960#endif
4961
4962 {NULL, NULL}
4963};
4964
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965static PySequenceMethods unicode_as_sequence = {
4966 (inquiry) unicode_length, /* sq_length */
4967 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4968 (intargfunc) unicode_repeat, /* sq_repeat */
4969 (intargfunc) unicode_getitem, /* sq_item */
4970 (intintargfunc) unicode_slice, /* sq_slice */
4971 0, /* sq_ass_item */
4972 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004973 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974};
4975
4976static int
4977unicode_buffer_getreadbuf(PyUnicodeObject *self,
4978 int index,
4979 const void **ptr)
4980{
4981 if (index != 0) {
4982 PyErr_SetString(PyExc_SystemError,
4983 "accessing non-existent unicode segment");
4984 return -1;
4985 }
4986 *ptr = (void *) self->str;
4987 return PyUnicode_GET_DATA_SIZE(self);
4988}
4989
4990static int
4991unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4992 const void **ptr)
4993{
4994 PyErr_SetString(PyExc_TypeError,
4995 "cannot use unicode as modifyable buffer");
4996 return -1;
4997}
4998
4999static int
5000unicode_buffer_getsegcount(PyUnicodeObject *self,
5001 int *lenp)
5002{
5003 if (lenp)
5004 *lenp = PyUnicode_GET_DATA_SIZE(self);
5005 return 1;
5006}
5007
5008static int
5009unicode_buffer_getcharbuf(PyUnicodeObject *self,
5010 int index,
5011 const void **ptr)
5012{
5013 PyObject *str;
5014
5015 if (index != 0) {
5016 PyErr_SetString(PyExc_SystemError,
5017 "accessing non-existent unicode segment");
5018 return -1;
5019 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005020 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 if (str == NULL)
5022 return -1;
5023 *ptr = (void *) PyString_AS_STRING(str);
5024 return PyString_GET_SIZE(str);
5025}
5026
5027/* Helpers for PyUnicode_Format() */
5028
5029static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005030getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031{
5032 int argidx = *p_argidx;
5033 if (argidx < arglen) {
5034 (*p_argidx)++;
5035 if (arglen < 0)
5036 return args;
5037 else
5038 return PyTuple_GetItem(args, argidx);
5039 }
5040 PyErr_SetString(PyExc_TypeError,
5041 "not enough arguments for format string");
5042 return NULL;
5043}
5044
/* Bit flags collected from a conversion specifier's flag characters. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5050
5051static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053{
5054 register int i;
5055 int len;
5056 va_list va;
5057 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059
5060 /* First, format the string as char array, then expand to Py_UNICODE
5061 array. */
5062 charbuffer = (char *)buffer;
5063 len = vsprintf(charbuffer, format, va);
5064 for (i = len - 1; i >= 0; i--)
5065 buffer[i] = (Py_UNICODE) charbuffer[i];
5066
5067 va_end(va);
5068 return len;
5069}
5070
5071static int
5072formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005073 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 int flags,
5075 int prec,
5076 int type,
5077 PyObject *v)
5078{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005079 /* fmt = '%#.' + `prec` + `type`
5080 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 char fmt[20];
5082 double x;
5083
5084 x = PyFloat_AsDouble(v);
5085 if (x == -1.0 && PyErr_Occurred())
5086 return -1;
5087 if (prec < 0)
5088 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5090 type = 'g';
5091 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005092 /* worst case length calc to ensure no buffer overrun:
5093 fmt = %#.<prec>g
5094 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5095 for any double rep.)
5096 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5097 If prec=0 the effective precision is 1 (the leading digit is
5098 always given), therefore increase by one to 10+prec. */
5099 if (buflen <= (size_t)10 + (size_t)prec) {
5100 PyErr_SetString(PyExc_OverflowError,
5101 "formatted float is too long (precision too long?)");
5102 return -1;
5103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 return usprintf(buf, fmt, x);
5105}
5106
Tim Peters38fd5b62000-09-21 05:43:11 +00005107static PyObject*
5108formatlong(PyObject *val, int flags, int prec, int type)
5109{
5110 char *buf;
5111 int i, len;
5112 PyObject *str; /* temporary string object. */
5113 PyUnicodeObject *result;
5114
5115 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5116 if (!str)
5117 return NULL;
5118 result = _PyUnicode_New(len);
5119 for (i = 0; i < len; i++)
5120 result->str[i] = buf[i];
5121 result->str[len] = 0;
5122 Py_DECREF(str);
5123 return (PyObject*)result;
5124}
5125
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126static int
5127formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005128 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 int flags,
5130 int prec,
5131 int type,
5132 PyObject *v)
5133{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005134 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005135 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5136 + 1 + 1 = 24*/
5137 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005139 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140
5141 x = PyInt_AsLong(v);
5142 if (x == -1 && PyErr_Occurred())
5143 return -1;
5144 if (prec < 0)
5145 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005146 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5147 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5148 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5149 PyErr_SetString(PyExc_OverflowError,
5150 "formatted integer is too long (precision too long?)");
5151 return -1;
5152 }
Tim Petersfff53252001-04-12 18:38:48 +00005153 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5154 * but we want it (for consistency with other %#x conversions, and
5155 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005156 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5157 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5158 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005159 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005160 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5161 /* Only way to know what the platform does is to try it. */
5162 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5163 if (fmt[1] != (char)type) {
5164 /* Supply our own leading 0x/0X -- needed under std C */
5165 use_native_c_format = 0;
5166 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5167 }
5168 }
5169 if (use_native_c_format)
5170 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 return usprintf(buf, fmt, x);
5172}
5173
5174static int
5175formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005176 size_t buflen,
5177 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005179 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005180 if (PyUnicode_Check(v)) {
5181 if (PyUnicode_GET_SIZE(v) != 1)
5182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005186 else if (PyString_Check(v)) {
5187 if (PyString_GET_SIZE(v) != 1)
5188 goto onError;
5189 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191
5192 else {
5193 /* Integer input truncated to a character */
5194 long x;
5195 x = PyInt_AsLong(v);
5196 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005197 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 buf[0] = (char) x;
5199 }
5200 buf[1] = '\0';
5201 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005202
5203 onError:
5204 PyErr_SetString(PyExc_TypeError,
5205 "%c requires int or char");
5206 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207}
5208
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer into
   which floats, ints and chars are formatted.  XXX The value is a magic
   number.  Every formatting routine bounds-checks against it so that
   nothing overflows; allocating a suitably sized buffer per conversion
   might be a better approach, but the fixed buffer is sufficient for
   now.
*/
#define FORMATBUFLEN ((size_t)120)
5218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219PyObject *PyUnicode_Format(PyObject *format,
5220 PyObject *args)
5221{
5222 Py_UNICODE *fmt, *res;
5223 int fmtcnt, rescnt, reslen, arglen, argidx;
5224 int args_owned = 0;
5225 PyUnicodeObject *result = NULL;
5226 PyObject *dict = NULL;
5227 PyObject *uformat;
5228
5229 if (format == NULL || args == NULL) {
5230 PyErr_BadInternalCall();
5231 return NULL;
5232 }
5233 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005234 if (uformat == NULL)
5235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 fmt = PyUnicode_AS_UNICODE(uformat);
5237 fmtcnt = PyUnicode_GET_SIZE(uformat);
5238
5239 reslen = rescnt = fmtcnt + 100;
5240 result = _PyUnicode_New(reslen);
5241 if (result == NULL)
5242 goto onError;
5243 res = PyUnicode_AS_UNICODE(result);
5244
5245 if (PyTuple_Check(args)) {
5246 arglen = PyTuple_Size(args);
5247 argidx = 0;
5248 }
5249 else {
5250 arglen = -1;
5251 argidx = -2;
5252 }
5253 if (args->ob_type->tp_as_mapping)
5254 dict = args;
5255
5256 while (--fmtcnt >= 0) {
5257 if (*fmt != '%') {
5258 if (--rescnt < 0) {
5259 rescnt = fmtcnt + 100;
5260 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005261 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 return NULL;
5263 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5264 --rescnt;
5265 }
5266 *res++ = *fmt++;
5267 }
5268 else {
5269 /* Got a format specifier */
5270 int flags = 0;
5271 int width = -1;
5272 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 Py_UNICODE c = '\0';
5274 Py_UNICODE fill;
5275 PyObject *v = NULL;
5276 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005277 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 Py_UNICODE sign;
5279 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005280 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281
5282 fmt++;
5283 if (*fmt == '(') {
5284 Py_UNICODE *keystart;
5285 int keylen;
5286 PyObject *key;
5287 int pcount = 1;
5288
5289 if (dict == NULL) {
5290 PyErr_SetString(PyExc_TypeError,
5291 "format requires a mapping");
5292 goto onError;
5293 }
5294 ++fmt;
5295 --fmtcnt;
5296 keystart = fmt;
5297 /* Skip over balanced parentheses */
5298 while (pcount > 0 && --fmtcnt >= 0) {
5299 if (*fmt == ')')
5300 --pcount;
5301 else if (*fmt == '(')
5302 ++pcount;
5303 fmt++;
5304 }
5305 keylen = fmt - keystart - 1;
5306 if (fmtcnt < 0 || pcount > 0) {
5307 PyErr_SetString(PyExc_ValueError,
5308 "incomplete format key");
5309 goto onError;
5310 }
Fred Drakee4315f52000-05-09 19:53:39 +00005311 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 then looked up since Python uses strings to hold
5313 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005314 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 key = PyUnicode_EncodeUTF8(keystart,
5316 keylen,
5317 NULL);
5318 if (key == NULL)
5319 goto onError;
5320 if (args_owned) {
5321 Py_DECREF(args);
5322 args_owned = 0;
5323 }
5324 args = PyObject_GetItem(dict, key);
5325 Py_DECREF(key);
5326 if (args == NULL) {
5327 goto onError;
5328 }
5329 args_owned = 1;
5330 arglen = -1;
5331 argidx = -2;
5332 }
5333 while (--fmtcnt >= 0) {
5334 switch (c = *fmt++) {
5335 case '-': flags |= F_LJUST; continue;
5336 case '+': flags |= F_SIGN; continue;
5337 case ' ': flags |= F_BLANK; continue;
5338 case '#': flags |= F_ALT; continue;
5339 case '0': flags |= F_ZERO; continue;
5340 }
5341 break;
5342 }
5343 if (c == '*') {
5344 v = getnextarg(args, arglen, &argidx);
5345 if (v == NULL)
5346 goto onError;
5347 if (!PyInt_Check(v)) {
5348 PyErr_SetString(PyExc_TypeError,
5349 "* wants int");
5350 goto onError;
5351 }
5352 width = PyInt_AsLong(v);
5353 if (width < 0) {
5354 flags |= F_LJUST;
5355 width = -width;
5356 }
5357 if (--fmtcnt >= 0)
5358 c = *fmt++;
5359 }
5360 else if (c >= '0' && c <= '9') {
5361 width = c - '0';
5362 while (--fmtcnt >= 0) {
5363 c = *fmt++;
5364 if (c < '0' || c > '9')
5365 break;
5366 if ((width*10) / 10 != width) {
5367 PyErr_SetString(PyExc_ValueError,
5368 "width too big");
5369 goto onError;
5370 }
5371 width = width*10 + (c - '0');
5372 }
5373 }
5374 if (c == '.') {
5375 prec = 0;
5376 if (--fmtcnt >= 0)
5377 c = *fmt++;
5378 if (c == '*') {
5379 v = getnextarg(args, arglen, &argidx);
5380 if (v == NULL)
5381 goto onError;
5382 if (!PyInt_Check(v)) {
5383 PyErr_SetString(PyExc_TypeError,
5384 "* wants int");
5385 goto onError;
5386 }
5387 prec = PyInt_AsLong(v);
5388 if (prec < 0)
5389 prec = 0;
5390 if (--fmtcnt >= 0)
5391 c = *fmt++;
5392 }
5393 else if (c >= '0' && c <= '9') {
5394 prec = c - '0';
5395 while (--fmtcnt >= 0) {
5396 c = Py_CHARMASK(*fmt++);
5397 if (c < '0' || c > '9')
5398 break;
5399 if ((prec*10) / 10 != prec) {
5400 PyErr_SetString(PyExc_ValueError,
5401 "prec too big");
5402 goto onError;
5403 }
5404 prec = prec*10 + (c - '0');
5405 }
5406 }
5407 } /* prec */
5408 if (fmtcnt >= 0) {
5409 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 if (--fmtcnt >= 0)
5411 c = *fmt++;
5412 }
5413 }
5414 if (fmtcnt < 0) {
5415 PyErr_SetString(PyExc_ValueError,
5416 "incomplete format");
5417 goto onError;
5418 }
5419 if (c != '%') {
5420 v = getnextarg(args, arglen, &argidx);
5421 if (v == NULL)
5422 goto onError;
5423 }
5424 sign = 0;
5425 fill = ' ';
5426 switch (c) {
5427
5428 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005429 pbuf = formatbuf;
5430 /* presume that buffer length is at least 1 */
5431 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 len = 1;
5433 break;
5434
5435 case 's':
5436 case 'r':
5437 if (PyUnicode_Check(v) && c == 's') {
5438 temp = v;
5439 Py_INCREF(temp);
5440 }
5441 else {
5442 PyObject *unicode;
5443 if (c == 's')
5444 temp = PyObject_Str(v);
5445 else
5446 temp = PyObject_Repr(v);
5447 if (temp == NULL)
5448 goto onError;
5449 if (!PyString_Check(temp)) {
5450 /* XXX Note: this should never happen, since
5451 PyObject_Repr() and PyObject_Str() assure
5452 this */
5453 Py_DECREF(temp);
5454 PyErr_SetString(PyExc_TypeError,
5455 "%s argument has non-string str()");
5456 goto onError;
5457 }
Fred Drakee4315f52000-05-09 19:53:39 +00005458 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005460 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 "strict");
5462 Py_DECREF(temp);
5463 temp = unicode;
5464 if (temp == NULL)
5465 goto onError;
5466 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005467 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 len = PyUnicode_GET_SIZE(temp);
5469 if (prec >= 0 && len > prec)
5470 len = prec;
5471 break;
5472
5473 case 'i':
5474 case 'd':
5475 case 'u':
5476 case 'o':
5477 case 'x':
5478 case 'X':
5479 if (c == 'i')
5480 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005481 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005482 temp = formatlong(v, flags, prec, c);
5483 if (!temp)
5484 goto onError;
5485 pbuf = PyUnicode_AS_UNICODE(temp);
5486 len = PyUnicode_GET_SIZE(temp);
5487 /* unbounded ints can always produce
5488 a sign character! */
5489 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005491 else {
5492 pbuf = formatbuf;
5493 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5494 flags, prec, c, v);
5495 if (len < 0)
5496 goto onError;
5497 /* only d conversion is signed */
5498 sign = c == 'd';
5499 }
5500 if (flags & F_ZERO)
5501 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 break;
5503
5504 case 'e':
5505 case 'E':
5506 case 'f':
5507 case 'g':
5508 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005509 pbuf = formatbuf;
5510 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5511 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 if (len < 0)
5513 goto onError;
5514 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005515 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 fill = '0';
5517 break;
5518
5519 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005520 pbuf = formatbuf;
5521 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 if (len < 0)
5523 goto onError;
5524 break;
5525
5526 default:
5527 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005528 "unsupported format character '%c' (0x%x) "
5529 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005530 (31<=c && c<=126) ? c : '?',
5531 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 goto onError;
5533 }
5534 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005535 if (*pbuf == '-' || *pbuf == '+') {
5536 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 len--;
5538 }
5539 else if (flags & F_SIGN)
5540 sign = '+';
5541 else if (flags & F_BLANK)
5542 sign = ' ';
5543 else
5544 sign = 0;
5545 }
5546 if (width < len)
5547 width = len;
5548 if (rescnt < width + (sign != 0)) {
5549 reslen -= rescnt;
5550 rescnt = width + fmtcnt + 100;
5551 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005552 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 return NULL;
5554 res = PyUnicode_AS_UNICODE(result)
5555 + reslen - rescnt;
5556 }
5557 if (sign) {
5558 if (fill != ' ')
5559 *res++ = sign;
5560 rescnt--;
5561 if (width > len)
5562 width--;
5563 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005564 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5565 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005566 assert(pbuf[1] == c);
5567 if (fill != ' ') {
5568 *res++ = *pbuf++;
5569 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005570 }
Tim Petersfff53252001-04-12 18:38:48 +00005571 rescnt -= 2;
5572 width -= 2;
5573 if (width < 0)
5574 width = 0;
5575 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 if (width > len && !(flags & F_LJUST)) {
5578 do {
5579 --rescnt;
5580 *res++ = fill;
5581 } while (--width > len);
5582 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005583 if (fill == ' ') {
5584 if (sign)
5585 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005586 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005587 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005588 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005589 *res++ = *pbuf++;
5590 *res++ = *pbuf++;
5591 }
5592 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005593 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 res += len;
5595 rescnt -= len;
5596 while (--width >= len) {
5597 --rescnt;
5598 *res++ = ' ';
5599 }
5600 if (dict && (argidx < arglen) && c != '%') {
5601 PyErr_SetString(PyExc_TypeError,
5602 "not all arguments converted");
5603 goto onError;
5604 }
5605 Py_XDECREF(temp);
5606 } /* '%' */
5607 } /* until end */
5608 if (argidx < arglen && !dict) {
5609 PyErr_SetString(PyExc_TypeError,
5610 "not all arguments converted");
5611 goto onError;
5612 }
5613
5614 if (args_owned) {
5615 Py_DECREF(args);
5616 }
5617 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005618 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005619 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 return (PyObject *)result;
5621
5622 onError:
5623 Py_XDECREF(result);
5624 Py_DECREF(uformat);
5625 if (args_owned) {
5626 Py_DECREF(args);
5627 }
5628 return NULL;
5629}
5630
5631static PyBufferProcs unicode_as_buffer = {
5632 (getreadbufferproc) unicode_buffer_getreadbuf,
5633 (getwritebufferproc) unicode_buffer_getwritebuf,
5634 (getsegcountproc) unicode_buffer_getsegcount,
5635 (getcharbufferproc) unicode_buffer_getcharbuf,
5636};
5637
Guido van Rossume023fe02001-08-30 03:12:59 +00005638staticforward PyObject *
5639unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5640
Tim Peters6d6c1a32001-08-02 04:15:00 +00005641static PyObject *
5642unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5643{
5644 PyObject *x = NULL;
5645 static char *kwlist[] = {"string", "encoding", "errors", 0};
5646 char *encoding = NULL;
5647 char *errors = NULL;
5648
Guido van Rossume023fe02001-08-30 03:12:59 +00005649 if (type != &PyUnicode_Type)
5650 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005651 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5652 kwlist, &x, &encoding, &errors))
5653 return NULL;
5654 if (x == NULL)
5655 return (PyObject *)_PyUnicode_New(0);
5656 return PyUnicode_FromEncodedObject(x, encoding, errors);
5657}
5658
Guido van Rossume023fe02001-08-30 03:12:59 +00005659static PyObject *
5660unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5661{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005662 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005663 int n;
5664
5665 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5666 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5667 if (tmp == NULL)
5668 return NULL;
5669 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005670 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5671 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005672 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005673 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5674 if (pnew->str == NULL) {
5675 _Py_ForgetReference((PyObject *)pnew);
5676 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005677 return NULL;
5678 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005679 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5680 pnew->length = n;
5681 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005682 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005683 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005684}
5685
/* Docstring of the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5692
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693PyTypeObject PyUnicode_Type = {
5694 PyObject_HEAD_INIT(&PyType_Type)
5695 0, /* ob_size */
5696 "unicode", /* tp_name */
5697 sizeof(PyUnicodeObject), /* tp_size */
5698 0, /* tp_itemsize */
5699 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005700 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005702 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 0, /* tp_setattr */
5704 (cmpfunc) unicode_compare, /* tp_compare */
5705 (reprfunc) unicode_repr, /* tp_repr */
5706 0, /* tp_as_number */
5707 &unicode_as_sequence, /* tp_as_sequence */
5708 0, /* tp_as_mapping */
5709 (hashfunc) unicode_hash, /* tp_hash*/
5710 0, /* tp_call*/
5711 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005712 PyObject_GenericGetAttr, /* tp_getattro */
5713 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005715 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005716 unicode_doc, /* tp_doc */
5717 0, /* tp_traverse */
5718 0, /* tp_clear */
5719 0, /* tp_richcompare */
5720 0, /* tp_weaklistoffset */
5721 0, /* tp_iter */
5722 0, /* tp_iternext */
5723 unicode_methods, /* tp_methods */
5724 0, /* tp_members */
5725 0, /* tp_getset */
5726 0, /* tp_base */
5727 0, /* tp_dict */
5728 0, /* tp_descr_get */
5729 0, /* tp_descr_set */
5730 0, /* tp_dictoffset */
5731 0, /* tp_init */
5732 0, /* tp_alloc */
5733 unicode_new, /* tp_new */
Guido van Rossum9475a232001-10-05 20:51:39 +00005734 _PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735};
5736
5737/* Initialize the Unicode implementation */
5738
Thomas Wouters78890102000-07-22 19:25:51 +00005739void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005741 int i;
5742
Fred Drakee4315f52000-05-09 19:53:39 +00005743 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005744 unicode_freelist = NULL;
5745 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005747 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005748 for (i = 0; i < 256; i++)
5749 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750}
5751
5752/* Finalize the Unicode implementation */
5753
5754void
Thomas Wouters78890102000-07-22 19:25:51 +00005755_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005757 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005758 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005760 Py_XDECREF(unicode_empty);
5761 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005762
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005763 for (i = 0; i < 256; i++) {
5764 if (unicode_latin1[i]) {
5765 Py_DECREF(unicode_latin1[i]);
5766 unicode_latin1[i] = NULL;
5767 }
5768 }
5769
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005770 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 PyUnicodeObject *v = u;
5772 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005773 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005774 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005775 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005776 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005778 unicode_freelist = NULL;
5779 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780}