blob: 57ef62a7138fe9496022f62471f66a28836f9c99 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum9475a232001-10-05 20:51:39 +0000229 if (!PyUnicode_CheckExact(unicode)) {
230 unicode->ob_type->tp_free((PyObject *)unicode);
231 return;
232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000234 /* Keep-Alive optimization */
235 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str = NULL;
238 unicode->length = 0;
239 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 if (unicode->defenc) {
241 Py_DECREF(unicode->defenc);
242 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 }
244 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 *(PyUnicodeObject **)unicode = unicode_freelist;
246 unicode_freelist = unicode;
247 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 }
249 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000251 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 }
254}
255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256int PyUnicode_Resize(PyObject **unicode,
257 int length)
258{
259 register PyUnicodeObject *v;
260
261 /* Argument checks */
262 if (unicode == NULL) {
263 PyErr_BadInternalCall();
264 return -1;
265 }
266 v = (PyUnicodeObject *)*unicode;
267 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
268 PyErr_BadInternalCall();
269 return -1;
270 }
271
272 /* Resizing unicode_empty and single character objects is not
273 possible since these are being shared. We simply return a fresh
274 copy with the same Unicode content. */
275 if (v->length != length &&
276 (v == unicode_empty || v->length == 1)) {
277 PyUnicodeObject *w = _PyUnicode_New(length);
278 if (w == NULL)
279 return -1;
280 Py_UNICODE_COPY(w->str, v->str,
281 length < v->length ? length : v->length);
282 *unicode = (PyObject *)w;
283 return 0;
284 }
285
286 /* Note that we don't have to modify *unicode for unshared Unicode
287 objects, since we can modify them in-place. */
288 return unicode_resize(v, length);
289}
290
291/* Internal API for use in unicodeobject.c only ! */
292#define _PyUnicode_Resize(unicodevar, length) \
293 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
294
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
296 int size)
297{
298 PyUnicodeObject *unicode;
299
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000300 /* If the Unicode data is known at construction time, we can apply
301 some optimizations which share commonly used objects. */
302 if (u != NULL) {
303
304 /* Optimization for empty strings */
305 if (size == 0 && unicode_empty != NULL) {
306 Py_INCREF(unicode_empty);
307 return (PyObject *)unicode_empty;
308 }
309
310 /* Single character Unicode objects in the Latin-1 range are
311 shared when using this constructor */
312 if (size == 1 && *u < 256) {
313 unicode = unicode_latin1[*u];
314 if (!unicode) {
315 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 if (!unicode)
317 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000318 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000319 unicode_latin1[*u] = unicode;
320 }
321 Py_INCREF(unicode);
322 return (PyObject *)unicode;
323 }
324 }
325
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode = _PyUnicode_New(size);
327 if (!unicode)
328 return NULL;
329
330 /* Copy the Unicode data into the new object */
331 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333
334 return (PyObject *)unicode;
335}
336
337#ifdef HAVE_WCHAR_H
338
339PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
340 int size)
341{
342 PyUnicodeObject *unicode;
343
344 if (w == NULL) {
345 PyErr_BadInternalCall();
346 return NULL;
347 }
348
349 unicode = _PyUnicode_New(size);
350 if (!unicode)
351 return NULL;
352
353 /* Copy the wchar_t data into the new object */
354#ifdef HAVE_USABLE_WCHAR_T
355 memcpy(unicode->str, w, size * sizeof(wchar_t));
356#else
357 {
358 register Py_UNICODE *u;
359 register int i;
360 u = PyUnicode_AS_UNICODE(unicode);
361 for (i = size; i >= 0; i--)
362 *u++ = *w++;
363 }
364#endif
365
366 return (PyObject *)unicode;
367}
368
369int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
370 register wchar_t *w,
371 int size)
372{
373 if (unicode == NULL) {
374 PyErr_BadInternalCall();
375 return -1;
376 }
377 if (size > PyUnicode_GET_SIZE(unicode))
378 size = PyUnicode_GET_SIZE(unicode);
379#ifdef HAVE_USABLE_WCHAR_T
380 memcpy(w, unicode->str, size * sizeof(wchar_t));
381#else
382 {
383 register Py_UNICODE *u;
384 register int i;
385 u = PyUnicode_AS_UNICODE(unicode);
386 for (i = size; i >= 0; i--)
387 *w++ = *u++;
388 }
389#endif
390
391 return size;
392}
393
394#endif
395
396PyObject *PyUnicode_FromObject(register PyObject *obj)
397{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000398 /* XXX Perhaps we should make this API an alias of
399 PyObject_Unicode() instead ?! */
400 if (PyUnicode_CheckExact(obj)) {
401 Py_INCREF(obj);
402 return obj;
403 }
404 if (PyUnicode_Check(obj)) {
405 /* For a Unicode subtype that's not a Unicode object,
406 return a true Unicode object with the same data. */
407 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
408 PyUnicode_GET_SIZE(obj));
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
411}
412
413PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
414 const char *encoding,
415 const char *errors)
416{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000417 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 int owned = 0;
420 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 if (obj == NULL) {
423 PyErr_BadInternalCall();
424 return NULL;
425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000427#if 0
428 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000429 that no encodings is given and then redirect to
430 PyObject_Unicode() which then applies the additional logic for
431 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000432
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000433 NOTE: This API should really only be used for object which
434 represent *encoded* Unicode !
435
436 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000437 if (PyUnicode_Check(obj)) {
438 if (encoding) {
439 PyErr_SetString(PyExc_TypeError,
440 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000441 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000442 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000443 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000444 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000445#else
446 if (PyUnicode_Check(obj)) {
447 PyErr_SetString(PyExc_TypeError,
448 "decoding Unicode is not supported");
449 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000451#endif
452
453 /* Coerce object */
454 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000455 s = PyString_AS_STRING(obj);
456 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000458 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
459 /* Overwrite the error message with something more useful in
460 case of a TypeError. */
461 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463 "coercing to Unicode: need string or buffer, "
464 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465 obj->ob_type->tp_name);
466 goto onError;
467 }
468
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (len == 0) {
471 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000472 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474 else
475 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000476
Greg Steinaf36a3a2000-07-17 09:04:43 +0000477 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000478 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000479 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000480 return v;
481
482 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000483 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000484 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000485 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487}
488
489PyObject *PyUnicode_Decode(const char *s,
490 int size,
491 const char *encoding,
492 const char *errors)
493{
494 PyObject *buffer = NULL, *unicode;
495
Fred Drakee4315f52000-05-09 19:53:39 +0000496 if (encoding == NULL)
497 encoding = PyUnicode_GetDefaultEncoding();
498
499 /* Shortcuts for common default encodings */
500 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000502 else if (strcmp(encoding, "latin-1") == 0)
503 return PyUnicode_DecodeLatin1(s, size, errors);
504 else if (strcmp(encoding, "ascii") == 0)
505 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506
507 /* Decode via the codec registry */
508 buffer = PyBuffer_FromMemory((void *)s, size);
509 if (buffer == NULL)
510 goto onError;
511 unicode = PyCodec_Decode(buffer, encoding, errors);
512 if (unicode == NULL)
513 goto onError;
514 if (!PyUnicode_Check(unicode)) {
515 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000516 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode->ob_type->tp_name);
518 Py_DECREF(unicode);
519 goto onError;
520 }
521 Py_DECREF(buffer);
522 return unicode;
523
524 onError:
525 Py_XDECREF(buffer);
526 return NULL;
527}
528
529PyObject *PyUnicode_Encode(const Py_UNICODE *s,
530 int size,
531 const char *encoding,
532 const char *errors)
533{
534 PyObject *v, *unicode;
535
536 unicode = PyUnicode_FromUnicode(s, size);
537 if (unicode == NULL)
538 return NULL;
539 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
540 Py_DECREF(unicode);
541 return v;
542}
543
544PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
545 const char *encoding,
546 const char *errors)
547{
548 PyObject *v;
549
550 if (!PyUnicode_Check(unicode)) {
551 PyErr_BadArgument();
552 goto onError;
553 }
Fred Drakee4315f52000-05-09 19:53:39 +0000554
555 if (encoding == NULL)
556 encoding = PyUnicode_GetDefaultEncoding();
557
558 /* Shortcuts for common default encodings */
559 if (errors == NULL) {
560 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000561 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000562 else if (strcmp(encoding, "latin-1") == 0)
563 return PyUnicode_AsLatin1String(unicode);
564 else if (strcmp(encoding, "ascii") == 0)
565 return PyUnicode_AsASCIIString(unicode);
566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567
568 /* Encode via the codec registry */
569 v = PyCodec_Encode(unicode, encoding, errors);
570 if (v == NULL)
571 goto onError;
572 /* XXX Should we really enforce this ? */
573 if (!PyString_Check(v)) {
574 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000575 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576 v->ob_type->tp_name);
577 Py_DECREF(v);
578 goto onError;
579 }
580 return v;
581
582 onError:
583 return NULL;
584}
585
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000586PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
587 const char *errors)
588{
589 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
590
591 if (v)
592 return v;
593 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
594 if (v && errors == NULL)
595 ((PyUnicodeObject *)unicode)->defenc = v;
596 return v;
597}
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
600{
601 if (!PyUnicode_Check(unicode)) {
602 PyErr_BadArgument();
603 goto onError;
604 }
605 return PyUnicode_AS_UNICODE(unicode);
606
607 onError:
608 return NULL;
609}
610
611int PyUnicode_GetSize(PyObject *unicode)
612{
613 if (!PyUnicode_Check(unicode)) {
614 PyErr_BadArgument();
615 goto onError;
616 }
617 return PyUnicode_GET_SIZE(unicode);
618
619 onError:
620 return -1;
621}
622
Thomas Wouters78890102000-07-22 19:25:51 +0000623const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000624{
625 return unicode_default_encoding;
626}
627
628int PyUnicode_SetDefaultEncoding(const char *encoding)
629{
630 PyObject *v;
631
632 /* Make sure the encoding is valid. As side effect, this also
633 loads the encoding into the codec registry cache. */
634 v = _PyCodec_Lookup(encoding);
635 if (v == NULL)
636 goto onError;
637 Py_DECREF(v);
638 strncpy(unicode_default_encoding,
639 encoding,
640 sizeof(unicode_default_encoding));
641 return 0;
642
643 onError:
644 return -1;
645}
646
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000647/* --- UTF-7 Codec -------------------------------------------------------- */
648
649/* see RFC2152 for details */
650
/* Classification of the 128 ASCII characters for UTF-7, indexed by
   character code. The value says whether a character is special, i.e.
   cannot be encoded directly:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
669
/* Helper macros for the UTF-7 codec.

   SPECIAL(c, encodeO, encodeWS)
       true if c cannot be written directly: c is above 127 or marked
       special in utf7_special[], or it is whitespace / Set O and the
       corresponding flag asks for those to be encoded.
   B64(n)      base64 character for the low 6 bits of n
   B64CHAR(c)  true if c is a base64 character; values of 128 and above
               are never base64 characters and are not passed on to
               isalnum(), whose argument must fit an unsigned char
   UB64(c)     6-bit value of the base64 character c
   ENCODE      emit base64 characters to out while at least 6 bits are
               pending in ch
   DECODE      emit 16-bit characters to out while at least 16 bits are
               pending in ch; relies on an `errmsg` variable and a
               `utf7Error` label in the function using it */

#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) (((c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
704
705static
706int utf7_decoding_error(Py_UNICODE **dest,
707 const char *errors,
708 const char *details)
709{
710 if ((errors == NULL) ||
711 (strcmp(errors,"strict") == 0)) {
712 PyErr_Format(PyExc_UnicodeError,
713 "UTF-7 decoding error: %.400s",
714 details);
715 return -1;
716 }
717 else if (strcmp(errors,"ignore") == 0) {
718 return 0;
719 }
720 else if (strcmp(errors,"replace") == 0) {
721 if (dest != NULL) {
722 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
723 (*dest)++;
724 }
725 return 0;
726 }
727 else {
728 PyErr_Format(PyExc_ValueError,
729 "UTF-7 decoding error; unknown error handling code: %.400s",
730 errors);
731 return -1;
732 }
733}
734
735PyObject *PyUnicode_DecodeUTF7(const char *s,
736 int size,
737 const char *errors)
738{
739 const char *e;
740 PyUnicodeObject *unicode;
741 Py_UNICODE *p;
742 const char *errmsg = "";
743 int inShift = 0;
744 unsigned int bitsleft = 0;
745 unsigned long charsleft = 0;
746 int surrogate = 0;
747
748 unicode = _PyUnicode_New(size);
749 if (!unicode)
750 return NULL;
751 if (size == 0)
752 return (PyObject *)unicode;
753
754 p = unicode->str;
755 e = s + size;
756
757 while (s < e) {
758 Py_UNICODE ch = *s;
759
760 if (inShift) {
761 if ((ch == '-') || !B64CHAR(ch)) {
762 inShift = 0;
763 s++;
764
765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
766 if (bitsleft >= 6) {
767 /* The shift sequence has a partial character in it. If
768 bitsleft < 6 then we could just classify it as padding
769 but that is not the case here */
770
771 errmsg = "partial character in shift sequence";
772 goto utf7Error;
773 }
774 /* According to RFC2152 the remaining bits should be zero. We
775 choose to signal an error/insert a replacement character
776 here so indicate the potential of a misencoded character. */
777
778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
780 errmsg = "non-zero padding bits in shift sequence";
781 goto utf7Error;
782 }
783
784 if (ch == '-') {
785 if ((s < e) && (*(s) == '-')) {
786 *p++ = '-';
787 inShift = 1;
788 }
789 } else if (SPECIAL(ch,0,0)) {
790 errmsg = "unexpected special character";
791 goto utf7Error;
792 } else {
793 *p++ = ch;
794 }
795 } else {
796 charsleft = (charsleft << 6) | UB64(ch);
797 bitsleft += 6;
798 s++;
799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
800 }
801 }
802 else if ( ch == '+' ) {
803 s++;
804 if (s < e && *s == '-') {
805 s++;
806 *p++ = '+';
807 } else
808 {
809 inShift = 1;
810 bitsleft = 0;
811 }
812 }
813 else if (SPECIAL(ch,0,0)) {
814 errmsg = "unexpected special character";
815 s++;
816 goto utf7Error;
817 }
818 else {
819 *p++ = ch;
820 s++;
821 }
822 continue;
823 utf7Error:
824 if (utf7_decoding_error(&p, errors, errmsg))
825 goto onError;
826 }
827
828 if (inShift) {
829 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
830 goto onError;
831 }
832
833 if (_PyUnicode_Resize(&unicode, p - unicode->str))
834 goto onError;
835
836 return (PyObject *)unicode;
837
838onError:
839 Py_DECREF(unicode);
840 return NULL;
841}
842
843
844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
845 int size,
846 int encodeSetO,
847 int encodeWhiteSpace,
848 const char *errors)
849{
850 PyObject *v;
851 /* It might be possible to tighten this worst case */
852 unsigned int cbAllocated = 5 * size;
853 int inShift = 0;
854 int i = 0;
855 unsigned int bitsleft = 0;
856 unsigned long charsleft = 0;
857 char * out;
858 char * start;
859
860 if (size == 0)
861 return PyString_FromStringAndSize(NULL, 0);
862
863 v = PyString_FromStringAndSize(NULL, cbAllocated);
864 if (v == NULL)
865 return NULL;
866
867 start = out = PyString_AS_STRING(v);
868 for (;i < size; ++i) {
869 Py_UNICODE ch = s[i];
870
871 if (!inShift) {
872 if (ch == '+') {
873 *out++ = '+';
874 *out++ = '-';
875 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
876 charsleft = ch;
877 bitsleft = 16;
878 *out++ = '+';
879 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
880 inShift = bitsleft > 0;
881 } else {
882 *out++ = (char) ch;
883 }
884 } else {
885 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
886 *out++ = B64(charsleft << (6-bitsleft));
887 charsleft = 0;
888 bitsleft = 0;
889 /* Characters not in the BASE64 set implicitly unshift the sequence
890 so no '-' is required, except if the character is itself a '-' */
891 if (B64CHAR(ch) || ch == '-') {
892 *out++ = '-';
893 }
894 inShift = 0;
895 *out++ = (char) ch;
896 } else {
897 bitsleft += 16;
898 charsleft = (charsleft << 16) | ch;
899 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
900
901 /* If the next character is special then we dont' need to terminate
902 the shift sequence. If the next character is not a BASE64 character
903 or '-' then the shift sequence will be terminated implicitly and we
904 don't have to insert a '-'. */
905
906 if (bitsleft == 0) {
907 if (i + 1 < size) {
908 Py_UNICODE ch2 = s[i+1];
909
910 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
911
912 } else if (B64CHAR(ch2) || ch2 == '-') {
913 *out++ = '-';
914 inShift = 0;
915 } else {
916 inShift = 0;
917 }
918
919 }
920 else {
921 *out++ = '-';
922 inShift = 0;
923 }
924 }
925 }
926 }
927 }
928 if (bitsleft) {
929 *out++= B64(charsleft << (6-bitsleft) );
930 *out++ = '-';
931 }
932
933 if (_PyString_Resize(&v, out - start)) {
934 Py_DECREF(v);
935 return NULL;
936 }
937 return v;
938}
939
940#undef SPECIAL
941#undef B64
942#undef B64CHAR
943#undef UB64
944#undef ENCODE
945#undef DECODE
946
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947/* --- UTF-8 Codec -------------------------------------------------------- */
948
/* Map a UTF-8 lead byte to the total length of the sequence it
   introduces.  Zero means the byte is an illegal prefix (the
   continuation range 0x80-0xBF and the values 0xFE/0xFF).  See RFC
   2279 for details.  The table is pure lookup data and is never
   written, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
970
971static
972int utf8_decoding_error(const char **source,
973 Py_UNICODE **dest,
974 const char *errors,
975 const char *details)
976{
977 if ((errors == NULL) ||
978 (strcmp(errors,"strict") == 0)) {
979 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000980 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000981 details);
982 return -1;
983 }
984 else if (strcmp(errors,"ignore") == 0) {
985 (*source)++;
986 return 0;
987 }
988 else if (strcmp(errors,"replace") == 0) {
989 (*source)++;
990 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
991 (*dest)++;
992 return 0;
993 }
994 else {
995 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000996 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997 errors);
998 return -1;
999 }
1000}
1001
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002PyObject *PyUnicode_DecodeUTF8(const char *s,
1003 int size,
1004 const char *errors)
1005{
1006 int n;
1007 const char *e;
1008 PyUnicodeObject *unicode;
1009 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001010 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011
1012 /* Note: size will always be longer than the resulting Unicode
1013 character count */
1014 unicode = _PyUnicode_New(size);
1015 if (!unicode)
1016 return NULL;
1017 if (size == 0)
1018 return (PyObject *)unicode;
1019
1020 /* Unpack UTF-8 encoded data */
1021 p = unicode->str;
1022 e = s + size;
1023
1024 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026
1027 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001028 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 s++;
1030 continue;
1031 }
1032
1033 n = utf8_code_length[ch];
1034
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001035 if (s + n > e) {
1036 errmsg = "unexpected end of data";
1037 goto utf8Error;
1038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039
1040 switch (n) {
1041
1042 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001043 errmsg = "unexpected code byte";
1044 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
1046 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001047 errmsg = "internal error";
1048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049
1050 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001051 if ((s[1] & 0xc0) != 0x80) {
1052 errmsg = "invalid data";
1053 goto utf8Error;
1054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001056 if (ch < 0x80) {
1057 errmsg = "illegal encoding";
1058 goto utf8Error;
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001061 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062 break;
1063
1064 case 3:
1065 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001066 (s[2] & 0xc0) != 0x80) {
1067 errmsg = "invalid data";
1068 goto utf8Error;
1069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001071 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1072 errmsg = "illegal encoding";
1073 goto utf8Error;
1074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001076 *p++ = (Py_UNICODE)ch;
1077 break;
1078
1079 case 4:
1080 if ((s[1] & 0xc0) != 0x80 ||
1081 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001082 (s[3] & 0xc0) != 0x80) {
1083 errmsg = "invalid data";
1084 goto utf8Error;
1085 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001086 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1087 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1088 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001089 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001090 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001091 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001092 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 errmsg = "illegal encoding";
1095 goto utf8Error;
1096 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001097#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001098 *p++ = (Py_UNICODE)ch;
1099#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001100 /* compute and append the two surrogates: */
1101
1102 /* translate from 10000..10FFFF to 0..FFFF */
1103 ch -= 0x10000;
1104
1105 /* high surrogate = top 10 bits added to D800 */
1106 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1107
1108 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001109 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001110#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 break;
1112
1113 default:
1114 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 errmsg = "unsupported Unicode code range";
1116 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 }
1118 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 continue;
1120
1121 utf8Error:
1122 if (utf8_decoding_error(&s, &p, errors, errmsg))
1123 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 }
1125
1126 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001127 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 goto onError;
1129
1130 return (PyObject *)unicode;
1131
1132onError:
1133 Py_DECREF(unicode);
1134 return NULL;
1135}
1136
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference: it applies the errors
   scheme to a UTF-8 encoding problem ("strict" raises UnicodeError,
   "ignore" does nothing, "replace" stores '?' at **dest and advances
   *dest, anything else raises ValueError). */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170
1171PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1172 int size,
1173 const char *errors)
1174{
1175 PyObject *v;
1176 char *p;
1177 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001178 Py_UCS4 ch2;
1179 unsigned int cbAllocated = 3 * size;
1180 unsigned int cbWritten = 0;
1181 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001183 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 if (v == NULL)
1185 return NULL;
1186 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001187 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 while (i < size) {
1191 Py_UCS4 ch = s[i++];
1192 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001194 cbWritten++;
1195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 else if (ch < 0x0800) {
1197 *p++ = 0xc0 | (ch >> 6);
1198 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001199 cbWritten += 2;
1200 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001201 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001202 /* Check for high surrogate */
1203 if (0xD800 <= ch && ch <= 0xDBFF) {
1204 if (i != size) {
1205 ch2 = s[i];
1206 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1207
1208 if (cbWritten >= (cbAllocated - 4)) {
1209 /* Provide enough room for some more
1210 surrogates */
1211 cbAllocated += 4*10;
1212 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001213 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001214 }
1215
1216 /* combine the two values */
1217 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1218
1219 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001220 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001221 i++;
1222 cbWritten += 4;
1223 }
1224 }
1225 }
1226 else {
1227 *p++ = (char)(0xe0 | (ch >> 12));
1228 cbWritten += 3;
1229 }
1230 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1231 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001232 } else {
1233 *p++ = 0xf0 | (ch>>18);
1234 *p++ = 0x80 | ((ch>>12) & 0x3f);
1235 *p++ = 0x80 | ((ch>>6) & 0x3f);
1236 *p++ = 0x80 | (ch & 0x3f);
1237 cbWritten += 4;
1238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001241 if (_PyString_Resize(&v, p - q))
1242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 return v;
1244
1245 onError:
1246 Py_DECREF(v);
1247 return NULL;
1248}
1249
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1251{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 if (!PyUnicode_Check(unicode)) {
1253 PyErr_BadArgument();
1254 return NULL;
1255 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001256 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1257 PyUnicode_GET_SIZE(unicode),
1258 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259}
1260
1261/* --- UTF-16 Codec ------------------------------------------------------- */
1262
1263static
Tim Peters772747b2001-08-09 22:21:55 +00001264int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 const char *errors,
1266 const char *details)
1267{
1268 if ((errors == NULL) ||
1269 (strcmp(errors,"strict") == 0)) {
1270 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001271 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 details);
1273 return -1;
1274 }
1275 else if (strcmp(errors,"ignore") == 0) {
1276 return 0;
1277 }
1278 else if (strcmp(errors,"replace") == 0) {
1279 if (dest) {
1280 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1281 (*dest)++;
1282 }
1283 return 0;
1284 }
1285 else {
1286 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001287 "UTF-16 decoding error; "
1288 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 errors);
1290 return -1;
1291 }
1292}
1293
Tim Peters772747b2001-08-09 22:21:55 +00001294PyObject *
1295PyUnicode_DecodeUTF16(const char *s,
1296 int size,
1297 const char *errors,
1298 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299{
1300 PyUnicodeObject *unicode;
1301 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001302 const unsigned char *q, *e;
1303 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001304 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001305 /* Offsets from q for retrieving byte pairs in the right order. */
1306#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1307 int ihi = 1, ilo = 0;
1308#else
1309 int ihi = 0, ilo = 1;
1310#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311
1312 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001313 if (size & 1) {
1314 if (utf16_decoding_error(NULL, errors, "truncated data"))
1315 return NULL;
1316 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 }
1318
1319 /* Note: size will always be longer than the resulting Unicode
1320 character count */
1321 unicode = _PyUnicode_New(size);
1322 if (!unicode)
1323 return NULL;
1324 if (size == 0)
1325 return (PyObject *)unicode;
1326
1327 /* Unpack UTF-16 encoded data */
1328 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001329 q = (unsigned char *)s;
1330 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331
1332 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001333 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001335 /* Check for BOM marks (U+FEFF) in the input and adjust current
1336 byte order setting accordingly. In native mode, the leading BOM
1337 mark is skipped, in all other modes, it is copied to the output
1338 stream as-is (giving a ZWNBSP character). */
1339 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001340 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001341#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001342 if (bom == 0xFEFF) {
1343 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001344 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001345 }
1346 else if (bom == 0xFFFE) {
1347 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001348 bo = 1;
1349 }
1350#else
Tim Peters772747b2001-08-09 22:21:55 +00001351 if (bom == 0xFEFF) {
1352 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001353 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001354 }
1355 else if (bom == 0xFFFE) {
1356 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001357 bo = -1;
1358 }
1359#endif
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361
Tim Peters772747b2001-08-09 22:21:55 +00001362 if (bo == -1) {
1363 /* force LE */
1364 ihi = 1;
1365 ilo = 0;
1366 }
1367 else if (bo == 1) {
1368 /* force BE */
1369 ihi = 0;
1370 ilo = 1;
1371 }
1372
1373 while (q < e) {
1374 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1375 q += 2;
1376
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 if (ch < 0xD800 || ch > 0xDFFF) {
1378 *p++ = ch;
1379 continue;
1380 }
1381
1382 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001383 if (q >= e) {
1384 errmsg = "unexpected end of data";
1385 goto utf16Error;
1386 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001387 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001388 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1389 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001390 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001391#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001392 *p++ = ch;
1393 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001394#else
1395 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001396#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001397 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001398 }
1399 else {
1400 errmsg = "illegal UTF-16 surrogate";
1401 goto utf16Error;
1402 }
1403
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 errmsg = "illegal encoding";
1406 /* Fall through to report the error */
1407
1408 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001409 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001410 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 }
1412
1413 if (byteorder)
1414 *byteorder = bo;
1415
1416 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001417 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418 goto onError;
1419
1420 return (PyObject *)unicode;
1421
1422onError:
1423 Py_DECREF(unicode);
1424 return NULL;
1425}
1426
Tim Peters772747b2001-08-09 22:21:55 +00001427PyObject *
1428PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1429 int size,
1430 const char *errors,
1431 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432{
1433 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001434 unsigned char *p;
1435 int i, pairs;
1436 /* Offsets from p for storing byte pairs in the right order. */
1437#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1438 int ihi = 1, ilo = 0;
1439#else
1440 int ihi = 0, ilo = 1;
1441#endif
1442
1443#define STORECHAR(CH) \
1444 do { \
1445 p[ihi] = ((CH) >> 8) & 0xff; \
1446 p[ilo] = (CH) & 0xff; \
1447 p += 2; \
1448 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001450 for (i = pairs = 0; i < size; i++)
1451 if (s[i] >= 0x10000)
1452 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001454 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 if (v == NULL)
1456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457
Tim Peters772747b2001-08-09 22:21:55 +00001458 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001460 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001461 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001462 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001463
1464 if (byteorder == -1) {
1465 /* force LE */
1466 ihi = 1;
1467 ilo = 0;
1468 }
1469 else if (byteorder == 1) {
1470 /* force BE */
1471 ihi = 0;
1472 ilo = 1;
1473 }
1474
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001475 while (size-- > 0) {
1476 Py_UNICODE ch = *s++;
1477 Py_UNICODE ch2 = 0;
1478 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001479 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1480 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 }
Tim Peters772747b2001-08-09 22:21:55 +00001482 STORECHAR(ch);
1483 if (ch2)
1484 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001485 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001487#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488}
1489
1490PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1491{
1492 if (!PyUnicode_Check(unicode)) {
1493 PyErr_BadArgument();
1494 return NULL;
1495 }
1496 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1497 PyUnicode_GET_SIZE(unicode),
1498 NULL,
1499 0);
1500}
1501
1502/* --- Unicode Escape Codec ----------------------------------------------- */
1503
1504static
1505int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001506 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 const char *errors,
1508 const char *details)
1509{
1510 if ((errors == NULL) ||
1511 (strcmp(errors,"strict") == 0)) {
1512 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001513 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514 details);
1515 return -1;
1516 }
1517 else if (strcmp(errors,"ignore") == 0) {
1518 return 0;
1519 }
1520 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001521 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 return 0;
1523 }
1524 else {
1525 PyErr_Format(PyExc_ValueError,
1526 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001527 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001528 errors);
1529 return -1;
1530 }
1531}
1532
Fredrik Lundh06d12682001-01-24 07:59:11 +00001533static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001534
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1536 int size,
1537 const char *errors)
1538{
1539 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001540 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001542 char* message;
1543 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1544
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 /* Escaped strings will always be longer than the resulting
1546 Unicode string, so we start with size here and then reduce the
1547 length after conversion to the true value. */
1548 v = _PyUnicode_New(size);
1549 if (v == NULL)
1550 goto onError;
1551 if (size == 0)
1552 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 p = buf = PyUnicode_AS_UNICODE(v);
1555 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001556
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 while (s < end) {
1558 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001559 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001560 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
1562 /* Non-escape characters are interpreted as Unicode ordinals */
1563 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001564 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 continue;
1566 }
1567
1568 /* \ - Escapes */
1569 s++;
1570 switch (*s++) {
1571
1572 /* \x escapes */
1573 case '\n': break;
1574 case '\\': *p++ = '\\'; break;
1575 case '\'': *p++ = '\''; break;
1576 case '\"': *p++ = '\"'; break;
1577 case 'b': *p++ = '\b'; break;
1578 case 'f': *p++ = '\014'; break; /* FF */
1579 case 't': *p++ = '\t'; break;
1580 case 'n': *p++ = '\n'; break;
1581 case 'r': *p++ = '\r'; break;
1582 case 'v': *p++ = '\013'; break; /* VT */
1583 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1584
1585 /* \OOO (octal) escapes */
1586 case '0': case '1': case '2': case '3':
1587 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001588 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001590 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001592 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001594 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 break;
1596
Fredrik Lundhccc74732001-02-18 22:13:49 +00001597 /* hex escapes */
1598 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001600 digits = 2;
1601 message = "truncated \\xXX escape";
1602 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603
Fredrik Lundhccc74732001-02-18 22:13:49 +00001604 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001606 digits = 4;
1607 message = "truncated \\uXXXX escape";
1608 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609
Fredrik Lundhccc74732001-02-18 22:13:49 +00001610 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001611 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001612 digits = 8;
1613 message = "truncated \\UXXXXXXXX escape";
1614 hexescape:
1615 chr = 0;
1616 for (i = 0; i < digits; i++) {
1617 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001618 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001619 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001620 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001621 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001622 i++;
1623 break;
1624 }
1625 chr = (chr<<4) & ~0xF;
1626 if (c >= '0' && c <= '9')
1627 chr += c - '0';
1628 else if (c >= 'a' && c <= 'f')
1629 chr += 10 + c - 'a';
1630 else
1631 chr += 10 + c - 'A';
1632 }
1633 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001634 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001635 /* when we get here, chr is a 32-bit unicode character */
1636 if (chr <= 0xffff)
1637 /* UCS-2 character */
1638 *p++ = (Py_UNICODE) chr;
1639 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001640 /* UCS-4 character. Either store directly, or as
1641 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001642#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001643 *p++ = chr;
1644#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 chr -= 0x10000L;
1646 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001647 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 } else {
1650 if (unicodeescape_decoding_error(
1651 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001652 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001653 )
1654 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001655 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001656 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657 break;
1658
1659 /* \N{name} */
1660 case 'N':
1661 message = "malformed \\N character escape";
1662 if (ucnhash_CAPI == NULL) {
1663 /* load the unicode data module */
1664 PyObject *m, *v;
1665 m = PyImport_ImportModule("unicodedata");
1666 if (m == NULL)
1667 goto ucnhashError;
1668 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1669 Py_DECREF(m);
1670 if (v == NULL)
1671 goto ucnhashError;
1672 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1673 Py_DECREF(v);
1674 if (ucnhash_CAPI == NULL)
1675 goto ucnhashError;
1676 }
1677 if (*s == '{') {
1678 const char *start = s+1;
1679 /* look for the closing brace */
1680 while (*s != '}' && s < end)
1681 s++;
1682 if (s > start && s < end && *s == '}') {
1683 /* found a name. look it up in the unicode database */
1684 message = "unknown Unicode character name";
1685 s++;
1686 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1687 goto store;
1688 }
1689 }
1690 if (unicodeescape_decoding_error(&s, &x, errors, message))
1691 goto onError;
1692 *p++ = x;
1693 break;
1694
1695 default:
1696 *p++ = '\\';
1697 *p++ = (unsigned char)s[-1];
1698 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 }
1700 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001701 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001702 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 return (PyObject *)v;
1704
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001706 PyErr_SetString(
1707 PyExc_UnicodeError,
1708 "\\N escapes not supported (can't load unicodedata module)"
1709 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001710 return NULL;
1711
Fredrik Lundhccc74732001-02-18 22:13:49 +00001712onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 Py_XDECREF(v);
1714 return NULL;
1715}
1716
1717/* Return a Unicode-Escape string version of the Unicode object.
1718
1719 If quotes is true, the string is enclosed in u"" or u'' quotes as
1720 appropriate.
1721
1722*/
1723
Barry Warsaw51ac5802000-03-20 16:36:48 +00001724static const Py_UNICODE *findchar(const Py_UNICODE *s,
1725 int size,
1726 Py_UNICODE ch);
1727
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728static
1729PyObject *unicodeescape_string(const Py_UNICODE *s,
1730 int size,
1731 int quotes)
1732{
1733 PyObject *repr;
1734 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001736 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737
1738 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1739 if (repr == NULL)
1740 return NULL;
1741
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001742 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743
1744 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 *p++ = 'u';
1746 *p++ = (findchar(s, size, '\'') &&
1747 !findchar(s, size, '"')) ? '"' : '\'';
1748 }
1749 while (size-- > 0) {
1750 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001751
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001753 if (quotes &&
1754 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 *p++ = '\\';
1756 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001757 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001759
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001760#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001761 /* Map 21-bit characters to '\U00xxxxxx' */
1762 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001763 int offset = p - PyString_AS_STRING(repr);
1764
1765 /* Resize the string if necessary */
1766 if (offset + 12 > PyString_GET_SIZE(repr)) {
1767 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1768 goto onError;
1769 p = PyString_AS_STRING(repr) + offset;
1770 }
1771
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001772 *p++ = '\\';
1773 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001774 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1777 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1778 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1780 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001781 *p++ = hexdigit[ch & 0x0000000F];
1782 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001783 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001784#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001785 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1786 else if (ch >= 0xD800 && ch < 0xDC00) {
1787 Py_UNICODE ch2;
1788 Py_UCS4 ucs;
1789
1790 ch2 = *s++;
1791 size--;
1792 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1793 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1794 *p++ = '\\';
1795 *p++ = 'U';
1796 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1799 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1800 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1802 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1803 *p++ = hexdigit[ucs & 0x0000000F];
1804 continue;
1805 }
1806 /* Fall through: isolated surrogates are copied as-is */
1807 s--;
1808 size++;
1809 }
1810
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001812 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 *p++ = '\\';
1814 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001815 *p++ = hexdigit[(ch >> 12) & 0x000F];
1816 *p++ = hexdigit[(ch >> 8) & 0x000F];
1817 *p++ = hexdigit[(ch >> 4) & 0x000F];
1818 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001820
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001821 /* Map special whitespace to '\t', \n', '\r' */
1822 else if (ch == '\t') {
1823 *p++ = '\\';
1824 *p++ = 't';
1825 }
1826 else if (ch == '\n') {
1827 *p++ = '\\';
1828 *p++ = 'n';
1829 }
1830 else if (ch == '\r') {
1831 *p++ = '\\';
1832 *p++ = 'r';
1833 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001834
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001835 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 else if (ch < ' ' || ch >= 128) {
1837 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001838 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001839 *p++ = hexdigit[(ch >> 4) & 0x000F];
1840 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001842
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 /* Copy everything else as-is */
1844 else
1845 *p++ = (char) ch;
1846 }
1847 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001848 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
1850 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001851 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001852 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853
1854 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001855
1856 onError:
1857 Py_DECREF(repr);
1858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859}
1860
1861PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1862 int size)
1863{
1864 return unicodeescape_string(s, size, 0);
1865}
1866
1867PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1868{
1869 if (!PyUnicode_Check(unicode)) {
1870 PyErr_BadArgument();
1871 return NULL;
1872 }
1873 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1874 PyUnicode_GET_SIZE(unicode));
1875}
1876
1877/* --- Raw Unicode Escape Codec ------------------------------------------- */
1878
1879PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1880 int size,
1881 const char *errors)
1882{
1883 PyUnicodeObject *v;
1884 Py_UNICODE *p, *buf;
1885 const char *end;
1886 const char *bs;
1887
1888 /* Escaped strings will always be longer than the resulting
1889 Unicode string, so we start with size here and then reduce the
1890 length after conversion to the true value. */
1891 v = _PyUnicode_New(size);
1892 if (v == NULL)
1893 goto onError;
1894 if (size == 0)
1895 return (PyObject *)v;
1896 p = buf = PyUnicode_AS_UNICODE(v);
1897 end = s + size;
1898 while (s < end) {
1899 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001900 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 int i;
1902
1903 /* Non-escape characters are interpreted as Unicode ordinals */
1904 if (*s != '\\') {
1905 *p++ = (unsigned char)*s++;
1906 continue;
1907 }
1908
1909 /* \u-escapes are only interpreted iff the number of leading
1910 backslashes if odd */
1911 bs = s;
1912 for (;s < end;) {
1913 if (*s != '\\')
1914 break;
1915 *p++ = (unsigned char)*s++;
1916 }
1917 if (((s - bs) & 1) == 0 ||
1918 s >= end ||
1919 *s != 'u') {
1920 continue;
1921 }
1922 p--;
1923 s++;
1924
1925 /* \uXXXX with 4 hex digits */
1926 for (x = 0, i = 0; i < 4; i++) {
1927 c = (unsigned char)s[i];
1928 if (!isxdigit(c)) {
1929 if (unicodeescape_decoding_error(&s, &x, errors,
1930 "truncated \\uXXXX"))
1931 goto onError;
1932 i++;
1933 break;
1934 }
1935 x = (x<<4) & ~0xF;
1936 if (c >= '0' && c <= '9')
1937 x += c - '0';
1938 else if (c >= 'a' && c <= 'f')
1939 x += 10 + c - 'a';
1940 else
1941 x += 10 + c - 'A';
1942 }
1943 s += i;
1944 *p++ = x;
1945 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001946 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001947 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 return (PyObject *)v;
1949
1950 onError:
1951 Py_XDECREF(v);
1952 return NULL;
1953}
1954
1955PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1956 int size)
1957{
1958 PyObject *repr;
1959 char *p;
1960 char *q;
1961
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001962 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 repr = PyString_FromStringAndSize(NULL, 6 * size);
1965 if (repr == NULL)
1966 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001967 if (size == 0)
1968 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969
1970 p = q = PyString_AS_STRING(repr);
1971 while (size-- > 0) {
1972 Py_UNICODE ch = *s++;
1973 /* Map 16-bit characters to '\uxxxx' */
1974 if (ch >= 256) {
1975 *p++ = '\\';
1976 *p++ = 'u';
1977 *p++ = hexdigit[(ch >> 12) & 0xf];
1978 *p++ = hexdigit[(ch >> 8) & 0xf];
1979 *p++ = hexdigit[(ch >> 4) & 0xf];
1980 *p++ = hexdigit[ch & 15];
1981 }
1982 /* Copy everything else as-is */
1983 else
1984 *p++ = (char) ch;
1985 }
1986 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001987 if (_PyString_Resize(&repr, p - q))
1988 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
1990 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001991
1992 onError:
1993 Py_DECREF(repr);
1994 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995}
1996
1997PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1998{
1999 if (!PyUnicode_Check(unicode)) {
2000 PyErr_BadArgument();
2001 return NULL;
2002 }
2003 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2004 PyUnicode_GET_SIZE(unicode));
2005}
2006
2007/* --- Latin-1 Codec ------------------------------------------------------ */
2008
2009PyObject *PyUnicode_DecodeLatin1(const char *s,
2010 int size,
2011 const char *errors)
2012{
2013 PyUnicodeObject *v;
2014 Py_UNICODE *p;
2015
2016 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002017 if (size == 1 && *(unsigned char*)s < 256) {
2018 Py_UNICODE r = *(unsigned char*)s;
2019 return PyUnicode_FromUnicode(&r, 1);
2020 }
2021
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 v = _PyUnicode_New(size);
2023 if (v == NULL)
2024 goto onError;
2025 if (size == 0)
2026 return (PyObject *)v;
2027 p = PyUnicode_AS_UNICODE(v);
2028 while (size-- > 0)
2029 *p++ = (unsigned char)*s++;
2030 return (PyObject *)v;
2031
2032 onError:
2033 Py_XDECREF(v);
2034 return NULL;
2035}
2036
2037static
2038int latin1_encoding_error(const Py_UNICODE **source,
2039 char **dest,
2040 const char *errors,
2041 const char *details)
2042{
2043 if ((errors == NULL) ||
2044 (strcmp(errors,"strict") == 0)) {
2045 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002046 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 details);
2048 return -1;
2049 }
2050 else if (strcmp(errors,"ignore") == 0) {
2051 return 0;
2052 }
2053 else if (strcmp(errors,"replace") == 0) {
2054 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002055 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return 0;
2057 }
2058 else {
2059 PyErr_Format(PyExc_ValueError,
2060 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002061 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 errors);
2063 return -1;
2064 }
2065}
2066
2067PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2068 int size,
2069 const char *errors)
2070{
2071 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002072 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002073
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 repr = PyString_FromStringAndSize(NULL, size);
2075 if (repr == NULL)
2076 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002077 if (size == 0)
2078 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079
2080 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002081 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 while (size-- > 0) {
2083 Py_UNICODE ch = *p++;
2084 if (ch >= 256) {
2085 if (latin1_encoding_error(&p, &s, errors,
2086 "ordinal not in range(256)"))
2087 goto onError;
2088 }
2089 else
2090 *s++ = (char)ch;
2091 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002092 /* Resize if error handling skipped some characters */
2093 if (s - start < PyString_GET_SIZE(repr))
2094 if (_PyString_Resize(&repr, s - start))
2095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 return repr;
2097
2098 onError:
2099 Py_DECREF(repr);
2100 return NULL;
2101}
2102
2103PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2104{
2105 if (!PyUnicode_Check(unicode)) {
2106 PyErr_BadArgument();
2107 return NULL;
2108 }
2109 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2110 PyUnicode_GET_SIZE(unicode),
2111 NULL);
2112}
2113
2114/* --- 7-bit ASCII Codec -------------------------------------------------- */
2115
2116static
2117int ascii_decoding_error(const char **source,
2118 Py_UNICODE **dest,
2119 const char *errors,
2120 const char *details)
2121{
2122 if ((errors == NULL) ||
2123 (strcmp(errors,"strict") == 0)) {
2124 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002125 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 details);
2127 return -1;
2128 }
2129 else if (strcmp(errors,"ignore") == 0) {
2130 return 0;
2131 }
2132 else if (strcmp(errors,"replace") == 0) {
2133 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2134 (*dest)++;
2135 return 0;
2136 }
2137 else {
2138 PyErr_Format(PyExc_ValueError,
2139 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002140 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 errors);
2142 return -1;
2143 }
2144}
2145
2146PyObject *PyUnicode_DecodeASCII(const char *s,
2147 int size,
2148 const char *errors)
2149{
2150 PyUnicodeObject *v;
2151 Py_UNICODE *p;
2152
2153 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002154 if (size == 1 && *(unsigned char*)s < 128) {
2155 Py_UNICODE r = *(unsigned char*)s;
2156 return PyUnicode_FromUnicode(&r, 1);
2157 }
2158
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 v = _PyUnicode_New(size);
2160 if (v == NULL)
2161 goto onError;
2162 if (size == 0)
2163 return (PyObject *)v;
2164 p = PyUnicode_AS_UNICODE(v);
2165 while (size-- > 0) {
2166 register unsigned char c;
2167
2168 c = (unsigned char)*s++;
2169 if (c < 128)
2170 *p++ = c;
2171 else if (ascii_decoding_error(&s, &p, errors,
2172 "ordinal not in range(128)"))
2173 goto onError;
2174 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002175 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002176 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002177 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 return (PyObject *)v;
2179
2180 onError:
2181 Py_XDECREF(v);
2182 return NULL;
2183}
2184
2185static
2186int ascii_encoding_error(const Py_UNICODE **source,
2187 char **dest,
2188 const char *errors,
2189 const char *details)
2190{
2191 if ((errors == NULL) ||
2192 (strcmp(errors,"strict") == 0)) {
2193 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002194 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 details);
2196 return -1;
2197 }
2198 else if (strcmp(errors,"ignore") == 0) {
2199 return 0;
2200 }
2201 else if (strcmp(errors,"replace") == 0) {
2202 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002203 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 return 0;
2205 }
2206 else {
2207 PyErr_Format(PyExc_ValueError,
2208 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002209 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 errors);
2211 return -1;
2212 }
2213}
2214
2215PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2216 int size,
2217 const char *errors)
2218{
2219 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002220 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002221
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 repr = PyString_FromStringAndSize(NULL, size);
2223 if (repr == NULL)
2224 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002225 if (size == 0)
2226 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227
2228 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002229 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 while (size-- > 0) {
2231 Py_UNICODE ch = *p++;
2232 if (ch >= 128) {
2233 if (ascii_encoding_error(&p, &s, errors,
2234 "ordinal not in range(128)"))
2235 goto onError;
2236 }
2237 else
2238 *s++ = (char)ch;
2239 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002240 /* Resize if error handling skipped some characters */
2241 if (s - start < PyString_GET_SIZE(repr))
2242 if (_PyString_Resize(&repr, s - start))
2243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 return repr;
2245
2246 onError:
2247 Py_DECREF(repr);
2248 return NULL;
2249}
2250
2251PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2252{
2253 if (!PyUnicode_Check(unicode)) {
2254 PyErr_BadArgument();
2255 return NULL;
2256 }
2257 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2258 PyUnicode_GET_SIZE(unicode),
2259 NULL);
2260}
2261
Fredrik Lundh30831632001-06-26 15:11:00 +00002262#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002263
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002264/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002265
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002266PyObject *PyUnicode_DecodeMBCS(const char *s,
2267 int size,
2268 const char *errors)
2269{
2270 PyUnicodeObject *v;
2271 Py_UNICODE *p;
2272
2273 /* First get the size of the result */
2274 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002275 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002276 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2277
2278 v = _PyUnicode_New(usize);
2279 if (v == NULL)
2280 return NULL;
2281 if (usize == 0)
2282 return (PyObject *)v;
2283 p = PyUnicode_AS_UNICODE(v);
2284 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2285 Py_DECREF(v);
2286 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2287 }
2288
2289 return (PyObject *)v;
2290}
2291
2292PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2293 int size,
2294 const char *errors)
2295{
2296 PyObject *repr;
2297 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002298 DWORD mbcssize;
2299
2300 /* If there are no characters, bail now! */
2301 if (size==0)
2302 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002303
2304 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002305 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002306 if (mbcssize==0)
2307 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2308
2309 repr = PyString_FromStringAndSize(NULL, mbcssize);
2310 if (repr == NULL)
2311 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002312 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002313 return repr;
2314
2315 /* Do the conversion */
2316 s = PyString_AS_STRING(repr);
2317 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2318 Py_DECREF(repr);
2319 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2320 }
2321 return repr;
2322}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002323
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002324#endif /* MS_WIN32 */
2325
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326/* --- Character Mapping Codec -------------------------------------------- */
2327
2328static
2329int charmap_decoding_error(const char **source,
2330 Py_UNICODE **dest,
2331 const char *errors,
2332 const char *details)
2333{
2334 if ((errors == NULL) ||
2335 (strcmp(errors,"strict") == 0)) {
2336 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002337 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 details);
2339 return -1;
2340 }
2341 else if (strcmp(errors,"ignore") == 0) {
2342 return 0;
2343 }
2344 else if (strcmp(errors,"replace") == 0) {
2345 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2346 (*dest)++;
2347 return 0;
2348 }
2349 else {
2350 PyErr_Format(PyExc_ValueError,
2351 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002352 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 errors);
2354 return -1;
2355 }
2356}
2357
2358PyObject *PyUnicode_DecodeCharmap(const char *s,
2359 int size,
2360 PyObject *mapping,
2361 const char *errors)
2362{
2363 PyUnicodeObject *v;
2364 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002365 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366
2367 /* Default to Latin-1 */
2368 if (mapping == NULL)
2369 return PyUnicode_DecodeLatin1(s, size, errors);
2370
2371 v = _PyUnicode_New(size);
2372 if (v == NULL)
2373 goto onError;
2374 if (size == 0)
2375 return (PyObject *)v;
2376 p = PyUnicode_AS_UNICODE(v);
2377 while (size-- > 0) {
2378 unsigned char ch = *s++;
2379 PyObject *w, *x;
2380
2381 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2382 w = PyInt_FromLong((long)ch);
2383 if (w == NULL)
2384 goto onError;
2385 x = PyObject_GetItem(mapping, w);
2386 Py_DECREF(w);
2387 if (x == NULL) {
2388 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002389 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002391 x = Py_None;
2392 Py_INCREF(x);
2393 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002394 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395 }
2396
2397 /* Apply mapping */
2398 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002399 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 if (value < 0 || value > 65535) {
2401 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002402 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 Py_DECREF(x);
2404 goto onError;
2405 }
2406 *p++ = (Py_UNICODE)value;
2407 }
2408 else if (x == Py_None) {
2409 /* undefined mapping */
2410 if (charmap_decoding_error(&s, &p, errors,
2411 "character maps to <undefined>")) {
2412 Py_DECREF(x);
2413 goto onError;
2414 }
2415 }
2416 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002417 int targetsize = PyUnicode_GET_SIZE(x);
2418
2419 if (targetsize == 1)
2420 /* 1-1 mapping */
2421 *p++ = *PyUnicode_AS_UNICODE(x);
2422
2423 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002425 if (targetsize > extrachars) {
2426 /* resize first */
2427 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2428 int needed = (targetsize - extrachars) + \
2429 (targetsize << 2);
2430 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002431 if (_PyUnicode_Resize(&v,
2432 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002433 Py_DECREF(x);
2434 goto onError;
2435 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002436 p = PyUnicode_AS_UNICODE(v) + oldpos;
2437 }
2438 Py_UNICODE_COPY(p,
2439 PyUnicode_AS_UNICODE(x),
2440 targetsize);
2441 p += targetsize;
2442 extrachars -= targetsize;
2443 }
2444 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 }
2446 else {
2447 /* wrong return value */
2448 PyErr_SetString(PyExc_TypeError,
2449 "character mapping must return integer, None or unicode");
2450 Py_DECREF(x);
2451 goto onError;
2452 }
2453 Py_DECREF(x);
2454 }
2455 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002456 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 goto onError;
2458 return (PyObject *)v;
2459
2460 onError:
2461 Py_XDECREF(v);
2462 return NULL;
2463}
2464
2465static
2466int charmap_encoding_error(const Py_UNICODE **source,
2467 char **dest,
2468 const char *errors,
2469 const char *details)
2470{
2471 if ((errors == NULL) ||
2472 (strcmp(errors,"strict") == 0)) {
2473 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002474 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 details);
2476 return -1;
2477 }
2478 else if (strcmp(errors,"ignore") == 0) {
2479 return 0;
2480 }
2481 else if (strcmp(errors,"replace") == 0) {
2482 **dest = '?';
2483 (*dest)++;
2484 return 0;
2485 }
2486 else {
2487 PyErr_Format(PyExc_ValueError,
2488 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002489 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 errors);
2491 return -1;
2492 }
2493}
2494
2495PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2496 int size,
2497 PyObject *mapping,
2498 const char *errors)
2499{
2500 PyObject *v;
2501 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002502 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503
2504 /* Default to Latin-1 */
2505 if (mapping == NULL)
2506 return PyUnicode_EncodeLatin1(p, size, errors);
2507
2508 v = PyString_FromStringAndSize(NULL, size);
2509 if (v == NULL)
2510 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002511 if (size == 0)
2512 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 s = PyString_AS_STRING(v);
2514 while (size-- > 0) {
2515 Py_UNICODE ch = *p++;
2516 PyObject *w, *x;
2517
2518 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2519 w = PyInt_FromLong((long)ch);
2520 if (w == NULL)
2521 goto onError;
2522 x = PyObject_GetItem(mapping, w);
2523 Py_DECREF(w);
2524 if (x == NULL) {
2525 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002526 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002528 x = Py_None;
2529 Py_INCREF(x);
2530 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002531 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 }
2533
2534 /* Apply mapping */
2535 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002536 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 if (value < 0 || value > 255) {
2538 PyErr_SetString(PyExc_TypeError,
2539 "character mapping must be in range(256)");
2540 Py_DECREF(x);
2541 goto onError;
2542 }
2543 *s++ = (char)value;
2544 }
2545 else if (x == Py_None) {
2546 /* undefined mapping */
2547 if (charmap_encoding_error(&p, &s, errors,
2548 "character maps to <undefined>")) {
2549 Py_DECREF(x);
2550 goto onError;
2551 }
2552 }
2553 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002554 int targetsize = PyString_GET_SIZE(x);
2555
2556 if (targetsize == 1)
2557 /* 1-1 mapping */
2558 *s++ = *PyString_AS_STRING(x);
2559
2560 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002562 if (targetsize > extrachars) {
2563 /* resize first */
2564 int oldpos = (int)(s - PyString_AS_STRING(v));
2565 int needed = (targetsize - extrachars) + \
2566 (targetsize << 2);
2567 extrachars += needed;
2568 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002569 Py_DECREF(x);
2570 goto onError;
2571 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002572 s = PyString_AS_STRING(v) + oldpos;
2573 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002574 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002575 s += targetsize;
2576 extrachars -= targetsize;
2577 }
2578 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 }
2580 else {
2581 /* wrong return value */
2582 PyErr_SetString(PyExc_TypeError,
2583 "character mapping must return integer, None or unicode");
2584 Py_DECREF(x);
2585 goto onError;
2586 }
2587 Py_DECREF(x);
2588 }
2589 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2590 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2591 goto onError;
2592 return v;
2593
2594 onError:
2595 Py_DECREF(v);
2596 return NULL;
2597}
2598
2599PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2600 PyObject *mapping)
2601{
2602 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2603 PyErr_BadArgument();
2604 return NULL;
2605 }
2606 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2607 PyUnicode_GET_SIZE(unicode),
2608 mapping,
2609 NULL);
2610}
2611
2612static
2613int translate_error(const Py_UNICODE **source,
2614 Py_UNICODE **dest,
2615 const char *errors,
2616 const char *details)
2617{
2618 if ((errors == NULL) ||
2619 (strcmp(errors,"strict") == 0)) {
2620 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002621 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 details);
2623 return -1;
2624 }
2625 else if (strcmp(errors,"ignore") == 0) {
2626 return 0;
2627 }
2628 else if (strcmp(errors,"replace") == 0) {
2629 **dest = '?';
2630 (*dest)++;
2631 return 0;
2632 }
2633 else {
2634 PyErr_Format(PyExc_ValueError,
2635 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002636 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 errors);
2638 return -1;
2639 }
2640}
2641
2642PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2643 int size,
2644 PyObject *mapping,
2645 const char *errors)
2646{
2647 PyUnicodeObject *v;
2648 Py_UNICODE *p;
2649
2650 if (mapping == NULL) {
2651 PyErr_BadArgument();
2652 return NULL;
2653 }
2654
2655 /* Output will never be longer than input */
2656 v = _PyUnicode_New(size);
2657 if (v == NULL)
2658 goto onError;
2659 if (size == 0)
2660 goto done;
2661 p = PyUnicode_AS_UNICODE(v);
2662 while (size-- > 0) {
2663 Py_UNICODE ch = *s++;
2664 PyObject *w, *x;
2665
2666 /* Get mapping */
2667 w = PyInt_FromLong(ch);
2668 if (w == NULL)
2669 goto onError;
2670 x = PyObject_GetItem(mapping, w);
2671 Py_DECREF(w);
2672 if (x == NULL) {
2673 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2674 /* No mapping found: default to 1-1 mapping */
2675 PyErr_Clear();
2676 *p++ = ch;
2677 continue;
2678 }
2679 goto onError;
2680 }
2681
2682 /* Apply mapping */
2683 if (PyInt_Check(x))
2684 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2685 else if (x == Py_None) {
2686 /* undefined mapping */
2687 if (translate_error(&s, &p, errors,
2688 "character maps to <undefined>")) {
2689 Py_DECREF(x);
2690 goto onError;
2691 }
2692 }
2693 else if (PyUnicode_Check(x)) {
2694 if (PyUnicode_GET_SIZE(x) != 1) {
2695 /* 1-n mapping */
2696 PyErr_SetString(PyExc_NotImplementedError,
2697 "1-n mappings are currently not implemented");
2698 Py_DECREF(x);
2699 goto onError;
2700 }
2701 *p++ = *PyUnicode_AS_UNICODE(x);
2702 }
2703 else {
2704 /* wrong return value */
2705 PyErr_SetString(PyExc_TypeError,
2706 "translate mapping must return integer, None or unicode");
2707 Py_DECREF(x);
2708 goto onError;
2709 }
2710 Py_DECREF(x);
2711 }
2712 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002713 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002714 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715
2716 done:
2717 return (PyObject *)v;
2718
2719 onError:
2720 Py_XDECREF(v);
2721 return NULL;
2722}
2723
2724PyObject *PyUnicode_Translate(PyObject *str,
2725 PyObject *mapping,
2726 const char *errors)
2727{
2728 PyObject *result;
2729
2730 str = PyUnicode_FromObject(str);
2731 if (str == NULL)
2732 goto onError;
2733 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2734 PyUnicode_GET_SIZE(str),
2735 mapping,
2736 errors);
2737 Py_DECREF(str);
2738 return result;
2739
2740 onError:
2741 Py_XDECREF(str);
2742 return NULL;
2743}
2744
Guido van Rossum9e896b32000-04-05 20:11:21 +00002745/* --- Decimal Encoder ---------------------------------------------------- */
2746
2747int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2748 int length,
2749 char *output,
2750 const char *errors)
2751{
2752 Py_UNICODE *p, *end;
2753
2754 if (output == NULL) {
2755 PyErr_BadArgument();
2756 return -1;
2757 }
2758
2759 p = s;
2760 end = s + length;
2761 while (p < end) {
2762 register Py_UNICODE ch = *p++;
2763 int decimal;
2764
2765 if (Py_UNICODE_ISSPACE(ch)) {
2766 *output++ = ' ';
2767 continue;
2768 }
2769 decimal = Py_UNICODE_TODECIMAL(ch);
2770 if (decimal >= 0) {
2771 *output++ = '0' + decimal;
2772 continue;
2773 }
Guido van Rossumba477042000-04-06 18:18:10 +00002774 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002775 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002776 continue;
2777 }
2778 /* All other characters are considered invalid */
2779 if (errors == NULL || strcmp(errors, "strict") == 0) {
2780 PyErr_SetString(PyExc_ValueError,
2781 "invalid decimal Unicode string");
2782 goto onError;
2783 }
2784 else if (strcmp(errors, "ignore") == 0)
2785 continue;
2786 else if (strcmp(errors, "replace") == 0) {
2787 *output++ = '?';
2788 continue;
2789 }
2790 }
2791 /* 0-terminate the output string */
2792 *output++ = '\0';
2793 return 0;
2794
2795 onError:
2796 return -1;
2797}
2798
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799/* --- Helpers ------------------------------------------------------------ */
2800
2801static
2802int count(PyUnicodeObject *self,
2803 int start,
2804 int end,
2805 PyUnicodeObject *substring)
2806{
2807 int count = 0;
2808
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002809 if (start < 0)
2810 start += self->length;
2811 if (start < 0)
2812 start = 0;
2813 if (end > self->length)
2814 end = self->length;
2815 if (end < 0)
2816 end += self->length;
2817 if (end < 0)
2818 end = 0;
2819
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002820 if (substring->length == 0)
2821 return (end - start + 1);
2822
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 end -= substring->length;
2824
2825 while (start <= end)
2826 if (Py_UNICODE_MATCH(self, start, substring)) {
2827 count++;
2828 start += substring->length;
2829 } else
2830 start++;
2831
2832 return count;
2833}
2834
2835int PyUnicode_Count(PyObject *str,
2836 PyObject *substr,
2837 int start,
2838 int end)
2839{
2840 int result;
2841
2842 str = PyUnicode_FromObject(str);
2843 if (str == NULL)
2844 return -1;
2845 substr = PyUnicode_FromObject(substr);
2846 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002847 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 return -1;
2849 }
2850
2851 result = count((PyUnicodeObject *)str,
2852 start, end,
2853 (PyUnicodeObject *)substr);
2854
2855 Py_DECREF(str);
2856 Py_DECREF(substr);
2857 return result;
2858}
2859
2860static
2861int findstring(PyUnicodeObject *self,
2862 PyUnicodeObject *substring,
2863 int start,
2864 int end,
2865 int direction)
2866{
2867 if (start < 0)
2868 start += self->length;
2869 if (start < 0)
2870 start = 0;
2871
2872 if (substring->length == 0)
2873 return start;
2874
2875 if (end > self->length)
2876 end = self->length;
2877 if (end < 0)
2878 end += self->length;
2879 if (end < 0)
2880 end = 0;
2881
2882 end -= substring->length;
2883
2884 if (direction < 0) {
2885 for (; end >= start; end--)
2886 if (Py_UNICODE_MATCH(self, end, substring))
2887 return end;
2888 } else {
2889 for (; start <= end; start++)
2890 if (Py_UNICODE_MATCH(self, start, substring))
2891 return start;
2892 }
2893
2894 return -1;
2895}
2896
2897int PyUnicode_Find(PyObject *str,
2898 PyObject *substr,
2899 int start,
2900 int end,
2901 int direction)
2902{
2903 int result;
2904
2905 str = PyUnicode_FromObject(str);
2906 if (str == NULL)
2907 return -1;
2908 substr = PyUnicode_FromObject(substr);
2909 if (substr == NULL) {
2910 Py_DECREF(substr);
2911 return -1;
2912 }
2913
2914 result = findstring((PyUnicodeObject *)str,
2915 (PyUnicodeObject *)substr,
2916 start, end, direction);
2917 Py_DECREF(str);
2918 Py_DECREF(substr);
2919 return result;
2920}
2921
2922static
2923int tailmatch(PyUnicodeObject *self,
2924 PyUnicodeObject *substring,
2925 int start,
2926 int end,
2927 int direction)
2928{
2929 if (start < 0)
2930 start += self->length;
2931 if (start < 0)
2932 start = 0;
2933
2934 if (substring->length == 0)
2935 return 1;
2936
2937 if (end > self->length)
2938 end = self->length;
2939 if (end < 0)
2940 end += self->length;
2941 if (end < 0)
2942 end = 0;
2943
2944 end -= substring->length;
2945 if (end < start)
2946 return 0;
2947
2948 if (direction > 0) {
2949 if (Py_UNICODE_MATCH(self, end, substring))
2950 return 1;
2951 } else {
2952 if (Py_UNICODE_MATCH(self, start, substring))
2953 return 1;
2954 }
2955
2956 return 0;
2957}
2958
2959int PyUnicode_Tailmatch(PyObject *str,
2960 PyObject *substr,
2961 int start,
2962 int end,
2963 int direction)
2964{
2965 int result;
2966
2967 str = PyUnicode_FromObject(str);
2968 if (str == NULL)
2969 return -1;
2970 substr = PyUnicode_FromObject(substr);
2971 if (substr == NULL) {
2972 Py_DECREF(substr);
2973 return -1;
2974 }
2975
2976 result = tailmatch((PyUnicodeObject *)str,
2977 (PyUnicodeObject *)substr,
2978 start, end, direction);
2979 Py_DECREF(str);
2980 Py_DECREF(substr);
2981 return result;
2982}
2983
2984static
2985const Py_UNICODE *findchar(const Py_UNICODE *s,
2986 int size,
2987 Py_UNICODE ch)
2988{
2989 /* like wcschr, but doesn't stop at NULL characters */
2990
2991 while (size-- > 0) {
2992 if (*s == ch)
2993 return s;
2994 s++;
2995 }
2996
2997 return NULL;
2998}
2999
3000/* Apply fixfct filter to the Unicode object self and return a
3001 reference to the modified object */
3002
3003static
3004PyObject *fixup(PyUnicodeObject *self,
3005 int (*fixfct)(PyUnicodeObject *s))
3006{
3007
3008 PyUnicodeObject *u;
3009
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003010 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 if (u == NULL)
3012 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003013
3014 Py_UNICODE_COPY(u->str, self->str, self->length);
3015
Tim Peters7a29bd52001-09-12 03:03:31 +00003016 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 /* fixfct should return TRUE if it modified the buffer. If
3018 FALSE, return a reference to the original buffer instead
3019 (to save space, not time) */
3020 Py_INCREF(self);
3021 Py_DECREF(u);
3022 return (PyObject*) self;
3023 }
3024 return (PyObject*) u;
3025}
3026
3027static
3028int fixupper(PyUnicodeObject *self)
3029{
3030 int len = self->length;
3031 Py_UNICODE *s = self->str;
3032 int status = 0;
3033
3034 while (len-- > 0) {
3035 register Py_UNICODE ch;
3036
3037 ch = Py_UNICODE_TOUPPER(*s);
3038 if (ch != *s) {
3039 status = 1;
3040 *s = ch;
3041 }
3042 s++;
3043 }
3044
3045 return status;
3046}
3047
3048static
3049int fixlower(PyUnicodeObject *self)
3050{
3051 int len = self->length;
3052 Py_UNICODE *s = self->str;
3053 int status = 0;
3054
3055 while (len-- > 0) {
3056 register Py_UNICODE ch;
3057
3058 ch = Py_UNICODE_TOLOWER(*s);
3059 if (ch != *s) {
3060 status = 1;
3061 *s = ch;
3062 }
3063 s++;
3064 }
3065
3066 return status;
3067}
3068
3069static
3070int fixswapcase(PyUnicodeObject *self)
3071{
3072 int len = self->length;
3073 Py_UNICODE *s = self->str;
3074 int status = 0;
3075
3076 while (len-- > 0) {
3077 if (Py_UNICODE_ISUPPER(*s)) {
3078 *s = Py_UNICODE_TOLOWER(*s);
3079 status = 1;
3080 } else if (Py_UNICODE_ISLOWER(*s)) {
3081 *s = Py_UNICODE_TOUPPER(*s);
3082 status = 1;
3083 }
3084 s++;
3085 }
3086
3087 return status;
3088}
3089
3090static
3091int fixcapitalize(PyUnicodeObject *self)
3092{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003093 int len = self->length;
3094 Py_UNICODE *s = self->str;
3095 int status = 0;
3096
3097 if (len == 0)
3098 return 0;
3099 if (Py_UNICODE_ISLOWER(*s)) {
3100 *s = Py_UNICODE_TOUPPER(*s);
3101 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003103 s++;
3104 while (--len > 0) {
3105 if (Py_UNICODE_ISUPPER(*s)) {
3106 *s = Py_UNICODE_TOLOWER(*s);
3107 status = 1;
3108 }
3109 s++;
3110 }
3111 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112}
3113
3114static
3115int fixtitle(PyUnicodeObject *self)
3116{
3117 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3118 register Py_UNICODE *e;
3119 int previous_is_cased;
3120
3121 /* Shortcut for single character strings */
3122 if (PyUnicode_GET_SIZE(self) == 1) {
3123 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3124 if (*p != ch) {
3125 *p = ch;
3126 return 1;
3127 }
3128 else
3129 return 0;
3130 }
3131
3132 e = p + PyUnicode_GET_SIZE(self);
3133 previous_is_cased = 0;
3134 for (; p < e; p++) {
3135 register const Py_UNICODE ch = *p;
3136
3137 if (previous_is_cased)
3138 *p = Py_UNICODE_TOLOWER(ch);
3139 else
3140 *p = Py_UNICODE_TOTITLE(ch);
3141
3142 if (Py_UNICODE_ISLOWER(ch) ||
3143 Py_UNICODE_ISUPPER(ch) ||
3144 Py_UNICODE_ISTITLE(ch))
3145 previous_is_cased = 1;
3146 else
3147 previous_is_cased = 0;
3148 }
3149 return 1;
3150}
3151
3152PyObject *PyUnicode_Join(PyObject *separator,
3153 PyObject *seq)
3154{
3155 Py_UNICODE *sep;
3156 int seplen;
3157 PyUnicodeObject *res = NULL;
3158 int reslen = 0;
3159 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 int sz = 100;
3161 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003162 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163
Tim Peters2cfe3682001-05-05 05:36:48 +00003164 it = PyObject_GetIter(seq);
3165 if (it == NULL)
3166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167
3168 if (separator == NULL) {
3169 Py_UNICODE blank = ' ';
3170 sep = &blank;
3171 seplen = 1;
3172 }
3173 else {
3174 separator = PyUnicode_FromObject(separator);
3175 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003176 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 sep = PyUnicode_AS_UNICODE(separator);
3178 seplen = PyUnicode_GET_SIZE(separator);
3179 }
3180
3181 res = _PyUnicode_New(sz);
3182 if (res == NULL)
3183 goto onError;
3184 p = PyUnicode_AS_UNICODE(res);
3185 reslen = 0;
3186
Tim Peters2cfe3682001-05-05 05:36:48 +00003187 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003189 PyObject *item = PyIter_Next(it);
3190 if (item == NULL) {
3191 if (PyErr_Occurred())
3192 goto onError;
3193 break;
3194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 if (!PyUnicode_Check(item)) {
3196 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003197 if (!PyString_Check(item)) {
3198 PyErr_Format(PyExc_TypeError,
3199 "sequence item %i: expected string or Unicode,"
3200 " %.80s found",
3201 i, item->ob_type->tp_name);
3202 Py_DECREF(item);
3203 goto onError;
3204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 v = PyUnicode_FromObject(item);
3206 Py_DECREF(item);
3207 item = v;
3208 if (item == NULL)
3209 goto onError;
3210 }
3211 itemlen = PyUnicode_GET_SIZE(item);
3212 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003213 if (_PyUnicode_Resize(&res, sz*2)) {
3214 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 sz *= 2;
3218 p = PyUnicode_AS_UNICODE(res) + reslen;
3219 }
3220 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003221 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 p += seplen;
3223 reslen += seplen;
3224 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003225 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 p += itemlen;
3227 reslen += itemlen;
3228 Py_DECREF(item);
3229 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003230 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 goto onError;
3232
3233 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003234 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 return (PyObject *)res;
3236
3237 onError:
3238 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003239 Py_XDECREF(res);
3240 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 return NULL;
3242}
3243
3244static
3245PyUnicodeObject *pad(PyUnicodeObject *self,
3246 int left,
3247 int right,
3248 Py_UNICODE fill)
3249{
3250 PyUnicodeObject *u;
3251
3252 if (left < 0)
3253 left = 0;
3254 if (right < 0)
3255 right = 0;
3256
Tim Peters7a29bd52001-09-12 03:03:31 +00003257 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 Py_INCREF(self);
3259 return self;
3260 }
3261
3262 u = _PyUnicode_New(left + self->length + right);
3263 if (u) {
3264 if (left)
3265 Py_UNICODE_FILL(u->str, fill, left);
3266 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3267 if (right)
3268 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3269 }
3270
3271 return u;
3272}
3273
/* Append the slice data[left:right] to the list under construction.

   The macro relies on names provided by the expanding function: a
   PyObject *str scratch variable, the target list, and an onError label
   that is jumped to when the slice cannot be created or appended.  The
   reference held in str is released once the append has been attempted.

   The body is wrapped in do { } while (0) so that the macro behaves
   like a single statement, and the arguments are parenthesized so that
   arbitrary expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3284
3285static
3286PyObject *split_whitespace(PyUnicodeObject *self,
3287 PyObject *list,
3288 int maxcount)
3289{
3290 register int i;
3291 register int j;
3292 int len = self->length;
3293 PyObject *str;
3294
3295 for (i = j = 0; i < len; ) {
3296 /* find a token */
3297 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3298 i++;
3299 j = i;
3300 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3301 i++;
3302 if (j < i) {
3303 if (maxcount-- <= 0)
3304 break;
3305 SPLIT_APPEND(self->str, j, i);
3306 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3307 i++;
3308 j = i;
3309 }
3310 }
3311 if (j < len) {
3312 SPLIT_APPEND(self->str, j, len);
3313 }
3314 return list;
3315
3316 onError:
3317 Py_DECREF(list);
3318 return NULL;
3319}
3320
3321PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003322 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323{
3324 register int i;
3325 register int j;
3326 int len;
3327 PyObject *list;
3328 PyObject *str;
3329 Py_UNICODE *data;
3330
3331 string = PyUnicode_FromObject(string);
3332 if (string == NULL)
3333 return NULL;
3334 data = PyUnicode_AS_UNICODE(string);
3335 len = PyUnicode_GET_SIZE(string);
3336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 list = PyList_New(0);
3338 if (!list)
3339 goto onError;
3340
3341 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003342 int eol;
3343
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 /* Find a line and append it */
3345 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3346 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
3348 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003349 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 if (i < len) {
3351 if (data[i] == '\r' && i + 1 < len &&
3352 data[i+1] == '\n')
3353 i += 2;
3354 else
3355 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003356 if (keepends)
3357 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 }
Guido van Rossum86662912000-04-11 15:38:46 +00003359 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 j = i;
3361 }
3362 if (j < len) {
3363 SPLIT_APPEND(data, j, len);
3364 }
3365
3366 Py_DECREF(string);
3367 return list;
3368
3369 onError:
3370 Py_DECREF(list);
3371 Py_DECREF(string);
3372 return NULL;
3373}
3374
3375static
3376PyObject *split_char(PyUnicodeObject *self,
3377 PyObject *list,
3378 Py_UNICODE ch,
3379 int maxcount)
3380{
3381 register int i;
3382 register int j;
3383 int len = self->length;
3384 PyObject *str;
3385
3386 for (i = j = 0; i < len; ) {
3387 if (self->str[i] == ch) {
3388 if (maxcount-- <= 0)
3389 break;
3390 SPLIT_APPEND(self->str, j, i);
3391 i = j = i + 1;
3392 } else
3393 i++;
3394 }
3395 if (j <= len) {
3396 SPLIT_APPEND(self->str, j, len);
3397 }
3398 return list;
3399
3400 onError:
3401 Py_DECREF(list);
3402 return NULL;
3403}
3404
3405static
3406PyObject *split_substring(PyUnicodeObject *self,
3407 PyObject *list,
3408 PyUnicodeObject *substring,
3409 int maxcount)
3410{
3411 register int i;
3412 register int j;
3413 int len = self->length;
3414 int sublen = substring->length;
3415 PyObject *str;
3416
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003417 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 if (Py_UNICODE_MATCH(self, i, substring)) {
3419 if (maxcount-- <= 0)
3420 break;
3421 SPLIT_APPEND(self->str, j, i);
3422 i = j = i + sublen;
3423 } else
3424 i++;
3425 }
3426 if (j <= len) {
3427 SPLIT_APPEND(self->str, j, len);
3428 }
3429 return list;
3430
3431 onError:
3432 Py_DECREF(list);
3433 return NULL;
3434}
3435
3436#undef SPLIT_APPEND
3437
3438static
3439PyObject *split(PyUnicodeObject *self,
3440 PyUnicodeObject *substring,
3441 int maxcount)
3442{
3443 PyObject *list;
3444
3445 if (maxcount < 0)
3446 maxcount = INT_MAX;
3447
3448 list = PyList_New(0);
3449 if (!list)
3450 return NULL;
3451
3452 if (substring == NULL)
3453 return split_whitespace(self,list,maxcount);
3454
3455 else if (substring->length == 1)
3456 return split_char(self,list,substring->str[0],maxcount);
3457
3458 else if (substring->length == 0) {
3459 Py_DECREF(list);
3460 PyErr_SetString(PyExc_ValueError, "empty separator");
3461 return NULL;
3462 }
3463 else
3464 return split_substring(self,list,substring,maxcount);
3465}
3466
3467static
3468PyObject *strip(PyUnicodeObject *self,
3469 int left,
3470 int right)
3471{
3472 Py_UNICODE *p = self->str;
3473 int start = 0;
3474 int end = self->length;
3475
3476 if (left)
3477 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3478 start++;
3479
3480 if (right)
3481 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3482 end--;
3483
Tim Peters7a29bd52001-09-12 03:03:31 +00003484 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 /* couldn't strip anything off, return original string */
3486 Py_INCREF(self);
3487 return (PyObject*) self;
3488 }
3489
3490 return (PyObject*) PyUnicode_FromUnicode(
3491 self->str + start,
3492 end - start
3493 );
3494}
3495
3496static
3497PyObject *replace(PyUnicodeObject *self,
3498 PyUnicodeObject *str1,
3499 PyUnicodeObject *str2,
3500 int maxcount)
3501{
3502 PyUnicodeObject *u;
3503
3504 if (maxcount < 0)
3505 maxcount = INT_MAX;
3506
3507 if (str1->length == 1 && str2->length == 1) {
3508 int i;
3509
3510 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003511 if (!findchar(self->str, self->length, str1->str[0]) &&
3512 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 /* nothing to replace, return original string */
3514 Py_INCREF(self);
3515 u = self;
3516 } else {
3517 Py_UNICODE u1 = str1->str[0];
3518 Py_UNICODE u2 = str2->str[0];
3519
3520 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003521 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 self->length
3523 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003524 if (u != NULL) {
3525 Py_UNICODE_COPY(u->str, self->str,
3526 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 for (i = 0; i < u->length; i++)
3528 if (u->str[i] == u1) {
3529 if (--maxcount < 0)
3530 break;
3531 u->str[i] = u2;
3532 }
3533 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535
3536 } else {
3537 int n, i;
3538 Py_UNICODE *p;
3539
3540 /* replace strings */
3541 n = count(self, 0, self->length, str1);
3542 if (n > maxcount)
3543 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003544 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 /* nothing to replace, return original string */
3546 Py_INCREF(self);
3547 u = self;
3548 } else {
3549 u = _PyUnicode_New(
3550 self->length + n * (str2->length - str1->length));
3551 if (u) {
3552 i = 0;
3553 p = u->str;
3554 while (i <= self->length - str1->length)
3555 if (Py_UNICODE_MATCH(self, i, str1)) {
3556 /* replace string segment */
3557 Py_UNICODE_COPY(p, str2->str, str2->length);
3558 p += str2->length;
3559 i += str1->length;
3560 if (--n <= 0) {
3561 /* copy remaining part */
3562 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3563 break;
3564 }
3565 } else
3566 *p++ = self->str[i++];
3567 }
3568 }
3569 }
3570
3571 return (PyObject *) u;
3572}
3573
3574/* --- Unicode Object Methods --------------------------------------------- */
3575
3576static char title__doc__[] =
3577"S.title() -> unicode\n\
3578\n\
3579Return a titlecased version of S, i.e. words start with title case\n\
3580characters, all remaining cased characters have lower case.";
3581
3582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003583unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 return fixup(self, fixtitle);
3586}
3587
3588static char capitalize__doc__[] =
3589"S.capitalize() -> unicode\n\
3590\n\
3591Return a capitalized version of S, i.e. make the first character\n\
3592have upper case.";
3593
3594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003595unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 return fixup(self, fixcapitalize);
3598}
3599
3600#if 0
3601static char capwords__doc__[] =
3602"S.capwords() -> unicode\n\
3603\n\
3604Apply .capitalize() to all words in S and return the result with\n\
3605normalized whitespace (all whitespace strings are replaced by ' ').";
3606
3607static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003608unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609{
3610 PyObject *list;
3611 PyObject *item;
3612 int i;
3613
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 /* Split into words */
3615 list = split(self, NULL, -1);
3616 if (!list)
3617 return NULL;
3618
3619 /* Capitalize each word */
3620 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3621 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3622 fixcapitalize);
3623 if (item == NULL)
3624 goto onError;
3625 Py_DECREF(PyList_GET_ITEM(list, i));
3626 PyList_SET_ITEM(list, i, item);
3627 }
3628
3629 /* Join the words to form a new string */
3630 item = PyUnicode_Join(NULL, list);
3631
3632onError:
3633 Py_DECREF(list);
3634 return (PyObject *)item;
3635}
3636#endif
3637
3638static char center__doc__[] =
3639"S.center(width) -> unicode\n\
3640\n\
3641Return S centered in a Unicode string of length width. Padding is done\n\
3642using spaces.";
3643
3644static PyObject *
3645unicode_center(PyUnicodeObject *self, PyObject *args)
3646{
3647 int marg, left;
3648 int width;
3649
3650 if (!PyArg_ParseTuple(args, "i:center", &width))
3651 return NULL;
3652
Tim Peters7a29bd52001-09-12 03:03:31 +00003653 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 Py_INCREF(self);
3655 return (PyObject*) self;
3656 }
3657
3658 marg = width - self->length;
3659 left = marg / 2 + (marg & width & 1);
3660
3661 return (PyObject*) pad(self, left, marg - left, ' ');
3662}
3663
Marc-André Lemburge5034372000-08-08 08:04:29 +00003664#if 0
3665
3666/* This code should go into some future Unicode collation support
3667 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003668 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003669
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003670/* speedy UTF-16 code point order comparison */
3671/* gleaned from: */
3672/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3673
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003674static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003675{
3676 0, 0, 0, 0, 0, 0, 0, 0,
3677 0, 0, 0, 0, 0, 0, 0, 0,
3678 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003679 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003680};
3681
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682static int
3683unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3684{
3685 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003686
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 Py_UNICODE *s1 = str1->str;
3688 Py_UNICODE *s2 = str2->str;
3689
3690 len1 = str1->length;
3691 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003692
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003694 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003695
3696 c1 = *s1++;
3697 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003698
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003699 if (c1 > (1<<11) * 26)
3700 c1 += utf16Fixup[c1>>11];
3701 if (c2 > (1<<11) * 26)
3702 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003703 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003704
3705 if (c1 != c2)
3706 return (c1 < c2) ? -1 : 1;
3707
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003708 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 }
3710
3711 return (len1 < len2) ? -1 : (len1 != len2);
3712}
3713
Marc-André Lemburge5034372000-08-08 08:04:29 +00003714#else
3715
3716static int
3717unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3718{
3719 register int len1, len2;
3720
3721 Py_UNICODE *s1 = str1->str;
3722 Py_UNICODE *s2 = str2->str;
3723
3724 len1 = str1->length;
3725 len2 = str2->length;
3726
3727 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003728 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003729
Fredrik Lundh45714e92001-06-26 16:39:36 +00003730 c1 = *s1++;
3731 c2 = *s2++;
3732
3733 if (c1 != c2)
3734 return (c1 < c2) ? -1 : 1;
3735
Marc-André Lemburge5034372000-08-08 08:04:29 +00003736 len1--; len2--;
3737 }
3738
3739 return (len1 < len2) ? -1 : (len1 != len2);
3740}
3741
3742#endif
3743
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744int PyUnicode_Compare(PyObject *left,
3745 PyObject *right)
3746{
3747 PyUnicodeObject *u = NULL, *v = NULL;
3748 int result;
3749
3750 /* Coerce the two arguments */
3751 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3752 if (u == NULL)
3753 goto onError;
3754 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3755 if (v == NULL)
3756 goto onError;
3757
Thomas Wouters7e474022000-07-16 12:04:32 +00003758 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 if (v == u) {
3760 Py_DECREF(u);
3761 Py_DECREF(v);
3762 return 0;
3763 }
3764
3765 result = unicode_compare(u, v);
3766
3767 Py_DECREF(u);
3768 Py_DECREF(v);
3769 return result;
3770
3771onError:
3772 Py_XDECREF(u);
3773 Py_XDECREF(v);
3774 return -1;
3775}
3776
Guido van Rossum403d68b2000-03-13 15:55:09 +00003777int PyUnicode_Contains(PyObject *container,
3778 PyObject *element)
3779{
3780 PyUnicodeObject *u = NULL, *v = NULL;
3781 int result;
3782 register const Py_UNICODE *p, *e;
3783 register Py_UNICODE ch;
3784
3785 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003786 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003787 if (v == NULL) {
3788 PyErr_SetString(PyExc_TypeError,
3789 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003790 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003791 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003792 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3793 if (u == NULL) {
3794 Py_DECREF(v);
3795 goto onError;
3796 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003797
3798 /* Check v in u */
3799 if (PyUnicode_GET_SIZE(v) != 1) {
3800 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003801 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003802 goto onError;
3803 }
3804 ch = *PyUnicode_AS_UNICODE(v);
3805 p = PyUnicode_AS_UNICODE(u);
3806 e = p + PyUnicode_GET_SIZE(u);
3807 result = 0;
3808 while (p < e) {
3809 if (*p++ == ch) {
3810 result = 1;
3811 break;
3812 }
3813 }
3814
3815 Py_DECREF(u);
3816 Py_DECREF(v);
3817 return result;
3818
3819onError:
3820 Py_XDECREF(u);
3821 Py_XDECREF(v);
3822 return -1;
3823}
3824
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825/* Concat to string or Unicode object giving a new Unicode object. */
3826
3827PyObject *PyUnicode_Concat(PyObject *left,
3828 PyObject *right)
3829{
3830 PyUnicodeObject *u = NULL, *v = NULL, *w;
3831
3832 /* Coerce the two arguments */
3833 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3834 if (u == NULL)
3835 goto onError;
3836 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3837 if (v == NULL)
3838 goto onError;
3839
3840 /* Shortcuts */
3841 if (v == unicode_empty) {
3842 Py_DECREF(v);
3843 return (PyObject *)u;
3844 }
3845 if (u == unicode_empty) {
3846 Py_DECREF(u);
3847 return (PyObject *)v;
3848 }
3849
3850 /* Concat the two Unicode strings */
3851 w = _PyUnicode_New(u->length + v->length);
3852 if (w == NULL)
3853 goto onError;
3854 Py_UNICODE_COPY(w->str, u->str, u->length);
3855 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3856
3857 Py_DECREF(u);
3858 Py_DECREF(v);
3859 return (PyObject *)w;
3860
3861onError:
3862 Py_XDECREF(u);
3863 Py_XDECREF(v);
3864 return NULL;
3865}
3866
3867static char count__doc__[] =
3868"S.count(sub[, start[, end]]) -> int\n\
3869\n\
3870Return the number of occurrences of substring sub in Unicode string\n\
3871S[start:end]. Optional arguments start and end are\n\
3872interpreted as in slice notation.";
3873
3874static PyObject *
3875unicode_count(PyUnicodeObject *self, PyObject *args)
3876{
3877 PyUnicodeObject *substring;
3878 int start = 0;
3879 int end = INT_MAX;
3880 PyObject *result;
3881
Guido van Rossumb8872e62000-05-09 14:14:27 +00003882 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3883 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 return NULL;
3885
3886 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3887 (PyObject *)substring);
3888 if (substring == NULL)
3889 return NULL;
3890
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 if (start < 0)
3892 start += self->length;
3893 if (start < 0)
3894 start = 0;
3895 if (end > self->length)
3896 end = self->length;
3897 if (end < 0)
3898 end += self->length;
3899 if (end < 0)
3900 end = 0;
3901
3902 result = PyInt_FromLong((long) count(self, start, end, substring));
3903
3904 Py_DECREF(substring);
3905 return result;
3906}
3907
3908static char encode__doc__[] =
3909"S.encode([encoding[,errors]]) -> string\n\
3910\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003911Return an encoded string version of S. Default encoding is the current\n\
3912default string encoding. errors may be given to set a different error\n\
3913handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3914a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915
3916static PyObject *
3917unicode_encode(PyUnicodeObject *self, PyObject *args)
3918{
3919 char *encoding = NULL;
3920 char *errors = NULL;
3921 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3922 return NULL;
3923 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3924}
3925
3926static char expandtabs__doc__[] =
3927"S.expandtabs([tabsize]) -> unicode\n\
3928\n\
3929Return a copy of S where all tab characters are expanded using spaces.\n\
3930If tabsize is not given, a tab size of 8 characters is assumed.";
3931
3932static PyObject*
3933unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3934{
3935 Py_UNICODE *e;
3936 Py_UNICODE *p;
3937 Py_UNICODE *q;
3938 int i, j;
3939 PyUnicodeObject *u;
3940 int tabsize = 8;
3941
3942 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3943 return NULL;
3944
Thomas Wouters7e474022000-07-16 12:04:32 +00003945 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 i = j = 0;
3947 e = self->str + self->length;
3948 for (p = self->str; p < e; p++)
3949 if (*p == '\t') {
3950 if (tabsize > 0)
3951 j += tabsize - (j % tabsize);
3952 }
3953 else {
3954 j++;
3955 if (*p == '\n' || *p == '\r') {
3956 i += j;
3957 j = 0;
3958 }
3959 }
3960
3961 /* Second pass: create output string and fill it */
3962 u = _PyUnicode_New(i + j);
3963 if (!u)
3964 return NULL;
3965
3966 j = 0;
3967 q = u->str;
3968
3969 for (p = self->str; p < e; p++)
3970 if (*p == '\t') {
3971 if (tabsize > 0) {
3972 i = tabsize - (j % tabsize);
3973 j += i;
3974 while (i--)
3975 *q++ = ' ';
3976 }
3977 }
3978 else {
3979 j++;
3980 *q++ = *p;
3981 if (*p == '\n' || *p == '\r')
3982 j = 0;
3983 }
3984
3985 return (PyObject*) u;
3986}
3987
3988static char find__doc__[] =
3989"S.find(sub [,start [,end]]) -> int\n\
3990\n\
3991Return the lowest index in S where substring sub is found,\n\
3992such that sub is contained within s[start,end]. Optional\n\
3993arguments start and end are interpreted as in slice notation.\n\
3994\n\
3995Return -1 on failure.";
3996
3997static PyObject *
3998unicode_find(PyUnicodeObject *self, PyObject *args)
3999{
4000 PyUnicodeObject *substring;
4001 int start = 0;
4002 int end = INT_MAX;
4003 PyObject *result;
4004
Guido van Rossumb8872e62000-05-09 14:14:27 +00004005 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4006 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 return NULL;
4008 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4009 (PyObject *)substring);
4010 if (substring == NULL)
4011 return NULL;
4012
4013 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4014
4015 Py_DECREF(substring);
4016 return result;
4017}
4018
4019static PyObject *
4020unicode_getitem(PyUnicodeObject *self, int index)
4021{
4022 if (index < 0 || index >= self->length) {
4023 PyErr_SetString(PyExc_IndexError, "string index out of range");
4024 return NULL;
4025 }
4026
4027 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4028}
4029
4030static long
4031unicode_hash(PyUnicodeObject *self)
4032{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004033 /* Since Unicode objects compare equal to their ASCII string
4034 counterparts, they should use the individual character values
4035 as basis for their hash value. This is needed to assure that
4036 strings and Unicode objects behave in the same way as
4037 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
Fredrik Lundhdde61642000-07-10 18:27:47 +00004039 register int len;
4040 register Py_UNICODE *p;
4041 register long x;
4042
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 if (self->hash != -1)
4044 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004045 len = PyUnicode_GET_SIZE(self);
4046 p = PyUnicode_AS_UNICODE(self);
4047 x = *p << 7;
4048 while (--len >= 0)
4049 x = (1000003*x) ^ *p++;
4050 x ^= PyUnicode_GET_SIZE(self);
4051 if (x == -1)
4052 x = -2;
4053 self->hash = x;
4054 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055}
4056
4057static char index__doc__[] =
4058"S.index(sub [,start [,end]]) -> int\n\
4059\n\
4060Like S.find() but raise ValueError when the substring is not found.";
4061
4062static PyObject *
4063unicode_index(PyUnicodeObject *self, PyObject *args)
4064{
4065 int result;
4066 PyUnicodeObject *substring;
4067 int start = 0;
4068 int end = INT_MAX;
4069
Guido van Rossumb8872e62000-05-09 14:14:27 +00004070 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4071 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 return NULL;
4073
4074 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4075 (PyObject *)substring);
4076 if (substring == NULL)
4077 return NULL;
4078
4079 result = findstring(self, substring, start, end, 1);
4080
4081 Py_DECREF(substring);
4082 if (result < 0) {
4083 PyErr_SetString(PyExc_ValueError, "substring not found");
4084 return NULL;
4085 }
4086 return PyInt_FromLong(result);
4087}
4088
4089static char islower__doc__[] =
4090"S.islower() -> int\n\
4091\n\
4092Return 1 if all cased characters in S are lowercase and there is\n\
4093at least one cased character in S, 0 otherwise.";
4094
4095static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004096unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097{
4098 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4099 register const Py_UNICODE *e;
4100 int cased;
4101
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 /* Shortcut for single character strings */
4103 if (PyUnicode_GET_SIZE(self) == 1)
4104 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4105
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004106 /* Special case for empty strings */
4107 if (PyString_GET_SIZE(self) == 0)
4108 return PyInt_FromLong(0);
4109
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 e = p + PyUnicode_GET_SIZE(self);
4111 cased = 0;
4112 for (; p < e; p++) {
4113 register const Py_UNICODE ch = *p;
4114
4115 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4116 return PyInt_FromLong(0);
4117 else if (!cased && Py_UNICODE_ISLOWER(ch))
4118 cased = 1;
4119 }
4120 return PyInt_FromLong(cased);
4121}
4122
4123static char isupper__doc__[] =
4124"S.isupper() -> int\n\
4125\n\
4126Return 1 if all cased characters in S are uppercase and there is\n\
4127at least one cased character in S, 0 otherwise.";
4128
4129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004130unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131{
4132 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4133 register const Py_UNICODE *e;
4134 int cased;
4135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 /* Shortcut for single character strings */
4137 if (PyUnicode_GET_SIZE(self) == 1)
4138 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004140 /* Special case for empty strings */
4141 if (PyString_GET_SIZE(self) == 0)
4142 return PyInt_FromLong(0);
4143
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 e = p + PyUnicode_GET_SIZE(self);
4145 cased = 0;
4146 for (; p < e; p++) {
4147 register const Py_UNICODE ch = *p;
4148
4149 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4150 return PyInt_FromLong(0);
4151 else if (!cased && Py_UNICODE_ISUPPER(ch))
4152 cased = 1;
4153 }
4154 return PyInt_FromLong(cased);
4155}
4156
4157static char istitle__doc__[] =
4158"S.istitle() -> int\n\
4159\n\
4160Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4161may only follow uncased characters and lowercase characters only cased\n\
4162ones. Return 0 otherwise.";
4163
4164static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004165unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166{
4167 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4168 register const Py_UNICODE *e;
4169 int cased, previous_is_cased;
4170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 /* Shortcut for single character strings */
4172 if (PyUnicode_GET_SIZE(self) == 1)
4173 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4174 (Py_UNICODE_ISUPPER(*p) != 0));
4175
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004176 /* Special case for empty strings */
4177 if (PyString_GET_SIZE(self) == 0)
4178 return PyInt_FromLong(0);
4179
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 e = p + PyUnicode_GET_SIZE(self);
4181 cased = 0;
4182 previous_is_cased = 0;
4183 for (; p < e; p++) {
4184 register const Py_UNICODE ch = *p;
4185
4186 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4187 if (previous_is_cased)
4188 return PyInt_FromLong(0);
4189 previous_is_cased = 1;
4190 cased = 1;
4191 }
4192 else if (Py_UNICODE_ISLOWER(ch)) {
4193 if (!previous_is_cased)
4194 return PyInt_FromLong(0);
4195 previous_is_cased = 1;
4196 cased = 1;
4197 }
4198 else
4199 previous_is_cased = 0;
4200 }
4201 return PyInt_FromLong(cased);
4202}
4203
4204static char isspace__doc__[] =
4205"S.isspace() -> int\n\
4206\n\
4207Return 1 if there are only whitespace characters in S,\n\
42080 otherwise.";
4209
4210static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004211unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212{
4213 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4214 register const Py_UNICODE *e;
4215
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 /* Shortcut for single character strings */
4217 if (PyUnicode_GET_SIZE(self) == 1 &&
4218 Py_UNICODE_ISSPACE(*p))
4219 return PyInt_FromLong(1);
4220
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004221 /* Special case for empty strings */
4222 if (PyString_GET_SIZE(self) == 0)
4223 return PyInt_FromLong(0);
4224
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 e = p + PyUnicode_GET_SIZE(self);
4226 for (; p < e; p++) {
4227 if (!Py_UNICODE_ISSPACE(*p))
4228 return PyInt_FromLong(0);
4229 }
4230 return PyInt_FromLong(1);
4231}
4232
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004233static char isalpha__doc__[] =
4234"S.isalpha() -> int\n\
4235\n\
4236Return 1 if all characters in S are alphabetic\n\
4237and there is at least one character in S, 0 otherwise.";
4238
4239static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004240unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004241{
4242 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4243 register const Py_UNICODE *e;
4244
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004245 /* Shortcut for single character strings */
4246 if (PyUnicode_GET_SIZE(self) == 1 &&
4247 Py_UNICODE_ISALPHA(*p))
4248 return PyInt_FromLong(1);
4249
4250 /* Special case for empty strings */
4251 if (PyString_GET_SIZE(self) == 0)
4252 return PyInt_FromLong(0);
4253
4254 e = p + PyUnicode_GET_SIZE(self);
4255 for (; p < e; p++) {
4256 if (!Py_UNICODE_ISALPHA(*p))
4257 return PyInt_FromLong(0);
4258 }
4259 return PyInt_FromLong(1);
4260}
4261
4262static char isalnum__doc__[] =
4263"S.isalnum() -> int\n\
4264\n\
4265Return 1 if all characters in S are alphanumeric\n\
4266and there is at least one character in S, 0 otherwise.";
4267
4268static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004269unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004270{
4271 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4272 register const Py_UNICODE *e;
4273
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274 /* Shortcut for single character strings */
4275 if (PyUnicode_GET_SIZE(self) == 1 &&
4276 Py_UNICODE_ISALNUM(*p))
4277 return PyInt_FromLong(1);
4278
4279 /* Special case for empty strings */
4280 if (PyString_GET_SIZE(self) == 0)
4281 return PyInt_FromLong(0);
4282
4283 e = p + PyUnicode_GET_SIZE(self);
4284 for (; p < e; p++) {
4285 if (!Py_UNICODE_ISALNUM(*p))
4286 return PyInt_FromLong(0);
4287 }
4288 return PyInt_FromLong(1);
4289}
4290
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291static char isdecimal__doc__[] =
4292"S.isdecimal() -> int\n\
4293\n\
4294Return 1 if there are only decimal characters in S,\n\
42950 otherwise.";
4296
4297static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004298unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299{
4300 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4301 register const Py_UNICODE *e;
4302
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 /* Shortcut for single character strings */
4304 if (PyUnicode_GET_SIZE(self) == 1 &&
4305 Py_UNICODE_ISDECIMAL(*p))
4306 return PyInt_FromLong(1);
4307
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004308 /* Special case for empty strings */
4309 if (PyString_GET_SIZE(self) == 0)
4310 return PyInt_FromLong(0);
4311
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 e = p + PyUnicode_GET_SIZE(self);
4313 for (; p < e; p++) {
4314 if (!Py_UNICODE_ISDECIMAL(*p))
4315 return PyInt_FromLong(0);
4316 }
4317 return PyInt_FromLong(1);
4318}
4319
4320static char isdigit__doc__[] =
4321"S.isdigit() -> int\n\
4322\n\
4323Return 1 if there are only digit characters in S,\n\
43240 otherwise.";
4325
4326static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004327unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
4329 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4330 register const Py_UNICODE *e;
4331
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 /* Shortcut for single character strings */
4333 if (PyUnicode_GET_SIZE(self) == 1 &&
4334 Py_UNICODE_ISDIGIT(*p))
4335 return PyInt_FromLong(1);
4336
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004337 /* Special case for empty strings */
4338 if (PyString_GET_SIZE(self) == 0)
4339 return PyInt_FromLong(0);
4340
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 e = p + PyUnicode_GET_SIZE(self);
4342 for (; p < e; p++) {
4343 if (!Py_UNICODE_ISDIGIT(*p))
4344 return PyInt_FromLong(0);
4345 }
4346 return PyInt_FromLong(1);
4347}
4348
4349static char isnumeric__doc__[] =
4350"S.isnumeric() -> int\n\
4351\n\
4352Return 1 if there are only numeric characters in S,\n\
43530 otherwise.";
4354
4355static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004356unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357{
4358 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4359 register const Py_UNICODE *e;
4360
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 /* Shortcut for single character strings */
4362 if (PyUnicode_GET_SIZE(self) == 1 &&
4363 Py_UNICODE_ISNUMERIC(*p))
4364 return PyInt_FromLong(1);
4365
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004366 /* Special case for empty strings */
4367 if (PyString_GET_SIZE(self) == 0)
4368 return PyInt_FromLong(0);
4369
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 e = p + PyUnicode_GET_SIZE(self);
4371 for (; p < e; p++) {
4372 if (!Py_UNICODE_ISNUMERIC(*p))
4373 return PyInt_FromLong(0);
4374 }
4375 return PyInt_FromLong(1);
4376}
4377
4378static char join__doc__[] =
4379"S.join(sequence) -> unicode\n\
4380\n\
4381Return a string which is the concatenation of the strings in the\n\
4382sequence. The separator between elements is S.";
4383
4384static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004385unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004387 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388}
4389
4390static int
4391unicode_length(PyUnicodeObject *self)
4392{
4393 return self->length;
4394}
4395
4396static char ljust__doc__[] =
4397"S.ljust(width) -> unicode\n\
4398\n\
4399Return S left justified in a Unicode string of length width. Padding is\n\
4400done using spaces.";
4401
4402static PyObject *
4403unicode_ljust(PyUnicodeObject *self, PyObject *args)
4404{
4405 int width;
4406 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4407 return NULL;
4408
Tim Peters7a29bd52001-09-12 03:03:31 +00004409 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 Py_INCREF(self);
4411 return (PyObject*) self;
4412 }
4413
4414 return (PyObject*) pad(self, 0, width - self->length, ' ');
4415}
4416
4417static char lower__doc__[] =
4418"S.lower() -> unicode\n\
4419\n\
4420Return a copy of the string S converted to lowercase.";
4421
4422static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004423unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 return fixup(self, fixlower);
4426}
4427
4428static char lstrip__doc__[] =
4429"S.lstrip() -> unicode\n\
4430\n\
4431Return a copy of the string S with leading whitespace removed.";
4432
4433static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004434unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 return strip(self, 1, 0);
4437}
4438
4439static PyObject*
4440unicode_repeat(PyUnicodeObject *str, int len)
4441{
4442 PyUnicodeObject *u;
4443 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004444 int nchars;
4445 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446
4447 if (len < 0)
4448 len = 0;
4449
Tim Peters7a29bd52001-09-12 03:03:31 +00004450 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 /* no repeat, return original string */
4452 Py_INCREF(str);
4453 return (PyObject*) str;
4454 }
Tim Peters8f422462000-09-09 06:13:41 +00004455
4456 /* ensure # of chars needed doesn't overflow int and # of bytes
4457 * needed doesn't overflow size_t
4458 */
4459 nchars = len * str->length;
4460 if (len && nchars / len != str->length) {
4461 PyErr_SetString(PyExc_OverflowError,
4462 "repeated string is too long");
4463 return NULL;
4464 }
4465 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4466 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4467 PyErr_SetString(PyExc_OverflowError,
4468 "repeated string is too long");
4469 return NULL;
4470 }
4471 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 if (!u)
4473 return NULL;
4474
4475 p = u->str;
4476
4477 while (len-- > 0) {
4478 Py_UNICODE_COPY(p, str->str, str->length);
4479 p += str->length;
4480 }
4481
4482 return (PyObject*) u;
4483}
4484
4485PyObject *PyUnicode_Replace(PyObject *obj,
4486 PyObject *subobj,
4487 PyObject *replobj,
4488 int maxcount)
4489{
4490 PyObject *self;
4491 PyObject *str1;
4492 PyObject *str2;
4493 PyObject *result;
4494
4495 self = PyUnicode_FromObject(obj);
4496 if (self == NULL)
4497 return NULL;
4498 str1 = PyUnicode_FromObject(subobj);
4499 if (str1 == NULL) {
4500 Py_DECREF(self);
4501 return NULL;
4502 }
4503 str2 = PyUnicode_FromObject(replobj);
4504 if (str2 == NULL) {
4505 Py_DECREF(self);
4506 Py_DECREF(str1);
4507 return NULL;
4508 }
4509 result = replace((PyUnicodeObject *)self,
4510 (PyUnicodeObject *)str1,
4511 (PyUnicodeObject *)str2,
4512 maxcount);
4513 Py_DECREF(self);
4514 Py_DECREF(str1);
4515 Py_DECREF(str2);
4516 return result;
4517}
4518
4519static char replace__doc__[] =
4520"S.replace (old, new[, maxsplit]) -> unicode\n\
4521\n\
4522Return a copy of S with all occurrences of substring\n\
4523old replaced by new. If the optional argument maxsplit is\n\
4524given, only the first maxsplit occurrences are replaced.";
4525
4526static PyObject*
4527unicode_replace(PyUnicodeObject *self, PyObject *args)
4528{
4529 PyUnicodeObject *str1;
4530 PyUnicodeObject *str2;
4531 int maxcount = -1;
4532 PyObject *result;
4533
4534 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4535 return NULL;
4536 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4537 if (str1 == NULL)
4538 return NULL;
4539 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4540 if (str2 == NULL)
4541 return NULL;
4542
4543 result = replace(self, str1, str2, maxcount);
4544
4545 Py_DECREF(str1);
4546 Py_DECREF(str2);
4547 return result;
4548}
4549
4550static
4551PyObject *unicode_repr(PyObject *unicode)
4552{
4553 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4554 PyUnicode_GET_SIZE(unicode),
4555 1);
4556}
4557
4558static char rfind__doc__[] =
4559"S.rfind(sub [,start [,end]]) -> int\n\
4560\n\
4561Return the highest index in S where substring sub is found,\n\
4562such that sub is contained within s[start,end]. Optional\n\
4563arguments start and end are interpreted as in slice notation.\n\
4564\n\
4565Return -1 on failure.";
4566
4567static PyObject *
4568unicode_rfind(PyUnicodeObject *self, PyObject *args)
4569{
4570 PyUnicodeObject *substring;
4571 int start = 0;
4572 int end = INT_MAX;
4573 PyObject *result;
4574
Guido van Rossumb8872e62000-05-09 14:14:27 +00004575 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4576 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 return NULL;
4578 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4579 (PyObject *)substring);
4580 if (substring == NULL)
4581 return NULL;
4582
4583 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4584
4585 Py_DECREF(substring);
4586 return result;
4587}
4588
4589static char rindex__doc__[] =
4590"S.rindex(sub [,start [,end]]) -> int\n\
4591\n\
4592Like S.rfind() but raise ValueError when the substring is not found.";
4593
4594static PyObject *
4595unicode_rindex(PyUnicodeObject *self, PyObject *args)
4596{
4597 int result;
4598 PyUnicodeObject *substring;
4599 int start = 0;
4600 int end = INT_MAX;
4601
Guido van Rossumb8872e62000-05-09 14:14:27 +00004602 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4603 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 return NULL;
4605 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4606 (PyObject *)substring);
4607 if (substring == NULL)
4608 return NULL;
4609
4610 result = findstring(self, substring, start, end, -1);
4611
4612 Py_DECREF(substring);
4613 if (result < 0) {
4614 PyErr_SetString(PyExc_ValueError, "substring not found");
4615 return NULL;
4616 }
4617 return PyInt_FromLong(result);
4618}
4619
4620static char rjust__doc__[] =
4621"S.rjust(width) -> unicode\n\
4622\n\
4623Return S right justified in a Unicode string of length width. Padding is\n\
4624done using spaces.";
4625
4626static PyObject *
4627unicode_rjust(PyUnicodeObject *self, PyObject *args)
4628{
4629 int width;
4630 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4631 return NULL;
4632
Tim Peters7a29bd52001-09-12 03:03:31 +00004633 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 Py_INCREF(self);
4635 return (PyObject*) self;
4636 }
4637
4638 return (PyObject*) pad(self, width - self->length, 0, ' ');
4639}
4640
4641static char rstrip__doc__[] =
4642"S.rstrip() -> unicode\n\
4643\n\
4644Return a copy of the string S with trailing whitespace removed.";
4645
4646static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004647unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 return strip(self, 0, 1);
4650}
4651
4652static PyObject*
4653unicode_slice(PyUnicodeObject *self, int start, int end)
4654{
4655 /* standard clamping */
4656 if (start < 0)
4657 start = 0;
4658 if (end < 0)
4659 end = 0;
4660 if (end > self->length)
4661 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004662 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 /* full slice, return original string */
4664 Py_INCREF(self);
4665 return (PyObject*) self;
4666 }
4667 if (start > end)
4668 start = end;
4669 /* copy slice */
4670 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4671 end - start);
4672}
4673
4674PyObject *PyUnicode_Split(PyObject *s,
4675 PyObject *sep,
4676 int maxsplit)
4677{
4678 PyObject *result;
4679
4680 s = PyUnicode_FromObject(s);
4681 if (s == NULL)
4682 return NULL;
4683 if (sep != NULL) {
4684 sep = PyUnicode_FromObject(sep);
4685 if (sep == NULL) {
4686 Py_DECREF(s);
4687 return NULL;
4688 }
4689 }
4690
4691 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4692
4693 Py_DECREF(s);
4694 Py_XDECREF(sep);
4695 return result;
4696}
4697
4698static char split__doc__[] =
4699"S.split([sep [,maxsplit]]) -> list of strings\n\
4700\n\
4701Return a list of the words in S, using sep as the\n\
4702delimiter string. If maxsplit is given, at most maxsplit\n\
4703splits are done. If sep is not specified, any whitespace string\n\
4704is a separator.";
4705
4706static PyObject*
4707unicode_split(PyUnicodeObject *self, PyObject *args)
4708{
4709 PyObject *substring = Py_None;
4710 int maxcount = -1;
4711
4712 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4713 return NULL;
4714
4715 if (substring == Py_None)
4716 return split(self, NULL, maxcount);
4717 else if (PyUnicode_Check(substring))
4718 return split(self, (PyUnicodeObject *)substring, maxcount);
4719 else
4720 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4721}
4722
4723static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004724"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725\n\
4726Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004727Line breaks are not included in the resulting list unless keepends\n\
4728is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729
4730static PyObject*
4731unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4732{
Guido van Rossum86662912000-04-11 15:38:46 +00004733 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734
Guido van Rossum86662912000-04-11 15:38:46 +00004735 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 return NULL;
4737
Guido van Rossum86662912000-04-11 15:38:46 +00004738 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739}
4740
4741static
4742PyObject *unicode_str(PyUnicodeObject *self)
4743{
Fred Drakee4315f52000-05-09 19:53:39 +00004744 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745}
4746
4747static char strip__doc__[] =
4748"S.strip() -> unicode\n\
4749\n\
4750Return a copy of S with leading and trailing whitespace removed.";
4751
4752static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004753unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 return strip(self, 1, 1);
4756}
4757
4758static char swapcase__doc__[] =
4759"S.swapcase() -> unicode\n\
4760\n\
4761Return a copy of S with uppercase characters converted to lowercase\n\
4762and vice versa.";
4763
4764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004765unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 return fixup(self, fixswapcase);
4768}
4769
4770static char translate__doc__[] =
4771"S.translate(table) -> unicode\n\
4772\n\
4773Return a copy of the string S, where all characters have been mapped\n\
4774through the given translation table, which must be a mapping of\n\
4775Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4776are left untouched. Characters mapped to None are deleted.";
4777
4778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004779unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 return PyUnicode_TranslateCharmap(self->str,
4782 self->length,
4783 table,
4784 "ignore");
4785}
4786
4787static char upper__doc__[] =
4788"S.upper() -> unicode\n\
4789\n\
4790Return a copy of S converted to uppercase.";
4791
4792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004793unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 return fixup(self, fixupper);
4796}
4797
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method body for S.zfill(width) (currently compiled out).

   If S is already at least width characters long it is returned as is
   (new reference).  Otherwise pad() prepends (width - length) '0'
   characters; if the original string started with '+' or '-', that sign
   is moved in front of the zeros.  Returns NULL if argument parsing or
   pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad() hands back a pointer that is dereferenced below; bail out
       if it failed instead of crashing */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4833
#if 0
/* Debug helper (currently compiled out): return the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
4841
4842static char startswith__doc__[] =
4843"S.startswith(prefix[, start[, end]]) -> int\n\
4844\n\
4845Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4846optional start, test S beginning at that position. With optional end, stop\n\
4847comparing S at that position.";
4848
4849static PyObject *
4850unicode_startswith(PyUnicodeObject *self,
4851 PyObject *args)
4852{
4853 PyUnicodeObject *substring;
4854 int start = 0;
4855 int end = INT_MAX;
4856 PyObject *result;
4857
Guido van Rossumb8872e62000-05-09 14:14:27 +00004858 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4859 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 return NULL;
4861 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4862 (PyObject *)substring);
4863 if (substring == NULL)
4864 return NULL;
4865
4866 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4867
4868 Py_DECREF(substring);
4869 return result;
4870}
4871
4872
4873static char endswith__doc__[] =
4874"S.endswith(suffix[, start[, end]]) -> int\n\
4875\n\
4876Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4877optional start, test S beginning at that position. With optional end, stop\n\
4878comparing S at that position.";
4879
4880static PyObject *
4881unicode_endswith(PyUnicodeObject *self,
4882 PyObject *args)
4883{
4884 PyUnicodeObject *substring;
4885 int start = 0;
4886 int end = INT_MAX;
4887 PyObject *result;
4888
Guido van Rossumb8872e62000-05-09 14:14:27 +00004889 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4890 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 return NULL;
4892 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4893 (PyObject *)substring);
4894 if (substring == NULL)
4895 return NULL;
4896
4897 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4898
4899 Py_DECREF(substring);
4900 return result;
4901}
4902
4903
4904static PyMethodDef unicode_methods[] = {
4905
4906 /* Order is according to common usage: often used methods should
4907 appear first, since lookup is done sequentially. */
4908
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004909 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4910 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4911 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4912 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4913 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4914 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4915 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4916 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4917 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4918 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4919 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4920 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4921 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4922 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4923/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4924 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4925 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4926 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4927 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4928 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4929 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4930 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4931 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4932 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4933 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4934 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4935 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4936 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4937 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4938 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4939 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4940 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4941 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4942 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4943 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004945 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4946 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947#endif
4948
4949#if 0
4950 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004951 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952#endif
4953
4954 {NULL, NULL}
4955};
4956
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957static PySequenceMethods unicode_as_sequence = {
4958 (inquiry) unicode_length, /* sq_length */
4959 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4960 (intargfunc) unicode_repeat, /* sq_repeat */
4961 (intargfunc) unicode_getitem, /* sq_item */
4962 (intintargfunc) unicode_slice, /* sq_slice */
4963 0, /* sq_ass_item */
4964 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004965 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966};
4967
4968static int
4969unicode_buffer_getreadbuf(PyUnicodeObject *self,
4970 int index,
4971 const void **ptr)
4972{
4973 if (index != 0) {
4974 PyErr_SetString(PyExc_SystemError,
4975 "accessing non-existent unicode segment");
4976 return -1;
4977 }
4978 *ptr = (void *) self->str;
4979 return PyUnicode_GET_DATA_SIZE(self);
4980}
4981
4982static int
4983unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4984 const void **ptr)
4985{
4986 PyErr_SetString(PyExc_TypeError,
4987 "cannot use unicode as modifyable buffer");
4988 return -1;
4989}
4990
4991static int
4992unicode_buffer_getsegcount(PyUnicodeObject *self,
4993 int *lenp)
4994{
4995 if (lenp)
4996 *lenp = PyUnicode_GET_DATA_SIZE(self);
4997 return 1;
4998}
4999
5000static int
5001unicode_buffer_getcharbuf(PyUnicodeObject *self,
5002 int index,
5003 const void **ptr)
5004{
5005 PyObject *str;
5006
5007 if (index != 0) {
5008 PyErr_SetString(PyExc_SystemError,
5009 "accessing non-existent unicode segment");
5010 return -1;
5011 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005012 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 if (str == NULL)
5014 return -1;
5015 *ptr = (void *) PyString_AS_STRING(str);
5016 return PyString_GET_SIZE(str);
5017}
5018
5019/* Helpers for PyUnicode_Format() */
5020
5021static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005022getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023{
5024 int argidx = *p_argidx;
5025 if (argidx < arglen) {
5026 (*p_argidx)++;
5027 if (arglen < 0)
5028 return args;
5029 else
5030 return PyTuple_GetItem(args, argidx);
5031 }
5032 PyErr_SetString(PyExc_TypeError,
5033 "not enough arguments for format string");
5034 return NULL;
5035}
5036
/* Conversion flag bits collected while parsing a format specifier;
   each is set by the flag character noted beside it. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
5042
5043static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045{
5046 register int i;
5047 int len;
5048 va_list va;
5049 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051
5052 /* First, format the string as char array, then expand to Py_UNICODE
5053 array. */
5054 charbuffer = (char *)buffer;
5055 len = vsprintf(charbuffer, format, va);
5056 for (i = len - 1; i >= 0; i--)
5057 buffer[i] = (Py_UNICODE) charbuffer[i];
5058
5059 va_end(va);
5060 return len;
5061}
5062
5063static int
5064formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005065 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 int flags,
5067 int prec,
5068 int type,
5069 PyObject *v)
5070{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005071 /* fmt = '%#.' + `prec` + `type`
5072 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 char fmt[20];
5074 double x;
5075
5076 x = PyFloat_AsDouble(v);
5077 if (x == -1.0 && PyErr_Occurred())
5078 return -1;
5079 if (prec < 0)
5080 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5082 type = 'g';
5083 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005084 /* worst case length calc to ensure no buffer overrun:
5085 fmt = %#.<prec>g
5086 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5087 for any double rep.)
5088 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5089 If prec=0 the effective precision is 1 (the leading digit is
5090 always given), therefore increase by one to 10+prec. */
5091 if (buflen <= (size_t)10 + (size_t)prec) {
5092 PyErr_SetString(PyExc_OverflowError,
5093 "formatted float is too long (precision too long?)");
5094 return -1;
5095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 return usprintf(buf, fmt, x);
5097}
5098
Tim Peters38fd5b62000-09-21 05:43:11 +00005099static PyObject*
5100formatlong(PyObject *val, int flags, int prec, int type)
5101{
5102 char *buf;
5103 int i, len;
5104 PyObject *str; /* temporary string object. */
5105 PyUnicodeObject *result;
5106
5107 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5108 if (!str)
5109 return NULL;
5110 result = _PyUnicode_New(len);
5111 for (i = 0; i < len; i++)
5112 result->str[i] = buf[i];
5113 result->str[len] = 0;
5114 Py_DECREF(str);
5115 return (PyObject*)result;
5116}
5117
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118static int
5119formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005120 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 int flags,
5122 int prec,
5123 int type,
5124 PyObject *v)
5125{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005126 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005127 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5128 + 1 + 1 = 24*/
5129 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005131 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132
5133 x = PyInt_AsLong(v);
5134 if (x == -1 && PyErr_Occurred())
5135 return -1;
5136 if (prec < 0)
5137 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005138 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5139 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5140 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5141 PyErr_SetString(PyExc_OverflowError,
5142 "formatted integer is too long (precision too long?)");
5143 return -1;
5144 }
Tim Petersfff53252001-04-12 18:38:48 +00005145 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5146 * but we want it (for consistency with other %#x conversions, and
5147 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005148 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5149 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5150 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005151 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005152 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5153 /* Only way to know what the platform does is to try it. */
5154 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5155 if (fmt[1] != (char)type) {
5156 /* Supply our own leading 0x/0X -- needed under std C */
5157 use_native_c_format = 0;
5158 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5159 }
5160 }
5161 if (use_native_c_format)
5162 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 return usprintf(buf, fmt, x);
5164}
5165
5166static int
5167formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005168 size_t buflen,
5169 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005171 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005172 if (PyUnicode_Check(v)) {
5173 if (PyUnicode_GET_SIZE(v) != 1)
5174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005178 else if (PyString_Check(v)) {
5179 if (PyString_GET_SIZE(v) != 1)
5180 goto onError;
5181 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
5184 else {
5185 /* Integer input truncated to a character */
5186 long x;
5187 x = PyInt_AsLong(v);
5188 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005189 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 buf[0] = (char) x;
5191 }
5192 buf[1] = '\0';
5193 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005194
5195 onError:
5196 PyErr_SetString(PyExc_TypeError,
5197 "%c requires int or char");
5198 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199}
5200
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the scratch buffer in which floats,
   ints and chars are formatted.  XXX This is a magic number.  Each
   formatting routine does bounds checking to ensure no overflow, but a
   better solution may be to malloc a buffer of appropriate size for
   each format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
5210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211PyObject *PyUnicode_Format(PyObject *format,
5212 PyObject *args)
5213{
5214 Py_UNICODE *fmt, *res;
5215 int fmtcnt, rescnt, reslen, arglen, argidx;
5216 int args_owned = 0;
5217 PyUnicodeObject *result = NULL;
5218 PyObject *dict = NULL;
5219 PyObject *uformat;
5220
5221 if (format == NULL || args == NULL) {
5222 PyErr_BadInternalCall();
5223 return NULL;
5224 }
5225 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005226 if (uformat == NULL)
5227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 fmt = PyUnicode_AS_UNICODE(uformat);
5229 fmtcnt = PyUnicode_GET_SIZE(uformat);
5230
5231 reslen = rescnt = fmtcnt + 100;
5232 result = _PyUnicode_New(reslen);
5233 if (result == NULL)
5234 goto onError;
5235 res = PyUnicode_AS_UNICODE(result);
5236
5237 if (PyTuple_Check(args)) {
5238 arglen = PyTuple_Size(args);
5239 argidx = 0;
5240 }
5241 else {
5242 arglen = -1;
5243 argidx = -2;
5244 }
5245 if (args->ob_type->tp_as_mapping)
5246 dict = args;
5247
5248 while (--fmtcnt >= 0) {
5249 if (*fmt != '%') {
5250 if (--rescnt < 0) {
5251 rescnt = fmtcnt + 100;
5252 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005253 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 return NULL;
5255 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5256 --rescnt;
5257 }
5258 *res++ = *fmt++;
5259 }
5260 else {
5261 /* Got a format specifier */
5262 int flags = 0;
5263 int width = -1;
5264 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 Py_UNICODE c = '\0';
5266 Py_UNICODE fill;
5267 PyObject *v = NULL;
5268 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005269 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 Py_UNICODE sign;
5271 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005272 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
5274 fmt++;
5275 if (*fmt == '(') {
5276 Py_UNICODE *keystart;
5277 int keylen;
5278 PyObject *key;
5279 int pcount = 1;
5280
5281 if (dict == NULL) {
5282 PyErr_SetString(PyExc_TypeError,
5283 "format requires a mapping");
5284 goto onError;
5285 }
5286 ++fmt;
5287 --fmtcnt;
5288 keystart = fmt;
5289 /* Skip over balanced parentheses */
5290 while (pcount > 0 && --fmtcnt >= 0) {
5291 if (*fmt == ')')
5292 --pcount;
5293 else if (*fmt == '(')
5294 ++pcount;
5295 fmt++;
5296 }
5297 keylen = fmt - keystart - 1;
5298 if (fmtcnt < 0 || pcount > 0) {
5299 PyErr_SetString(PyExc_ValueError,
5300 "incomplete format key");
5301 goto onError;
5302 }
Fred Drakee4315f52000-05-09 19:53:39 +00005303 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 then looked up since Python uses strings to hold
5305 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005306 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 key = PyUnicode_EncodeUTF8(keystart,
5308 keylen,
5309 NULL);
5310 if (key == NULL)
5311 goto onError;
5312 if (args_owned) {
5313 Py_DECREF(args);
5314 args_owned = 0;
5315 }
5316 args = PyObject_GetItem(dict, key);
5317 Py_DECREF(key);
5318 if (args == NULL) {
5319 goto onError;
5320 }
5321 args_owned = 1;
5322 arglen = -1;
5323 argidx = -2;
5324 }
5325 while (--fmtcnt >= 0) {
5326 switch (c = *fmt++) {
5327 case '-': flags |= F_LJUST; continue;
5328 case '+': flags |= F_SIGN; continue;
5329 case ' ': flags |= F_BLANK; continue;
5330 case '#': flags |= F_ALT; continue;
5331 case '0': flags |= F_ZERO; continue;
5332 }
5333 break;
5334 }
5335 if (c == '*') {
5336 v = getnextarg(args, arglen, &argidx);
5337 if (v == NULL)
5338 goto onError;
5339 if (!PyInt_Check(v)) {
5340 PyErr_SetString(PyExc_TypeError,
5341 "* wants int");
5342 goto onError;
5343 }
5344 width = PyInt_AsLong(v);
5345 if (width < 0) {
5346 flags |= F_LJUST;
5347 width = -width;
5348 }
5349 if (--fmtcnt >= 0)
5350 c = *fmt++;
5351 }
5352 else if (c >= '0' && c <= '9') {
5353 width = c - '0';
5354 while (--fmtcnt >= 0) {
5355 c = *fmt++;
5356 if (c < '0' || c > '9')
5357 break;
5358 if ((width*10) / 10 != width) {
5359 PyErr_SetString(PyExc_ValueError,
5360 "width too big");
5361 goto onError;
5362 }
5363 width = width*10 + (c - '0');
5364 }
5365 }
5366 if (c == '.') {
5367 prec = 0;
5368 if (--fmtcnt >= 0)
5369 c = *fmt++;
5370 if (c == '*') {
5371 v = getnextarg(args, arglen, &argidx);
5372 if (v == NULL)
5373 goto onError;
5374 if (!PyInt_Check(v)) {
5375 PyErr_SetString(PyExc_TypeError,
5376 "* wants int");
5377 goto onError;
5378 }
5379 prec = PyInt_AsLong(v);
5380 if (prec < 0)
5381 prec = 0;
5382 if (--fmtcnt >= 0)
5383 c = *fmt++;
5384 }
5385 else if (c >= '0' && c <= '9') {
5386 prec = c - '0';
5387 while (--fmtcnt >= 0) {
5388 c = Py_CHARMASK(*fmt++);
5389 if (c < '0' || c > '9')
5390 break;
5391 if ((prec*10) / 10 != prec) {
5392 PyErr_SetString(PyExc_ValueError,
5393 "prec too big");
5394 goto onError;
5395 }
5396 prec = prec*10 + (c - '0');
5397 }
5398 }
5399 } /* prec */
5400 if (fmtcnt >= 0) {
5401 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (--fmtcnt >= 0)
5403 c = *fmt++;
5404 }
5405 }
5406 if (fmtcnt < 0) {
5407 PyErr_SetString(PyExc_ValueError,
5408 "incomplete format");
5409 goto onError;
5410 }
5411 if (c != '%') {
5412 v = getnextarg(args, arglen, &argidx);
5413 if (v == NULL)
5414 goto onError;
5415 }
5416 sign = 0;
5417 fill = ' ';
5418 switch (c) {
5419
5420 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005421 pbuf = formatbuf;
5422 /* presume that buffer length is at least 1 */
5423 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 len = 1;
5425 break;
5426
5427 case 's':
5428 case 'r':
5429 if (PyUnicode_Check(v) && c == 's') {
5430 temp = v;
5431 Py_INCREF(temp);
5432 }
5433 else {
5434 PyObject *unicode;
5435 if (c == 's')
5436 temp = PyObject_Str(v);
5437 else
5438 temp = PyObject_Repr(v);
5439 if (temp == NULL)
5440 goto onError;
5441 if (!PyString_Check(temp)) {
5442 /* XXX Note: this should never happen, since
5443 PyObject_Repr() and PyObject_Str() assure
5444 this */
5445 Py_DECREF(temp);
5446 PyErr_SetString(PyExc_TypeError,
5447 "%s argument has non-string str()");
5448 goto onError;
5449 }
Fred Drakee4315f52000-05-09 19:53:39 +00005450 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005452 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 "strict");
5454 Py_DECREF(temp);
5455 temp = unicode;
5456 if (temp == NULL)
5457 goto onError;
5458 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005459 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 len = PyUnicode_GET_SIZE(temp);
5461 if (prec >= 0 && len > prec)
5462 len = prec;
5463 break;
5464
5465 case 'i':
5466 case 'd':
5467 case 'u':
5468 case 'o':
5469 case 'x':
5470 case 'X':
5471 if (c == 'i')
5472 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005473 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005474 temp = formatlong(v, flags, prec, c);
5475 if (!temp)
5476 goto onError;
5477 pbuf = PyUnicode_AS_UNICODE(temp);
5478 len = PyUnicode_GET_SIZE(temp);
5479 /* unbounded ints can always produce
5480 a sign character! */
5481 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005483 else {
5484 pbuf = formatbuf;
5485 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5486 flags, prec, c, v);
5487 if (len < 0)
5488 goto onError;
5489 /* only d conversion is signed */
5490 sign = c == 'd';
5491 }
5492 if (flags & F_ZERO)
5493 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 break;
5495
5496 case 'e':
5497 case 'E':
5498 case 'f':
5499 case 'g':
5500 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005501 pbuf = formatbuf;
5502 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5503 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 if (len < 0)
5505 goto onError;
5506 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005507 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 fill = '0';
5509 break;
5510
5511 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005512 pbuf = formatbuf;
5513 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 if (len < 0)
5515 goto onError;
5516 break;
5517
5518 default:
5519 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005520 "unsupported format character '%c' (0x%x) "
5521 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005522 (31<=c && c<=126) ? c : '?',
5523 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 goto onError;
5525 }
5526 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005527 if (*pbuf == '-' || *pbuf == '+') {
5528 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 len--;
5530 }
5531 else if (flags & F_SIGN)
5532 sign = '+';
5533 else if (flags & F_BLANK)
5534 sign = ' ';
5535 else
5536 sign = 0;
5537 }
5538 if (width < len)
5539 width = len;
5540 if (rescnt < width + (sign != 0)) {
5541 reslen -= rescnt;
5542 rescnt = width + fmtcnt + 100;
5543 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005544 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 return NULL;
5546 res = PyUnicode_AS_UNICODE(result)
5547 + reslen - rescnt;
5548 }
5549 if (sign) {
5550 if (fill != ' ')
5551 *res++ = sign;
5552 rescnt--;
5553 if (width > len)
5554 width--;
5555 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005556 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5557 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005558 assert(pbuf[1] == c);
5559 if (fill != ' ') {
5560 *res++ = *pbuf++;
5561 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005562 }
Tim Petersfff53252001-04-12 18:38:48 +00005563 rescnt -= 2;
5564 width -= 2;
5565 if (width < 0)
5566 width = 0;
5567 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 if (width > len && !(flags & F_LJUST)) {
5570 do {
5571 --rescnt;
5572 *res++ = fill;
5573 } while (--width > len);
5574 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005575 if (fill == ' ') {
5576 if (sign)
5577 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005578 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005579 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005580 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005581 *res++ = *pbuf++;
5582 *res++ = *pbuf++;
5583 }
5584 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005585 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 res += len;
5587 rescnt -= len;
5588 while (--width >= len) {
5589 --rescnt;
5590 *res++ = ' ';
5591 }
5592 if (dict && (argidx < arglen) && c != '%') {
5593 PyErr_SetString(PyExc_TypeError,
5594 "not all arguments converted");
5595 goto onError;
5596 }
5597 Py_XDECREF(temp);
5598 } /* '%' */
5599 } /* until end */
5600 if (argidx < arglen && !dict) {
5601 PyErr_SetString(PyExc_TypeError,
5602 "not all arguments converted");
5603 goto onError;
5604 }
5605
5606 if (args_owned) {
5607 Py_DECREF(args);
5608 }
5609 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005610 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005611 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 return (PyObject *)result;
5613
5614 onError:
5615 Py_XDECREF(result);
5616 Py_DECREF(uformat);
5617 if (args_owned) {
5618 Py_DECREF(args);
5619 }
5620 return NULL;
5621}
5622
5623static PyBufferProcs unicode_as_buffer = {
5624 (getreadbufferproc) unicode_buffer_getreadbuf,
5625 (getwritebufferproc) unicode_buffer_getwritebuf,
5626 (getsegcountproc) unicode_buffer_getsegcount,
5627 (getcharbufferproc) unicode_buffer_getcharbuf,
5628};
5629
Guido van Rossume023fe02001-08-30 03:12:59 +00005630staticforward PyObject *
5631unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5632
Tim Peters6d6c1a32001-08-02 04:15:00 +00005633static PyObject *
5634unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5635{
5636 PyObject *x = NULL;
5637 static char *kwlist[] = {"string", "encoding", "errors", 0};
5638 char *encoding = NULL;
5639 char *errors = NULL;
5640
Guido van Rossume023fe02001-08-30 03:12:59 +00005641 if (type != &PyUnicode_Type)
5642 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005643 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5644 kwlist, &x, &encoding, &errors))
5645 return NULL;
5646 if (x == NULL)
5647 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005648 if (encoding == NULL && errors == NULL)
5649 return PyObject_Unicode(x);
5650 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005651 return PyUnicode_FromEncodedObject(x, encoding, errors);
5652}
5653
Guido van Rossume023fe02001-08-30 03:12:59 +00005654static PyObject *
5655unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5656{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005657 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005658 int n;
5659
5660 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5661 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5662 if (tmp == NULL)
5663 return NULL;
5664 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005665 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5666 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005667 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005668 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5669 if (pnew->str == NULL) {
5670 _Py_ForgetReference((PyObject *)pnew);
5671 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005672 return NULL;
5673 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005674 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5675 pnew->length = n;
5676 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005677 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005678 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005679}
5680
/* Docstring installed as tp_doc of the unicode type. */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n"
"\n"
"Create a new Unicode object from the given encoded string.\n"
"encoding defaults to the current default string encoding and \n"
"errors, defining the error handling, to 'strict'.";
5687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688PyTypeObject PyUnicode_Type = {
5689 PyObject_HEAD_INIT(&PyType_Type)
5690 0, /* ob_size */
5691 "unicode", /* tp_name */
5692 sizeof(PyUnicodeObject), /* tp_size */
5693 0, /* tp_itemsize */
5694 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005695 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005697 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 0, /* tp_setattr */
5699 (cmpfunc) unicode_compare, /* tp_compare */
5700 (reprfunc) unicode_repr, /* tp_repr */
5701 0, /* tp_as_number */
5702 &unicode_as_sequence, /* tp_as_sequence */
5703 0, /* tp_as_mapping */
5704 (hashfunc) unicode_hash, /* tp_hash*/
5705 0, /* tp_call*/
5706 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005707 PyObject_GenericGetAttr, /* tp_getattro */
5708 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005710 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005711 unicode_doc, /* tp_doc */
5712 0, /* tp_traverse */
5713 0, /* tp_clear */
5714 0, /* tp_richcompare */
5715 0, /* tp_weaklistoffset */
5716 0, /* tp_iter */
5717 0, /* tp_iternext */
5718 unicode_methods, /* tp_methods */
5719 0, /* tp_members */
5720 0, /* tp_getset */
5721 0, /* tp_base */
5722 0, /* tp_dict */
5723 0, /* tp_descr_get */
5724 0, /* tp_descr_set */
5725 0, /* tp_dictoffset */
5726 0, /* tp_init */
5727 0, /* tp_alloc */
5728 unicode_new, /* tp_new */
Guido van Rossum9475a232001-10-05 20:51:39 +00005729 _PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730};
5731
5732/* Initialize the Unicode implementation */
5733
Thomas Wouters78890102000-07-22 19:25:51 +00005734void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005736 int i;
5737
Fred Drakee4315f52000-05-09 19:53:39 +00005738 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005739 unicode_freelist = NULL;
5740 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005742 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005743 for (i = 0; i < 256; i++)
5744 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745}
5746
5747/* Finalize the Unicode implementation */
5748
5749void
Thomas Wouters78890102000-07-22 19:25:51 +00005750_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005752 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005753 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005755 Py_XDECREF(unicode_empty);
5756 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005757
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005758 for (i = 0; i < 256; i++) {
5759 if (unicode_latin1[i]) {
5760 Py_DECREF(unicode_latin1[i]);
5761 unicode_latin1[i] = NULL;
5762 }
5763 }
5764
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005765 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 PyUnicodeObject *v = u;
5767 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005768 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005769 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005770 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005771 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005773 unicode_freelist = NULL;
5774 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775}