blob: 1d0508cc8b3ad36d3a8e0692cc2148ed8080a94d [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for Unicode objects placed on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list; longer objects have their
   buffer released.  This saves malloc() calls for small Unicode
   objects.

   Worst case cost: MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of memory held but unused.

   A limit of 0 effectively disables the optimization.

   Note: this is an experimental feature.  If Unicode objects cause
   core dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: big endian only when WORDS_BIGENDIAN is
   defined, little endian otherwise. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   telling whether a character can be written directly:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)
*/
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* Helper macros for the UTF-7 codec.

   SPECIAL(c, encodeO, encodeWS)
       true when c has to be base64 encoded: c is above 127, is
       marked 1 in utf7_special, is marked 2 while encodeWS is set, or
       is marked 3 while encodeO is set.
   B64(n)
       base64 digit for the low six bits of n.
   B64CHAR(c)
       true when c is a base64 digit.  Values outside 0..127 are
       rejected before isalnum() is consulted, because isalnum() is
       only defined for values representable as unsigned char (and
       EOF).
   UB64(c)
       value of the base64 digit c; only meaningful when B64CHAR(c)
       holds.
   ENCODE(out, ch, bits)
       emits base64 digits from the bit accumulator ch through out
       for as long as at least six bits are pending.
   DECODE(out, ch, bits, surrogate)
       stores 16-bit units from the bit accumulator ch through out for
       as long as at least sixteen bits are pending.  It assigns to a
       variable named errmsg and jumps to a label named utf7Error,
       both of which must exist where the macro is expanded.

   NOTE(review): DECODE raises its error for units in 0xDC00..0xDFFF,
   while the comment in its first branch talks about the high
   surrogate -- confirm that the tested range is the intended one.
*/

#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) ((((c) & ~0x7F) == 0) && \
                    (isalnum(c) || (c) == '+' || (c) == '/'))
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map a UTF-8 lead byte to the total length (in bytes) of the sequence
   it introduces.  Zero means the byte is an illegal prefix.  The 5 and
   6 byte entries follow the RFC 2279 layout.  The table is only ever
   read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error policy named by errors to a UTF-8 encoding problem.

   NULL or "strict" raises UnicodeError with details and returns -1;
   "ignore" returns 0 without output; "replace" stores '?' through
   *dest, advances it and returns 0; any other name raises ValueError
   and returns -1.  source is not touched. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        **dest = '?';
        *dest += 1;
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1178{
1179 PyObject *v;
1180 char *p;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001181 unsigned int cbAllocated = 2 * size;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 unsigned int cbWritten = 0;
1183 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001185 /* Short-cut for emtpy strings */
1186 if (size == 0)
1187 return PyString_FromStringAndSize(NULL, 0);
1188
1189 /* We allocate 4 more bytes to have room for at least one full
1190 UTF-8 sequence; saves a few cycles in the loop below */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001191 v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if (v == NULL)
1193 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001195 p = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 while (i < size) {
1197 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001198
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001199 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001201 cbWritten++;
1202 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 else if (ch < 0x0800) {
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001205 *p++ = (char)(0xc0 | (ch >> 6));
1206 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001207 cbWritten += 2;
1208 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001209
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001210 else {
1211
1212 /* Assure that we have enough room for high order Unicode
1213 ordinals */
1214 if (cbWritten >= cbAllocated) {
1215 cbAllocated += 4 * 10;
1216 if (_PyString_Resize(&v, cbAllocated + 4))
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001217 goto onError;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001218 p = PyString_AS_STRING(v) + cbWritten;
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001219 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001221 if (ch < 0x10000) {
1222 /* Check for high surrogate */
1223 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1224 Py_UCS4 ch2 = s[i];
1225 /* Check for low surrogate */
1226 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001227 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001228 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001229 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001230 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1231 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 i++;
1233 cbWritten += 4;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001234 continue;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001235 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001236 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001238 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001239 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1240 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001241 cbWritten += 3;
1242
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001243 } else {
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001244 *p++ = (char)(0xf0 | (ch>>18));
1245 *p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1246 *p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1247 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001248 cbWritten += 4;
1249 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 *p = '\0';
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001253 if (_PyString_Resize(&v, cbWritten))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return v;
1256
1257 onError:
1258 Py_DECREF(v);
1259 return NULL;
1260}
1261
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (!PyUnicode_Check(unicode)) {
1265 PyErr_BadArgument();
1266 return NULL;
1267 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001268 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1269 PyUnicode_GET_SIZE(unicode),
1270 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271}
1272
1273/* --- UTF-16 Codec ------------------------------------------------------- */
1274
1275static
Tim Peters772747b2001-08-09 22:21:55 +00001276int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 const char *errors,
1278 const char *details)
1279{
1280 if ((errors == NULL) ||
1281 (strcmp(errors,"strict") == 0)) {
1282 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001283 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 details);
1285 return -1;
1286 }
1287 else if (strcmp(errors,"ignore") == 0) {
1288 return 0;
1289 }
1290 else if (strcmp(errors,"replace") == 0) {
1291 if (dest) {
1292 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1293 (*dest)++;
1294 }
1295 return 0;
1296 }
1297 else {
1298 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001299 "UTF-16 decoding error; "
1300 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301 errors);
1302 return -1;
1303 }
1304}
1305
Tim Peters772747b2001-08-09 22:21:55 +00001306PyObject *
1307PyUnicode_DecodeUTF16(const char *s,
1308 int size,
1309 const char *errors,
1310 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311{
1312 PyUnicodeObject *unicode;
1313 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001314 const unsigned char *q, *e;
1315 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001317 /* Offsets from q for retrieving byte pairs in the right order. */
1318#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1319 int ihi = 1, ilo = 0;
1320#else
1321 int ihi = 0, ilo = 1;
1322#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323
1324 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001325 if (size & 1) {
1326 if (utf16_decoding_error(NULL, errors, "truncated data"))
1327 return NULL;
1328 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 }
1330
1331 /* Note: size will always be longer than the resulting Unicode
1332 character count */
1333 unicode = _PyUnicode_New(size);
1334 if (!unicode)
1335 return NULL;
1336 if (size == 0)
1337 return (PyObject *)unicode;
1338
1339 /* Unpack UTF-16 encoded data */
1340 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001341 q = (unsigned char *)s;
1342 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343
1344 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001345 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001347 /* Check for BOM marks (U+FEFF) in the input and adjust current
1348 byte order setting accordingly. In native mode, the leading BOM
1349 mark is skipped, in all other modes, it is copied to the output
1350 stream as-is (giving a ZWNBSP character). */
1351 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001352 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001353#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001354 if (bom == 0xFEFF) {
1355 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001356 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001357 }
1358 else if (bom == 0xFFFE) {
1359 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001360 bo = 1;
1361 }
1362#else
Tim Peters772747b2001-08-09 22:21:55 +00001363 if (bom == 0xFEFF) {
1364 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001365 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001366 }
1367 else if (bom == 0xFFFE) {
1368 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001369 bo = -1;
1370 }
1371#endif
1372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373
Tim Peters772747b2001-08-09 22:21:55 +00001374 if (bo == -1) {
1375 /* force LE */
1376 ihi = 1;
1377 ilo = 0;
1378 }
1379 else if (bo == 1) {
1380 /* force BE */
1381 ihi = 0;
1382 ilo = 1;
1383 }
1384
1385 while (q < e) {
1386 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1387 q += 2;
1388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (ch < 0xD800 || ch > 0xDFFF) {
1390 *p++ = ch;
1391 continue;
1392 }
1393
1394 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001395 if (q >= e) {
1396 errmsg = "unexpected end of data";
1397 goto utf16Error;
1398 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001399 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001400 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1401 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001402 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001403#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001404 *p++ = ch;
1405 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001406#else
1407 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001409 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001410 }
1411 else {
1412 errmsg = "illegal UTF-16 surrogate";
1413 goto utf16Error;
1414 }
1415
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001417 errmsg = "illegal encoding";
1418 /* Fall through to report the error */
1419
1420 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001421 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423 }
1424
1425 if (byteorder)
1426 *byteorder = bo;
1427
1428 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001429 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 goto onError;
1431
1432 return (PyObject *)unicode;
1433
1434onError:
1435 Py_DECREF(unicode);
1436 return NULL;
1437}
1438
Tim Peters772747b2001-08-09 22:21:55 +00001439PyObject *
1440PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1441 int size,
1442 const char *errors,
1443 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001446 unsigned char *p;
1447 int i, pairs;
1448 /* Offsets from p for storing byte pairs in the right order. */
1449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1450 int ihi = 1, ilo = 0;
1451#else
1452 int ihi = 0, ilo = 1;
1453#endif
1454
1455#define STORECHAR(CH) \
1456 do { \
1457 p[ihi] = ((CH) >> 8) & 0xff; \
1458 p[ilo] = (CH) & 0xff; \
1459 p += 2; \
1460 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001462 for (i = pairs = 0; i < size; i++)
1463 if (s[i] >= 0x10000)
1464 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001466 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 if (v == NULL)
1468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469
Tim Peters772747b2001-08-09 22:21:55 +00001470 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001471 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001472 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001473 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001474 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001475
1476 if (byteorder == -1) {
1477 /* force LE */
1478 ihi = 1;
1479 ilo = 0;
1480 }
1481 else if (byteorder == 1) {
1482 /* force BE */
1483 ihi = 0;
1484 ilo = 1;
1485 }
1486
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001487 while (size-- > 0) {
1488 Py_UNICODE ch = *s++;
1489 Py_UNICODE ch2 = 0;
1490 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001491 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1492 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 }
Tim Peters772747b2001-08-09 22:21:55 +00001494 STORECHAR(ch);
1495 if (ch2)
1496 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001499#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500}
1501
1502PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 return NULL;
1507 }
1508 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1509 PyUnicode_GET_SIZE(unicode),
1510 NULL,
1511 0);
1512}
1513
1514/* --- Unicode Escape Codec ----------------------------------------------- */
1515
1516static
1517int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001518 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 const char *errors,
1520 const char *details)
1521{
1522 if ((errors == NULL) ||
1523 (strcmp(errors,"strict") == 0)) {
1524 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001525 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 details);
1527 return -1;
1528 }
1529 else if (strcmp(errors,"ignore") == 0) {
1530 return 0;
1531 }
1532 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001533 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 return 0;
1535 }
1536 else {
1537 PyErr_Format(PyExc_ValueError,
1538 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001539 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 errors);
1541 return -1;
1542 }
1543}
1544
Fredrik Lundh06d12682001-01-24 07:59:11 +00001545static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001546
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1548 int size,
1549 const char *errors)
1550{
1551 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001552 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001554 char* message;
1555 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1556
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 /* Escaped strings will always be longer than the resulting
1558 Unicode string, so we start with size here and then reduce the
1559 length after conversion to the true value. */
1560 v = _PyUnicode_New(size);
1561 if (v == NULL)
1562 goto onError;
1563 if (size == 0)
1564 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 p = buf = PyUnicode_AS_UNICODE(v);
1567 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569 while (s < end) {
1570 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001571 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001572 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
1574 /* Non-escape characters are interpreted as Unicode ordinals */
1575 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001576 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 continue;
1578 }
1579
1580 /* \ - Escapes */
1581 s++;
1582 switch (*s++) {
1583
1584 /* \x escapes */
1585 case '\n': break;
1586 case '\\': *p++ = '\\'; break;
1587 case '\'': *p++ = '\''; break;
1588 case '\"': *p++ = '\"'; break;
1589 case 'b': *p++ = '\b'; break;
1590 case 'f': *p++ = '\014'; break; /* FF */
1591 case 't': *p++ = '\t'; break;
1592 case 'n': *p++ = '\n'; break;
1593 case 'r': *p++ = '\r'; break;
1594 case 'v': *p++ = '\013'; break; /* VT */
1595 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1596
1597 /* \OOO (octal) escapes */
1598 case '0': case '1': case '2': case '3':
1599 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001600 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001602 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001604 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001606 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607 break;
1608
Fredrik Lundhccc74732001-02-18 22:13:49 +00001609 /* hex escapes */
1610 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001612 digits = 2;
1613 message = "truncated \\xXX escape";
1614 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 digits = 4;
1619 message = "truncated \\uXXXX escape";
1620 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621
Fredrik Lundhccc74732001-02-18 22:13:49 +00001622 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001623 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001624 digits = 8;
1625 message = "truncated \\UXXXXXXXX escape";
1626 hexescape:
1627 chr = 0;
1628 for (i = 0; i < digits; i++) {
1629 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001630 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001632 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001634 i++;
1635 break;
1636 }
1637 chr = (chr<<4) & ~0xF;
1638 if (c >= '0' && c <= '9')
1639 chr += c - '0';
1640 else if (c >= 'a' && c <= 'f')
1641 chr += 10 + c - 'a';
1642 else
1643 chr += 10 + c - 'A';
1644 }
1645 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 /* when we get here, chr is a 32-bit unicode character */
1648 if (chr <= 0xffff)
1649 /* UCS-2 character */
1650 *p++ = (Py_UNICODE) chr;
1651 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001652 /* UCS-4 character. Either store directly, or as
1653 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001654#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001655 *p++ = chr;
1656#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 chr -= 0x10000L;
1658 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001659 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001660#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001661 } else {
1662 if (unicodeescape_decoding_error(
1663 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001664 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001665 )
1666 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001667 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001668 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001669 break;
1670
1671 /* \N{name} */
1672 case 'N':
1673 message = "malformed \\N character escape";
1674 if (ucnhash_CAPI == NULL) {
1675 /* load the unicode data module */
1676 PyObject *m, *v;
1677 m = PyImport_ImportModule("unicodedata");
1678 if (m == NULL)
1679 goto ucnhashError;
1680 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1681 Py_DECREF(m);
1682 if (v == NULL)
1683 goto ucnhashError;
1684 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1685 Py_DECREF(v);
1686 if (ucnhash_CAPI == NULL)
1687 goto ucnhashError;
1688 }
1689 if (*s == '{') {
1690 const char *start = s+1;
1691 /* look for the closing brace */
1692 while (*s != '}' && s < end)
1693 s++;
1694 if (s > start && s < end && *s == '}') {
1695 /* found a name. look it up in the unicode database */
1696 message = "unknown Unicode character name";
1697 s++;
1698 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1699 goto store;
1700 }
1701 }
1702 if (unicodeescape_decoding_error(&s, &x, errors, message))
1703 goto onError;
1704 *p++ = x;
1705 break;
1706
1707 default:
1708 *p++ = '\\';
1709 *p++ = (unsigned char)s[-1];
1710 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
1712 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001713 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001714 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 return (PyObject *)v;
1716
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001718 PyErr_SetString(
1719 PyExc_UnicodeError,
1720 "\\N escapes not supported (can't load unicodedata module)"
1721 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001722 return NULL;
1723
Fredrik Lundhccc74732001-02-18 22:13:49 +00001724onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 Py_XDECREF(v);
1726 return NULL;
1727}
1728
1729/* Return a Unicode-Escape string version of the Unicode object.
1730
1731 If quotes is true, the string is enclosed in u"" or u'' quotes as
1732 appropriate.
1733
1734*/
1735
Barry Warsaw51ac5802000-03-20 16:36:48 +00001736static const Py_UNICODE *findchar(const Py_UNICODE *s,
1737 int size,
1738 Py_UNICODE ch);
1739
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740static
1741PyObject *unicodeescape_string(const Py_UNICODE *s,
1742 int size,
1743 int quotes)
1744{
1745 PyObject *repr;
1746 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001748 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
1750 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1751 if (repr == NULL)
1752 return NULL;
1753
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001754 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 *p++ = 'u';
1758 *p++ = (findchar(s, size, '\'') &&
1759 !findchar(s, size, '"')) ? '"' : '\'';
1760 }
1761 while (size-- > 0) {
1762 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001763
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001765 if (quotes &&
1766 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 *p++ = '\\';
1768 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001769 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001771
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001772#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001773 /* Map 21-bit characters to '\U00xxxxxx' */
1774 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001775 int offset = p - PyString_AS_STRING(repr);
1776
1777 /* Resize the string if necessary */
1778 if (offset + 12 > PyString_GET_SIZE(repr)) {
1779 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1780 goto onError;
1781 p = PyString_AS_STRING(repr) + offset;
1782 }
1783
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001784 *p++ = '\\';
1785 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001786 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1787 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1788 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1789 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1790 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1791 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1792 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001793 *p++ = hexdigit[ch & 0x0000000F];
1794 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001795 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001796#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001797 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1798 else if (ch >= 0xD800 && ch < 0xDC00) {
1799 Py_UNICODE ch2;
1800 Py_UCS4 ucs;
1801
1802 ch2 = *s++;
1803 size--;
1804 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1805 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1806 *p++ = '\\';
1807 *p++ = 'U';
1808 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1809 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1810 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1811 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1812 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1813 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1814 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1815 *p++ = hexdigit[ucs & 0x0000000F];
1816 continue;
1817 }
1818 /* Fall through: isolated surrogates are copied as-is */
1819 s--;
1820 size++;
1821 }
1822
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001824 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 *p++ = '\\';
1826 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001827 *p++ = hexdigit[(ch >> 12) & 0x000F];
1828 *p++ = hexdigit[(ch >> 8) & 0x000F];
1829 *p++ = hexdigit[(ch >> 4) & 0x000F];
1830 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001832
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001833 /* Map special whitespace to '\t', \n', '\r' */
1834 else if (ch == '\t') {
1835 *p++ = '\\';
1836 *p++ = 't';
1837 }
1838 else if (ch == '\n') {
1839 *p++ = '\\';
1840 *p++ = 'n';
1841 }
1842 else if (ch == '\r') {
1843 *p++ = '\\';
1844 *p++ = 'r';
1845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001846
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001847 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001848 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001850 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001851 *p++ = hexdigit[(ch >> 4) & 0x000F];
1852 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001854
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855 /* Copy everything else as-is */
1856 else
1857 *p++ = (char) ch;
1858 }
1859 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001860 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861
1862 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001863 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
1866 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001867
1868 onError:
1869 Py_DECREF(repr);
1870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871}
1872
1873PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1874 int size)
1875{
1876 return unicodeescape_string(s, size, 0);
1877}
1878
1879PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1880{
1881 if (!PyUnicode_Check(unicode)) {
1882 PyErr_BadArgument();
1883 return NULL;
1884 }
1885 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1886 PyUnicode_GET_SIZE(unicode));
1887}
1888
1889/* --- Raw Unicode Escape Codec ------------------------------------------- */
1890
1891PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1892 int size,
1893 const char *errors)
1894{
1895 PyUnicodeObject *v;
1896 Py_UNICODE *p, *buf;
1897 const char *end;
1898 const char *bs;
1899
1900 /* Escaped strings will always be longer than the resulting
1901 Unicode string, so we start with size here and then reduce the
1902 length after conversion to the true value. */
1903 v = _PyUnicode_New(size);
1904 if (v == NULL)
1905 goto onError;
1906 if (size == 0)
1907 return (PyObject *)v;
1908 p = buf = PyUnicode_AS_UNICODE(v);
1909 end = s + size;
1910 while (s < end) {
1911 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001912 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 int i;
1914
1915 /* Non-escape characters are interpreted as Unicode ordinals */
1916 if (*s != '\\') {
1917 *p++ = (unsigned char)*s++;
1918 continue;
1919 }
1920
1921 /* \u-escapes are only interpreted iff the number of leading
1922 backslashes if odd */
1923 bs = s;
1924 for (;s < end;) {
1925 if (*s != '\\')
1926 break;
1927 *p++ = (unsigned char)*s++;
1928 }
1929 if (((s - bs) & 1) == 0 ||
1930 s >= end ||
1931 *s != 'u') {
1932 continue;
1933 }
1934 p--;
1935 s++;
1936
1937 /* \uXXXX with 4 hex digits */
1938 for (x = 0, i = 0; i < 4; i++) {
1939 c = (unsigned char)s[i];
1940 if (!isxdigit(c)) {
1941 if (unicodeescape_decoding_error(&s, &x, errors,
1942 "truncated \\uXXXX"))
1943 goto onError;
1944 i++;
1945 break;
1946 }
1947 x = (x<<4) & ~0xF;
1948 if (c >= '0' && c <= '9')
1949 x += c - '0';
1950 else if (c >= 'a' && c <= 'f')
1951 x += 10 + c - 'a';
1952 else
1953 x += 10 + c - 'A';
1954 }
1955 s += i;
1956 *p++ = x;
1957 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001958 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 return (PyObject *)v;
1961
1962 onError:
1963 Py_XDECREF(v);
1964 return NULL;
1965}
1966
1967PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1968 int size)
1969{
1970 PyObject *repr;
1971 char *p;
1972 char *q;
1973
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001974 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
1976 repr = PyString_FromStringAndSize(NULL, 6 * size);
1977 if (repr == NULL)
1978 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001979 if (size == 0)
1980 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981
1982 p = q = PyString_AS_STRING(repr);
1983 while (size-- > 0) {
1984 Py_UNICODE ch = *s++;
1985 /* Map 16-bit characters to '\uxxxx' */
1986 if (ch >= 256) {
1987 *p++ = '\\';
1988 *p++ = 'u';
1989 *p++ = hexdigit[(ch >> 12) & 0xf];
1990 *p++ = hexdigit[(ch >> 8) & 0xf];
1991 *p++ = hexdigit[(ch >> 4) & 0xf];
1992 *p++ = hexdigit[ch & 15];
1993 }
1994 /* Copy everything else as-is */
1995 else
1996 *p++ = (char) ch;
1997 }
1998 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001999 if (_PyString_Resize(&repr, p - q))
2000 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002003
2004 onError:
2005 Py_DECREF(repr);
2006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007}
2008
2009PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2010{
2011 if (!PyUnicode_Check(unicode)) {
2012 PyErr_BadArgument();
2013 return NULL;
2014 }
2015 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2016 PyUnicode_GET_SIZE(unicode));
2017}
2018
2019/* --- Latin-1 Codec ------------------------------------------------------ */
2020
2021PyObject *PyUnicode_DecodeLatin1(const char *s,
2022 int size,
2023 const char *errors)
2024{
2025 PyUnicodeObject *v;
2026 Py_UNICODE *p;
2027
2028 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002029 if (size == 1 && *(unsigned char*)s < 256) {
2030 Py_UNICODE r = *(unsigned char*)s;
2031 return PyUnicode_FromUnicode(&r, 1);
2032 }
2033
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 v = _PyUnicode_New(size);
2035 if (v == NULL)
2036 goto onError;
2037 if (size == 0)
2038 return (PyObject *)v;
2039 p = PyUnicode_AS_UNICODE(v);
2040 while (size-- > 0)
2041 *p++ = (unsigned char)*s++;
2042 return (PyObject *)v;
2043
2044 onError:
2045 Py_XDECREF(v);
2046 return NULL;
2047}
2048
2049static
2050int latin1_encoding_error(const Py_UNICODE **source,
2051 char **dest,
2052 const char *errors,
2053 const char *details)
2054{
2055 if ((errors == NULL) ||
2056 (strcmp(errors,"strict") == 0)) {
2057 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002058 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 details);
2060 return -1;
2061 }
2062 else if (strcmp(errors,"ignore") == 0) {
2063 return 0;
2064 }
2065 else if (strcmp(errors,"replace") == 0) {
2066 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002067 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 return 0;
2069 }
2070 else {
2071 PyErr_Format(PyExc_ValueError,
2072 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002073 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 errors);
2075 return -1;
2076 }
2077}
2078
2079PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2080 int size,
2081 const char *errors)
2082{
2083 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002084 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002085
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 repr = PyString_FromStringAndSize(NULL, size);
2087 if (repr == NULL)
2088 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002089 if (size == 0)
2090 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091
2092 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002093 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 while (size-- > 0) {
2095 Py_UNICODE ch = *p++;
2096 if (ch >= 256) {
2097 if (latin1_encoding_error(&p, &s, errors,
2098 "ordinal not in range(256)"))
2099 goto onError;
2100 }
2101 else
2102 *s++ = (char)ch;
2103 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002104 /* Resize if error handling skipped some characters */
2105 if (s - start < PyString_GET_SIZE(repr))
2106 if (_PyString_Resize(&repr, s - start))
2107 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 return repr;
2109
2110 onError:
2111 Py_DECREF(repr);
2112 return NULL;
2113}
2114
2115PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2116{
2117 if (!PyUnicode_Check(unicode)) {
2118 PyErr_BadArgument();
2119 return NULL;
2120 }
2121 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2122 PyUnicode_GET_SIZE(unicode),
2123 NULL);
2124}
2125
2126/* --- 7-bit ASCII Codec -------------------------------------------------- */
2127
2128static
2129int ascii_decoding_error(const char **source,
2130 Py_UNICODE **dest,
2131 const char *errors,
2132 const char *details)
2133{
2134 if ((errors == NULL) ||
2135 (strcmp(errors,"strict") == 0)) {
2136 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002137 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 details);
2139 return -1;
2140 }
2141 else if (strcmp(errors,"ignore") == 0) {
2142 return 0;
2143 }
2144 else if (strcmp(errors,"replace") == 0) {
2145 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2146 (*dest)++;
2147 return 0;
2148 }
2149 else {
2150 PyErr_Format(PyExc_ValueError,
2151 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002152 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 errors);
2154 return -1;
2155 }
2156}
2157
2158PyObject *PyUnicode_DecodeASCII(const char *s,
2159 int size,
2160 const char *errors)
2161{
2162 PyUnicodeObject *v;
2163 Py_UNICODE *p;
2164
2165 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002166 if (size == 1 && *(unsigned char*)s < 128) {
2167 Py_UNICODE r = *(unsigned char*)s;
2168 return PyUnicode_FromUnicode(&r, 1);
2169 }
2170
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 v = _PyUnicode_New(size);
2172 if (v == NULL)
2173 goto onError;
2174 if (size == 0)
2175 return (PyObject *)v;
2176 p = PyUnicode_AS_UNICODE(v);
2177 while (size-- > 0) {
2178 register unsigned char c;
2179
2180 c = (unsigned char)*s++;
2181 if (c < 128)
2182 *p++ = c;
2183 else if (ascii_decoding_error(&s, &p, errors,
2184 "ordinal not in range(128)"))
2185 goto onError;
2186 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002187 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002188 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002189 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 return (PyObject *)v;
2191
2192 onError:
2193 Py_XDECREF(v);
2194 return NULL;
2195}
2196
2197static
2198int ascii_encoding_error(const Py_UNICODE **source,
2199 char **dest,
2200 const char *errors,
2201 const char *details)
2202{
2203 if ((errors == NULL) ||
2204 (strcmp(errors,"strict") == 0)) {
2205 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002206 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 details);
2208 return -1;
2209 }
2210 else if (strcmp(errors,"ignore") == 0) {
2211 return 0;
2212 }
2213 else if (strcmp(errors,"replace") == 0) {
2214 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002215 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 return 0;
2217 }
2218 else {
2219 PyErr_Format(PyExc_ValueError,
2220 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002221 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 errors);
2223 return -1;
2224 }
2225}
2226
2227PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2228 int size,
2229 const char *errors)
2230{
2231 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002232 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002233
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 repr = PyString_FromStringAndSize(NULL, size);
2235 if (repr == NULL)
2236 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002237 if (size == 0)
2238 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239
2240 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002241 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 while (size-- > 0) {
2243 Py_UNICODE ch = *p++;
2244 if (ch >= 128) {
2245 if (ascii_encoding_error(&p, &s, errors,
2246 "ordinal not in range(128)"))
2247 goto onError;
2248 }
2249 else
2250 *s++ = (char)ch;
2251 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002252 /* Resize if error handling skipped some characters */
2253 if (s - start < PyString_GET_SIZE(repr))
2254 if (_PyString_Resize(&repr, s - start))
2255 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 return repr;
2257
2258 onError:
2259 Py_DECREF(repr);
2260 return NULL;
2261}
2262
2263PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2264{
2265 if (!PyUnicode_Check(unicode)) {
2266 PyErr_BadArgument();
2267 return NULL;
2268 }
2269 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2270 PyUnicode_GET_SIZE(unicode),
2271 NULL);
2272}
2273
Fredrik Lundh30831632001-06-26 15:11:00 +00002274#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002275
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002276/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002277
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002278PyObject *PyUnicode_DecodeMBCS(const char *s,
2279 int size,
2280 const char *errors)
2281{
2282 PyUnicodeObject *v;
2283 Py_UNICODE *p;
2284
2285 /* First get the size of the result */
2286 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002287 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002288 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2289
2290 v = _PyUnicode_New(usize);
2291 if (v == NULL)
2292 return NULL;
2293 if (usize == 0)
2294 return (PyObject *)v;
2295 p = PyUnicode_AS_UNICODE(v);
2296 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2297 Py_DECREF(v);
2298 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2299 }
2300
2301 return (PyObject *)v;
2302}
2303
2304PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2305 int size,
2306 const char *errors)
2307{
2308 PyObject *repr;
2309 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002310 DWORD mbcssize;
2311
2312 /* If there are no characters, bail now! */
2313 if (size==0)
2314 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002315
2316 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002317 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002318 if (mbcssize==0)
2319 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2320
2321 repr = PyString_FromStringAndSize(NULL, mbcssize);
2322 if (repr == NULL)
2323 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002324 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002325 return repr;
2326
2327 /* Do the conversion */
2328 s = PyString_AS_STRING(repr);
2329 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2330 Py_DECREF(repr);
2331 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2332 }
2333 return repr;
2334}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002335
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002336#endif /* MS_WIN32 */
2337
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338/* --- Character Mapping Codec -------------------------------------------- */
2339
2340static
2341int charmap_decoding_error(const char **source,
2342 Py_UNICODE **dest,
2343 const char *errors,
2344 const char *details)
2345{
2346 if ((errors == NULL) ||
2347 (strcmp(errors,"strict") == 0)) {
2348 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002349 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 details);
2351 return -1;
2352 }
2353 else if (strcmp(errors,"ignore") == 0) {
2354 return 0;
2355 }
2356 else if (strcmp(errors,"replace") == 0) {
2357 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2358 (*dest)++;
2359 return 0;
2360 }
2361 else {
2362 PyErr_Format(PyExc_ValueError,
2363 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002364 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 errors);
2366 return -1;
2367 }
2368}
2369
2370PyObject *PyUnicode_DecodeCharmap(const char *s,
2371 int size,
2372 PyObject *mapping,
2373 const char *errors)
2374{
2375 PyUnicodeObject *v;
2376 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002377 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378
2379 /* Default to Latin-1 */
2380 if (mapping == NULL)
2381 return PyUnicode_DecodeLatin1(s, size, errors);
2382
2383 v = _PyUnicode_New(size);
2384 if (v == NULL)
2385 goto onError;
2386 if (size == 0)
2387 return (PyObject *)v;
2388 p = PyUnicode_AS_UNICODE(v);
2389 while (size-- > 0) {
2390 unsigned char ch = *s++;
2391 PyObject *w, *x;
2392
2393 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2394 w = PyInt_FromLong((long)ch);
2395 if (w == NULL)
2396 goto onError;
2397 x = PyObject_GetItem(mapping, w);
2398 Py_DECREF(w);
2399 if (x == NULL) {
2400 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002401 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002403 x = Py_None;
2404 Py_INCREF(x);
2405 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002406 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407 }
2408
2409 /* Apply mapping */
2410 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002411 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 if (value < 0 || value > 65535) {
2413 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002414 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 Py_DECREF(x);
2416 goto onError;
2417 }
2418 *p++ = (Py_UNICODE)value;
2419 }
2420 else if (x == Py_None) {
2421 /* undefined mapping */
2422 if (charmap_decoding_error(&s, &p, errors,
2423 "character maps to <undefined>")) {
2424 Py_DECREF(x);
2425 goto onError;
2426 }
2427 }
2428 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002429 int targetsize = PyUnicode_GET_SIZE(x);
2430
2431 if (targetsize == 1)
2432 /* 1-1 mapping */
2433 *p++ = *PyUnicode_AS_UNICODE(x);
2434
2435 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002437 if (targetsize > extrachars) {
2438 /* resize first */
2439 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2440 int needed = (targetsize - extrachars) + \
2441 (targetsize << 2);
2442 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002443 if (_PyUnicode_Resize(&v,
2444 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002445 Py_DECREF(x);
2446 goto onError;
2447 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002448 p = PyUnicode_AS_UNICODE(v) + oldpos;
2449 }
2450 Py_UNICODE_COPY(p,
2451 PyUnicode_AS_UNICODE(x),
2452 targetsize);
2453 p += targetsize;
2454 extrachars -= targetsize;
2455 }
2456 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 }
2458 else {
2459 /* wrong return value */
2460 PyErr_SetString(PyExc_TypeError,
2461 "character mapping must return integer, None or unicode");
2462 Py_DECREF(x);
2463 goto onError;
2464 }
2465 Py_DECREF(x);
2466 }
2467 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002468 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 goto onError;
2470 return (PyObject *)v;
2471
2472 onError:
2473 Py_XDECREF(v);
2474 return NULL;
2475}
2476
2477static
2478int charmap_encoding_error(const Py_UNICODE **source,
2479 char **dest,
2480 const char *errors,
2481 const char *details)
2482{
2483 if ((errors == NULL) ||
2484 (strcmp(errors,"strict") == 0)) {
2485 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002486 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 details);
2488 return -1;
2489 }
2490 else if (strcmp(errors,"ignore") == 0) {
2491 return 0;
2492 }
2493 else if (strcmp(errors,"replace") == 0) {
2494 **dest = '?';
2495 (*dest)++;
2496 return 0;
2497 }
2498 else {
2499 PyErr_Format(PyExc_ValueError,
2500 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002501 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 errors);
2503 return -1;
2504 }
2505}
2506
2507PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2508 int size,
2509 PyObject *mapping,
2510 const char *errors)
2511{
2512 PyObject *v;
2513 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002514 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515
2516 /* Default to Latin-1 */
2517 if (mapping == NULL)
2518 return PyUnicode_EncodeLatin1(p, size, errors);
2519
2520 v = PyString_FromStringAndSize(NULL, size);
2521 if (v == NULL)
2522 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002523 if (size == 0)
2524 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 s = PyString_AS_STRING(v);
2526 while (size-- > 0) {
2527 Py_UNICODE ch = *p++;
2528 PyObject *w, *x;
2529
2530 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2531 w = PyInt_FromLong((long)ch);
2532 if (w == NULL)
2533 goto onError;
2534 x = PyObject_GetItem(mapping, w);
2535 Py_DECREF(w);
2536 if (x == NULL) {
2537 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002538 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002540 x = Py_None;
2541 Py_INCREF(x);
2542 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002543 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 }
2545
2546 /* Apply mapping */
2547 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002548 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 if (value < 0 || value > 255) {
2550 PyErr_SetString(PyExc_TypeError,
2551 "character mapping must be in range(256)");
2552 Py_DECREF(x);
2553 goto onError;
2554 }
2555 *s++ = (char)value;
2556 }
2557 else if (x == Py_None) {
2558 /* undefined mapping */
2559 if (charmap_encoding_error(&p, &s, errors,
2560 "character maps to <undefined>")) {
2561 Py_DECREF(x);
2562 goto onError;
2563 }
2564 }
2565 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002566 int targetsize = PyString_GET_SIZE(x);
2567
2568 if (targetsize == 1)
2569 /* 1-1 mapping */
2570 *s++ = *PyString_AS_STRING(x);
2571
2572 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002574 if (targetsize > extrachars) {
2575 /* resize first */
2576 int oldpos = (int)(s - PyString_AS_STRING(v));
2577 int needed = (targetsize - extrachars) + \
2578 (targetsize << 2);
2579 extrachars += needed;
2580 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002581 Py_DECREF(x);
2582 goto onError;
2583 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002584 s = PyString_AS_STRING(v) + oldpos;
2585 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002586 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002587 s += targetsize;
2588 extrachars -= targetsize;
2589 }
2590 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 }
2592 else {
2593 /* wrong return value */
2594 PyErr_SetString(PyExc_TypeError,
2595 "character mapping must return integer, None or unicode");
2596 Py_DECREF(x);
2597 goto onError;
2598 }
2599 Py_DECREF(x);
2600 }
2601 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2602 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2603 goto onError;
2604 return v;
2605
2606 onError:
2607 Py_DECREF(v);
2608 return NULL;
2609}
2610
2611PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2612 PyObject *mapping)
2613{
2614 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2615 PyErr_BadArgument();
2616 return NULL;
2617 }
2618 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2619 PyUnicode_GET_SIZE(unicode),
2620 mapping,
2621 NULL);
2622}
2623
2624static
2625int translate_error(const Py_UNICODE **source,
2626 Py_UNICODE **dest,
2627 const char *errors,
2628 const char *details)
2629{
2630 if ((errors == NULL) ||
2631 (strcmp(errors,"strict") == 0)) {
2632 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002633 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 details);
2635 return -1;
2636 }
2637 else if (strcmp(errors,"ignore") == 0) {
2638 return 0;
2639 }
2640 else if (strcmp(errors,"replace") == 0) {
2641 **dest = '?';
2642 (*dest)++;
2643 return 0;
2644 }
2645 else {
2646 PyErr_Format(PyExc_ValueError,
2647 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002648 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 errors);
2650 return -1;
2651 }
2652}
2653
2654PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2655 int size,
2656 PyObject *mapping,
2657 const char *errors)
2658{
2659 PyUnicodeObject *v;
2660 Py_UNICODE *p;
2661
2662 if (mapping == NULL) {
2663 PyErr_BadArgument();
2664 return NULL;
2665 }
2666
2667 /* Output will never be longer than input */
2668 v = _PyUnicode_New(size);
2669 if (v == NULL)
2670 goto onError;
2671 if (size == 0)
2672 goto done;
2673 p = PyUnicode_AS_UNICODE(v);
2674 while (size-- > 0) {
2675 Py_UNICODE ch = *s++;
2676 PyObject *w, *x;
2677
2678 /* Get mapping */
2679 w = PyInt_FromLong(ch);
2680 if (w == NULL)
2681 goto onError;
2682 x = PyObject_GetItem(mapping, w);
2683 Py_DECREF(w);
2684 if (x == NULL) {
2685 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2686 /* No mapping found: default to 1-1 mapping */
2687 PyErr_Clear();
2688 *p++ = ch;
2689 continue;
2690 }
2691 goto onError;
2692 }
2693
2694 /* Apply mapping */
2695 if (PyInt_Check(x))
2696 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2697 else if (x == Py_None) {
2698 /* undefined mapping */
2699 if (translate_error(&s, &p, errors,
2700 "character maps to <undefined>")) {
2701 Py_DECREF(x);
2702 goto onError;
2703 }
2704 }
2705 else if (PyUnicode_Check(x)) {
2706 if (PyUnicode_GET_SIZE(x) != 1) {
2707 /* 1-n mapping */
2708 PyErr_SetString(PyExc_NotImplementedError,
2709 "1-n mappings are currently not implemented");
2710 Py_DECREF(x);
2711 goto onError;
2712 }
2713 *p++ = *PyUnicode_AS_UNICODE(x);
2714 }
2715 else {
2716 /* wrong return value */
2717 PyErr_SetString(PyExc_TypeError,
2718 "translate mapping must return integer, None or unicode");
2719 Py_DECREF(x);
2720 goto onError;
2721 }
2722 Py_DECREF(x);
2723 }
2724 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002725 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002726 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727
2728 done:
2729 return (PyObject *)v;
2730
2731 onError:
2732 Py_XDECREF(v);
2733 return NULL;
2734}
2735
2736PyObject *PyUnicode_Translate(PyObject *str,
2737 PyObject *mapping,
2738 const char *errors)
2739{
2740 PyObject *result;
2741
2742 str = PyUnicode_FromObject(str);
2743 if (str == NULL)
2744 goto onError;
2745 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2746 PyUnicode_GET_SIZE(str),
2747 mapping,
2748 errors);
2749 Py_DECREF(str);
2750 return result;
2751
2752 onError:
2753 Py_XDECREF(str);
2754 return NULL;
2755}
2756
Guido van Rossum9e896b32000-04-05 20:11:21 +00002757/* --- Decimal Encoder ---------------------------------------------------- */
2758
2759int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2760 int length,
2761 char *output,
2762 const char *errors)
2763{
2764 Py_UNICODE *p, *end;
2765
2766 if (output == NULL) {
2767 PyErr_BadArgument();
2768 return -1;
2769 }
2770
2771 p = s;
2772 end = s + length;
2773 while (p < end) {
2774 register Py_UNICODE ch = *p++;
2775 int decimal;
2776
2777 if (Py_UNICODE_ISSPACE(ch)) {
2778 *output++ = ' ';
2779 continue;
2780 }
2781 decimal = Py_UNICODE_TODECIMAL(ch);
2782 if (decimal >= 0) {
2783 *output++ = '0' + decimal;
2784 continue;
2785 }
Guido van Rossumba477042000-04-06 18:18:10 +00002786 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002787 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002788 continue;
2789 }
2790 /* All other characters are considered invalid */
2791 if (errors == NULL || strcmp(errors, "strict") == 0) {
2792 PyErr_SetString(PyExc_ValueError,
2793 "invalid decimal Unicode string");
2794 goto onError;
2795 }
2796 else if (strcmp(errors, "ignore") == 0)
2797 continue;
2798 else if (strcmp(errors, "replace") == 0) {
2799 *output++ = '?';
2800 continue;
2801 }
2802 }
2803 /* 0-terminate the output string */
2804 *output++ = '\0';
2805 return 0;
2806
2807 onError:
2808 return -1;
2809}
2810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811/* --- Helpers ------------------------------------------------------------ */
2812
2813static
2814int count(PyUnicodeObject *self,
2815 int start,
2816 int end,
2817 PyUnicodeObject *substring)
2818{
2819 int count = 0;
2820
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002821 if (start < 0)
2822 start += self->length;
2823 if (start < 0)
2824 start = 0;
2825 if (end > self->length)
2826 end = self->length;
2827 if (end < 0)
2828 end += self->length;
2829 if (end < 0)
2830 end = 0;
2831
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002832 if (substring->length == 0)
2833 return (end - start + 1);
2834
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 end -= substring->length;
2836
2837 while (start <= end)
2838 if (Py_UNICODE_MATCH(self, start, substring)) {
2839 count++;
2840 start += substring->length;
2841 } else
2842 start++;
2843
2844 return count;
2845}
2846
2847int PyUnicode_Count(PyObject *str,
2848 PyObject *substr,
2849 int start,
2850 int end)
2851{
2852 int result;
2853
2854 str = PyUnicode_FromObject(str);
2855 if (str == NULL)
2856 return -1;
2857 substr = PyUnicode_FromObject(substr);
2858 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002859 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 return -1;
2861 }
2862
2863 result = count((PyUnicodeObject *)str,
2864 start, end,
2865 (PyUnicodeObject *)substr);
2866
2867 Py_DECREF(str);
2868 Py_DECREF(substr);
2869 return result;
2870}
2871
2872static
2873int findstring(PyUnicodeObject *self,
2874 PyUnicodeObject *substring,
2875 int start,
2876 int end,
2877 int direction)
2878{
2879 if (start < 0)
2880 start += self->length;
2881 if (start < 0)
2882 start = 0;
2883
2884 if (substring->length == 0)
2885 return start;
2886
2887 if (end > self->length)
2888 end = self->length;
2889 if (end < 0)
2890 end += self->length;
2891 if (end < 0)
2892 end = 0;
2893
2894 end -= substring->length;
2895
2896 if (direction < 0) {
2897 for (; end >= start; end--)
2898 if (Py_UNICODE_MATCH(self, end, substring))
2899 return end;
2900 } else {
2901 for (; start <= end; start++)
2902 if (Py_UNICODE_MATCH(self, start, substring))
2903 return start;
2904 }
2905
2906 return -1;
2907}
2908
2909int PyUnicode_Find(PyObject *str,
2910 PyObject *substr,
2911 int start,
2912 int end,
2913 int direction)
2914{
2915 int result;
2916
2917 str = PyUnicode_FromObject(str);
2918 if (str == NULL)
2919 return -1;
2920 substr = PyUnicode_FromObject(substr);
2921 if (substr == NULL) {
2922 Py_DECREF(substr);
2923 return -1;
2924 }
2925
2926 result = findstring((PyUnicodeObject *)str,
2927 (PyUnicodeObject *)substr,
2928 start, end, direction);
2929 Py_DECREF(str);
2930 Py_DECREF(substr);
2931 return result;
2932}
2933
2934static
2935int tailmatch(PyUnicodeObject *self,
2936 PyUnicodeObject *substring,
2937 int start,
2938 int end,
2939 int direction)
2940{
2941 if (start < 0)
2942 start += self->length;
2943 if (start < 0)
2944 start = 0;
2945
2946 if (substring->length == 0)
2947 return 1;
2948
2949 if (end > self->length)
2950 end = self->length;
2951 if (end < 0)
2952 end += self->length;
2953 if (end < 0)
2954 end = 0;
2955
2956 end -= substring->length;
2957 if (end < start)
2958 return 0;
2959
2960 if (direction > 0) {
2961 if (Py_UNICODE_MATCH(self, end, substring))
2962 return 1;
2963 } else {
2964 if (Py_UNICODE_MATCH(self, start, substring))
2965 return 1;
2966 }
2967
2968 return 0;
2969}
2970
2971int PyUnicode_Tailmatch(PyObject *str,
2972 PyObject *substr,
2973 int start,
2974 int end,
2975 int direction)
2976{
2977 int result;
2978
2979 str = PyUnicode_FromObject(str);
2980 if (str == NULL)
2981 return -1;
2982 substr = PyUnicode_FromObject(substr);
2983 if (substr == NULL) {
2984 Py_DECREF(substr);
2985 return -1;
2986 }
2987
2988 result = tailmatch((PyUnicodeObject *)str,
2989 (PyUnicodeObject *)substr,
2990 start, end, direction);
2991 Py_DECREF(str);
2992 Py_DECREF(substr);
2993 return result;
2994}
2995
2996static
2997const Py_UNICODE *findchar(const Py_UNICODE *s,
2998 int size,
2999 Py_UNICODE ch)
3000{
3001 /* like wcschr, but doesn't stop at NULL characters */
3002
3003 while (size-- > 0) {
3004 if (*s == ch)
3005 return s;
3006 s++;
3007 }
3008
3009 return NULL;
3010}
3011
3012/* Apply fixfct filter to the Unicode object self and return a
3013 reference to the modified object */
3014
3015static
3016PyObject *fixup(PyUnicodeObject *self,
3017 int (*fixfct)(PyUnicodeObject *s))
3018{
3019
3020 PyUnicodeObject *u;
3021
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003022 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 if (u == NULL)
3024 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003025
3026 Py_UNICODE_COPY(u->str, self->str, self->length);
3027
Tim Peters7a29bd52001-09-12 03:03:31 +00003028 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 /* fixfct should return TRUE if it modified the buffer. If
3030 FALSE, return a reference to the original buffer instead
3031 (to save space, not time) */
3032 Py_INCREF(self);
3033 Py_DECREF(u);
3034 return (PyObject*) self;
3035 }
3036 return (PyObject*) u;
3037}
3038
3039static
3040int fixupper(PyUnicodeObject *self)
3041{
3042 int len = self->length;
3043 Py_UNICODE *s = self->str;
3044 int status = 0;
3045
3046 while (len-- > 0) {
3047 register Py_UNICODE ch;
3048
3049 ch = Py_UNICODE_TOUPPER(*s);
3050 if (ch != *s) {
3051 status = 1;
3052 *s = ch;
3053 }
3054 s++;
3055 }
3056
3057 return status;
3058}
3059
3060static
3061int fixlower(PyUnicodeObject *self)
3062{
3063 int len = self->length;
3064 Py_UNICODE *s = self->str;
3065 int status = 0;
3066
3067 while (len-- > 0) {
3068 register Py_UNICODE ch;
3069
3070 ch = Py_UNICODE_TOLOWER(*s);
3071 if (ch != *s) {
3072 status = 1;
3073 *s = ch;
3074 }
3075 s++;
3076 }
3077
3078 return status;
3079}
3080
3081static
3082int fixswapcase(PyUnicodeObject *self)
3083{
3084 int len = self->length;
3085 Py_UNICODE *s = self->str;
3086 int status = 0;
3087
3088 while (len-- > 0) {
3089 if (Py_UNICODE_ISUPPER(*s)) {
3090 *s = Py_UNICODE_TOLOWER(*s);
3091 status = 1;
3092 } else if (Py_UNICODE_ISLOWER(*s)) {
3093 *s = Py_UNICODE_TOUPPER(*s);
3094 status = 1;
3095 }
3096 s++;
3097 }
3098
3099 return status;
3100}
3101
3102static
3103int fixcapitalize(PyUnicodeObject *self)
3104{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003105 int len = self->length;
3106 Py_UNICODE *s = self->str;
3107 int status = 0;
3108
3109 if (len == 0)
3110 return 0;
3111 if (Py_UNICODE_ISLOWER(*s)) {
3112 *s = Py_UNICODE_TOUPPER(*s);
3113 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003115 s++;
3116 while (--len > 0) {
3117 if (Py_UNICODE_ISUPPER(*s)) {
3118 *s = Py_UNICODE_TOLOWER(*s);
3119 status = 1;
3120 }
3121 s++;
3122 }
3123 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124}
3125
3126static
3127int fixtitle(PyUnicodeObject *self)
3128{
3129 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3130 register Py_UNICODE *e;
3131 int previous_is_cased;
3132
3133 /* Shortcut for single character strings */
3134 if (PyUnicode_GET_SIZE(self) == 1) {
3135 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3136 if (*p != ch) {
3137 *p = ch;
3138 return 1;
3139 }
3140 else
3141 return 0;
3142 }
3143
3144 e = p + PyUnicode_GET_SIZE(self);
3145 previous_is_cased = 0;
3146 for (; p < e; p++) {
3147 register const Py_UNICODE ch = *p;
3148
3149 if (previous_is_cased)
3150 *p = Py_UNICODE_TOLOWER(ch);
3151 else
3152 *p = Py_UNICODE_TOTITLE(ch);
3153
3154 if (Py_UNICODE_ISLOWER(ch) ||
3155 Py_UNICODE_ISUPPER(ch) ||
3156 Py_UNICODE_ISTITLE(ch))
3157 previous_is_cased = 1;
3158 else
3159 previous_is_cased = 0;
3160 }
3161 return 1;
3162}
3163
3164PyObject *PyUnicode_Join(PyObject *separator,
3165 PyObject *seq)
3166{
3167 Py_UNICODE *sep;
3168 int seplen;
3169 PyUnicodeObject *res = NULL;
3170 int reslen = 0;
3171 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 int sz = 100;
3173 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003174 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
Tim Peters2cfe3682001-05-05 05:36:48 +00003176 it = PyObject_GetIter(seq);
3177 if (it == NULL)
3178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179
3180 if (separator == NULL) {
3181 Py_UNICODE blank = ' ';
3182 sep = &blank;
3183 seplen = 1;
3184 }
3185 else {
3186 separator = PyUnicode_FromObject(separator);
3187 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 sep = PyUnicode_AS_UNICODE(separator);
3190 seplen = PyUnicode_GET_SIZE(separator);
3191 }
3192
3193 res = _PyUnicode_New(sz);
3194 if (res == NULL)
3195 goto onError;
3196 p = PyUnicode_AS_UNICODE(res);
3197 reslen = 0;
3198
Tim Peters2cfe3682001-05-05 05:36:48 +00003199 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003201 PyObject *item = PyIter_Next(it);
3202 if (item == NULL) {
3203 if (PyErr_Occurred())
3204 goto onError;
3205 break;
3206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (!PyUnicode_Check(item)) {
3208 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003209 if (!PyString_Check(item)) {
3210 PyErr_Format(PyExc_TypeError,
3211 "sequence item %i: expected string or Unicode,"
3212 " %.80s found",
3213 i, item->ob_type->tp_name);
3214 Py_DECREF(item);
3215 goto onError;
3216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 v = PyUnicode_FromObject(item);
3218 Py_DECREF(item);
3219 item = v;
3220 if (item == NULL)
3221 goto onError;
3222 }
3223 itemlen = PyUnicode_GET_SIZE(item);
3224 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003225 if (_PyUnicode_Resize(&res, sz*2)) {
3226 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 sz *= 2;
3230 p = PyUnicode_AS_UNICODE(res) + reslen;
3231 }
3232 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003233 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 p += seplen;
3235 reslen += seplen;
3236 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003237 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 p += itemlen;
3239 reslen += itemlen;
3240 Py_DECREF(item);
3241 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003242 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 goto onError;
3244
3245 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003246 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return (PyObject *)res;
3248
3249 onError:
3250 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003251 Py_XDECREF(res);
3252 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 return NULL;
3254}
3255
3256static
3257PyUnicodeObject *pad(PyUnicodeObject *self,
3258 int left,
3259 int right,
3260 Py_UNICODE fill)
3261{
3262 PyUnicodeObject *u;
3263
3264 if (left < 0)
3265 left = 0;
3266 if (right < 0)
3267 right = 0;
3268
Tim Peters7a29bd52001-09-12 03:03:31 +00003269 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 Py_INCREF(self);
3271 return self;
3272 }
3273
3274 u = _PyUnicode_New(left + self->length + right);
3275 if (u) {
3276 if (left)
3277 Py_UNICODE_FILL(u->str, fill, left);
3278 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3279 if (right)
3280 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3281 }
3282
3283 return u;
3284}
3285
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   The macro relies on its expansion site providing a PyObject *str
   scratch variable, the PyObject *list being built and an onError label,
   which is jumped to if the slice cannot be created or appended.

   It is wrapped in do { } while (0) so that it behaves as one statement,
   and its arguments are parenthesized so that arbitrary expressions can
   be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
3296
3297static
3298PyObject *split_whitespace(PyUnicodeObject *self,
3299 PyObject *list,
3300 int maxcount)
3301{
3302 register int i;
3303 register int j;
3304 int len = self->length;
3305 PyObject *str;
3306
3307 for (i = j = 0; i < len; ) {
3308 /* find a token */
3309 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3310 i++;
3311 j = i;
3312 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3313 i++;
3314 if (j < i) {
3315 if (maxcount-- <= 0)
3316 break;
3317 SPLIT_APPEND(self->str, j, i);
3318 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3319 i++;
3320 j = i;
3321 }
3322 }
3323 if (j < len) {
3324 SPLIT_APPEND(self->str, j, len);
3325 }
3326 return list;
3327
3328 onError:
3329 Py_DECREF(list);
3330 return NULL;
3331}
3332
3333PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003334 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335{
3336 register int i;
3337 register int j;
3338 int len;
3339 PyObject *list;
3340 PyObject *str;
3341 Py_UNICODE *data;
3342
3343 string = PyUnicode_FromObject(string);
3344 if (string == NULL)
3345 return NULL;
3346 data = PyUnicode_AS_UNICODE(string);
3347 len = PyUnicode_GET_SIZE(string);
3348
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 list = PyList_New(0);
3350 if (!list)
3351 goto onError;
3352
3353 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003354 int eol;
3355
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 /* Find a line and append it */
3357 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3358 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003361 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 if (i < len) {
3363 if (data[i] == '\r' && i + 1 < len &&
3364 data[i+1] == '\n')
3365 i += 2;
3366 else
3367 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003368 if (keepends)
3369 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370 }
Guido van Rossum86662912000-04-11 15:38:46 +00003371 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 j = i;
3373 }
3374 if (j < len) {
3375 SPLIT_APPEND(data, j, len);
3376 }
3377
3378 Py_DECREF(string);
3379 return list;
3380
3381 onError:
3382 Py_DECREF(list);
3383 Py_DECREF(string);
3384 return NULL;
3385}
3386
3387static
3388PyObject *split_char(PyUnicodeObject *self,
3389 PyObject *list,
3390 Py_UNICODE ch,
3391 int maxcount)
3392{
3393 register int i;
3394 register int j;
3395 int len = self->length;
3396 PyObject *str;
3397
3398 for (i = j = 0; i < len; ) {
3399 if (self->str[i] == ch) {
3400 if (maxcount-- <= 0)
3401 break;
3402 SPLIT_APPEND(self->str, j, i);
3403 i = j = i + 1;
3404 } else
3405 i++;
3406 }
3407 if (j <= len) {
3408 SPLIT_APPEND(self->str, j, len);
3409 }
3410 return list;
3411
3412 onError:
3413 Py_DECREF(list);
3414 return NULL;
3415}
3416
3417static
3418PyObject *split_substring(PyUnicodeObject *self,
3419 PyObject *list,
3420 PyUnicodeObject *substring,
3421 int maxcount)
3422{
3423 register int i;
3424 register int j;
3425 int len = self->length;
3426 int sublen = substring->length;
3427 PyObject *str;
3428
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003429 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 if (Py_UNICODE_MATCH(self, i, substring)) {
3431 if (maxcount-- <= 0)
3432 break;
3433 SPLIT_APPEND(self->str, j, i);
3434 i = j = i + sublen;
3435 } else
3436 i++;
3437 }
3438 if (j <= len) {
3439 SPLIT_APPEND(self->str, j, len);
3440 }
3441 return list;
3442
3443 onError:
3444 Py_DECREF(list);
3445 return NULL;
3446}
3447
3448#undef SPLIT_APPEND
3449
3450static
3451PyObject *split(PyUnicodeObject *self,
3452 PyUnicodeObject *substring,
3453 int maxcount)
3454{
3455 PyObject *list;
3456
3457 if (maxcount < 0)
3458 maxcount = INT_MAX;
3459
3460 list = PyList_New(0);
3461 if (!list)
3462 return NULL;
3463
3464 if (substring == NULL)
3465 return split_whitespace(self,list,maxcount);
3466
3467 else if (substring->length == 1)
3468 return split_char(self,list,substring->str[0],maxcount);
3469
3470 else if (substring->length == 0) {
3471 Py_DECREF(list);
3472 PyErr_SetString(PyExc_ValueError, "empty separator");
3473 return NULL;
3474 }
3475 else
3476 return split_substring(self,list,substring,maxcount);
3477}
3478
3479static
3480PyObject *strip(PyUnicodeObject *self,
3481 int left,
3482 int right)
3483{
3484 Py_UNICODE *p = self->str;
3485 int start = 0;
3486 int end = self->length;
3487
3488 if (left)
3489 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3490 start++;
3491
3492 if (right)
3493 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3494 end--;
3495
Tim Peters7a29bd52001-09-12 03:03:31 +00003496 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 /* couldn't strip anything off, return original string */
3498 Py_INCREF(self);
3499 return (PyObject*) self;
3500 }
3501
3502 return (PyObject*) PyUnicode_FromUnicode(
3503 self->str + start,
3504 end - start
3505 );
3506}
3507
3508static
3509PyObject *replace(PyUnicodeObject *self,
3510 PyUnicodeObject *str1,
3511 PyUnicodeObject *str2,
3512 int maxcount)
3513{
3514 PyUnicodeObject *u;
3515
3516 if (maxcount < 0)
3517 maxcount = INT_MAX;
3518
3519 if (str1->length == 1 && str2->length == 1) {
3520 int i;
3521
3522 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003523 if (!findchar(self->str, self->length, str1->str[0]) &&
3524 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 /* nothing to replace, return original string */
3526 Py_INCREF(self);
3527 u = self;
3528 } else {
3529 Py_UNICODE u1 = str1->str[0];
3530 Py_UNICODE u2 = str2->str[0];
3531
3532 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003533 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 self->length
3535 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003536 if (u != NULL) {
3537 Py_UNICODE_COPY(u->str, self->str,
3538 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 for (i = 0; i < u->length; i++)
3540 if (u->str[i] == u1) {
3541 if (--maxcount < 0)
3542 break;
3543 u->str[i] = u2;
3544 }
3545 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547
3548 } else {
3549 int n, i;
3550 Py_UNICODE *p;
3551
3552 /* replace strings */
3553 n = count(self, 0, self->length, str1);
3554 if (n > maxcount)
3555 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003556 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 /* nothing to replace, return original string */
3558 Py_INCREF(self);
3559 u = self;
3560 } else {
3561 u = _PyUnicode_New(
3562 self->length + n * (str2->length - str1->length));
3563 if (u) {
3564 i = 0;
3565 p = u->str;
3566 while (i <= self->length - str1->length)
3567 if (Py_UNICODE_MATCH(self, i, str1)) {
3568 /* replace string segment */
3569 Py_UNICODE_COPY(p, str2->str, str2->length);
3570 p += str2->length;
3571 i += str1->length;
3572 if (--n <= 0) {
3573 /* copy remaining part */
3574 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3575 break;
3576 }
3577 } else
3578 *p++ = self->str[i++];
3579 }
3580 }
3581 }
3582
3583 return (PyObject *) u;
3584}
3585
3586/* --- Unicode Object Methods --------------------------------------------- */
3587
3588static char title__doc__[] =
3589"S.title() -> unicode\n\
3590\n\
3591Return a titlecased version of S, i.e. words start with title case\n\
3592characters, all remaining cased characters have lower case.";
3593
3594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003595unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 return fixup(self, fixtitle);
3598}
3599
3600static char capitalize__doc__[] =
3601"S.capitalize() -> unicode\n\
3602\n\
3603Return a capitalized version of S, i.e. make the first character\n\
3604have upper case.";
3605
3606static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003607unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 return fixup(self, fixcapitalize);
3610}
3611
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method implementation: split self at whitespace, capitalize
   every word with fixup()/fixcapitalize and join the words again with
   single blanks.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3649
3650static char center__doc__[] =
3651"S.center(width) -> unicode\n\
3652\n\
3653Return S centered in a Unicode string of length width. Padding is done\n\
3654using spaces.";
3655
3656static PyObject *
3657unicode_center(PyUnicodeObject *self, PyObject *args)
3658{
3659 int marg, left;
3660 int width;
3661
3662 if (!PyArg_ParseTuple(args, "i:center", &width))
3663 return NULL;
3664
Tim Peters7a29bd52001-09-12 03:03:31 +00003665 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 Py_INCREF(self);
3667 return (PyObject*) self;
3668 }
3669
3670 marg = width - self->length;
3671 left = marg / 2 + (marg & width & 1);
3672
3673 return (PyObject*) pad(self, left, marg - left, ' ');
3674}
3675
Marc-André Lemburge5034372000-08-08 08:04:29 +00003676#if 0
3677
3678/* This code should go into some future Unicode collation support
3679 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003680 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003681
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003682/* speedy UTF-16 code point order comparison */
3683/* gleaned from: */
3684/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3685
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003686static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003687{
3688 0, 0, 0, 0, 0, 0, 0, 0,
3689 0, 0, 0, 0, 0, 0, 0, 0,
3690 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003691 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003692};
3693
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694static int
3695unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3696{
3697 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003698
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 Py_UNICODE *s1 = str1->str;
3700 Py_UNICODE *s2 = str2->str;
3701
3702 len1 = str1->length;
3703 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003704
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003706 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003707
3708 c1 = *s1++;
3709 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003710
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003711 if (c1 > (1<<11) * 26)
3712 c1 += utf16Fixup[c1>>11];
3713 if (c2 > (1<<11) * 26)
3714 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003715 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003716
3717 if (c1 != c2)
3718 return (c1 < c2) ? -1 : 1;
3719
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003720 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 }
3722
3723 return (len1 < len2) ? -1 : (len1 != len2);
3724}
3725
Marc-André Lemburge5034372000-08-08 08:04:29 +00003726#else
3727
3728static int
3729unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3730{
3731 register int len1, len2;
3732
3733 Py_UNICODE *s1 = str1->str;
3734 Py_UNICODE *s2 = str2->str;
3735
3736 len1 = str1->length;
3737 len2 = str2->length;
3738
3739 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003740 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003741
Fredrik Lundh45714e92001-06-26 16:39:36 +00003742 c1 = *s1++;
3743 c2 = *s2++;
3744
3745 if (c1 != c2)
3746 return (c1 < c2) ? -1 : 1;
3747
Marc-André Lemburge5034372000-08-08 08:04:29 +00003748 len1--; len2--;
3749 }
3750
3751 return (len1 < len2) ? -1 : (len1 != len2);
3752}
3753
3754#endif
3755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756int PyUnicode_Compare(PyObject *left,
3757 PyObject *right)
3758{
3759 PyUnicodeObject *u = NULL, *v = NULL;
3760 int result;
3761
3762 /* Coerce the two arguments */
3763 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3764 if (u == NULL)
3765 goto onError;
3766 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3767 if (v == NULL)
3768 goto onError;
3769
Thomas Wouters7e474022000-07-16 12:04:32 +00003770 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 if (v == u) {
3772 Py_DECREF(u);
3773 Py_DECREF(v);
3774 return 0;
3775 }
3776
3777 result = unicode_compare(u, v);
3778
3779 Py_DECREF(u);
3780 Py_DECREF(v);
3781 return result;
3782
3783onError:
3784 Py_XDECREF(u);
3785 Py_XDECREF(v);
3786 return -1;
3787}
3788
Guido van Rossum403d68b2000-03-13 15:55:09 +00003789int PyUnicode_Contains(PyObject *container,
3790 PyObject *element)
3791{
3792 PyUnicodeObject *u = NULL, *v = NULL;
3793 int result;
3794 register const Py_UNICODE *p, *e;
3795 register Py_UNICODE ch;
3796
3797 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003798 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003799 if (v == NULL) {
3800 PyErr_SetString(PyExc_TypeError,
3801 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003802 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003803 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003804 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3805 if (u == NULL) {
3806 Py_DECREF(v);
3807 goto onError;
3808 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003809
3810 /* Check v in u */
3811 if (PyUnicode_GET_SIZE(v) != 1) {
3812 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003813 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003814 goto onError;
3815 }
3816 ch = *PyUnicode_AS_UNICODE(v);
3817 p = PyUnicode_AS_UNICODE(u);
3818 e = p + PyUnicode_GET_SIZE(u);
3819 result = 0;
3820 while (p < e) {
3821 if (*p++ == ch) {
3822 result = 1;
3823 break;
3824 }
3825 }
3826
3827 Py_DECREF(u);
3828 Py_DECREF(v);
3829 return result;
3830
3831onError:
3832 Py_XDECREF(u);
3833 Py_XDECREF(v);
3834 return -1;
3835}
3836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837/* Concat to string or Unicode object giving a new Unicode object. */
3838
3839PyObject *PyUnicode_Concat(PyObject *left,
3840 PyObject *right)
3841{
3842 PyUnicodeObject *u = NULL, *v = NULL, *w;
3843
3844 /* Coerce the two arguments */
3845 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3846 if (u == NULL)
3847 goto onError;
3848 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3849 if (v == NULL)
3850 goto onError;
3851
3852 /* Shortcuts */
3853 if (v == unicode_empty) {
3854 Py_DECREF(v);
3855 return (PyObject *)u;
3856 }
3857 if (u == unicode_empty) {
3858 Py_DECREF(u);
3859 return (PyObject *)v;
3860 }
3861
3862 /* Concat the two Unicode strings */
3863 w = _PyUnicode_New(u->length + v->length);
3864 if (w == NULL)
3865 goto onError;
3866 Py_UNICODE_COPY(w->str, u->str, u->length);
3867 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3868
3869 Py_DECREF(u);
3870 Py_DECREF(v);
3871 return (PyObject *)w;
3872
3873onError:
3874 Py_XDECREF(u);
3875 Py_XDECREF(v);
3876 return NULL;
3877}
3878
3879static char count__doc__[] =
3880"S.count(sub[, start[, end]]) -> int\n\
3881\n\
3882Return the number of occurrences of substring sub in Unicode string\n\
3883S[start:end]. Optional arguments start and end are\n\
3884interpreted as in slice notation.";
3885
3886static PyObject *
3887unicode_count(PyUnicodeObject *self, PyObject *args)
3888{
3889 PyUnicodeObject *substring;
3890 int start = 0;
3891 int end = INT_MAX;
3892 PyObject *result;
3893
Guido van Rossumb8872e62000-05-09 14:14:27 +00003894 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3895 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 return NULL;
3897
3898 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3899 (PyObject *)substring);
3900 if (substring == NULL)
3901 return NULL;
3902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 if (start < 0)
3904 start += self->length;
3905 if (start < 0)
3906 start = 0;
3907 if (end > self->length)
3908 end = self->length;
3909 if (end < 0)
3910 end += self->length;
3911 if (end < 0)
3912 end = 0;
3913
3914 result = PyInt_FromLong((long) count(self, start, end, substring));
3915
3916 Py_DECREF(substring);
3917 return result;
3918}
3919
3920static char encode__doc__[] =
3921"S.encode([encoding[,errors]]) -> string\n\
3922\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003923Return an encoded string version of S. Default encoding is the current\n\
3924default string encoding. errors may be given to set a different error\n\
3925handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3926a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927
3928static PyObject *
3929unicode_encode(PyUnicodeObject *self, PyObject *args)
3930{
3931 char *encoding = NULL;
3932 char *errors = NULL;
3933 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3934 return NULL;
3935 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3936}
3937
3938static char expandtabs__doc__[] =
3939"S.expandtabs([tabsize]) -> unicode\n\
3940\n\
3941Return a copy of S where all tab characters are expanded using spaces.\n\
3942If tabsize is not given, a tab size of 8 characters is assumed.";
3943
3944static PyObject*
3945unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3946{
3947 Py_UNICODE *e;
3948 Py_UNICODE *p;
3949 Py_UNICODE *q;
3950 int i, j;
3951 PyUnicodeObject *u;
3952 int tabsize = 8;
3953
3954 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3955 return NULL;
3956
Thomas Wouters7e474022000-07-16 12:04:32 +00003957 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 i = j = 0;
3959 e = self->str + self->length;
3960 for (p = self->str; p < e; p++)
3961 if (*p == '\t') {
3962 if (tabsize > 0)
3963 j += tabsize - (j % tabsize);
3964 }
3965 else {
3966 j++;
3967 if (*p == '\n' || *p == '\r') {
3968 i += j;
3969 j = 0;
3970 }
3971 }
3972
3973 /* Second pass: create output string and fill it */
3974 u = _PyUnicode_New(i + j);
3975 if (!u)
3976 return NULL;
3977
3978 j = 0;
3979 q = u->str;
3980
3981 for (p = self->str; p < e; p++)
3982 if (*p == '\t') {
3983 if (tabsize > 0) {
3984 i = tabsize - (j % tabsize);
3985 j += i;
3986 while (i--)
3987 *q++ = ' ';
3988 }
3989 }
3990 else {
3991 j++;
3992 *q++ = *p;
3993 if (*p == '\n' || *p == '\r')
3994 j = 0;
3995 }
3996
3997 return (PyObject*) u;
3998}
3999
4000static char find__doc__[] =
4001"S.find(sub [,start [,end]]) -> int\n\
4002\n\
4003Return the lowest index in S where substring sub is found,\n\
4004such that sub is contained within s[start,end]. Optional\n\
4005arguments start and end are interpreted as in slice notation.\n\
4006\n\
4007Return -1 on failure.";
4008
4009static PyObject *
4010unicode_find(PyUnicodeObject *self, PyObject *args)
4011{
4012 PyUnicodeObject *substring;
4013 int start = 0;
4014 int end = INT_MAX;
4015 PyObject *result;
4016
Guido van Rossumb8872e62000-05-09 14:14:27 +00004017 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4018 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 return NULL;
4020 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4021 (PyObject *)substring);
4022 if (substring == NULL)
4023 return NULL;
4024
4025 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4026
4027 Py_DECREF(substring);
4028 return result;
4029}
4030
4031static PyObject *
4032unicode_getitem(PyUnicodeObject *self, int index)
4033{
4034 if (index < 0 || index >= self->length) {
4035 PyErr_SetString(PyExc_IndexError, "string index out of range");
4036 return NULL;
4037 }
4038
4039 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4040}
4041
4042static long
4043unicode_hash(PyUnicodeObject *self)
4044{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004045 /* Since Unicode objects compare equal to their ASCII string
4046 counterparts, they should use the individual character values
4047 as basis for their hash value. This is needed to assure that
4048 strings and Unicode objects behave in the same way as
4049 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050
Fredrik Lundhdde61642000-07-10 18:27:47 +00004051 register int len;
4052 register Py_UNICODE *p;
4053 register long x;
4054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 if (self->hash != -1)
4056 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004057 len = PyUnicode_GET_SIZE(self);
4058 p = PyUnicode_AS_UNICODE(self);
4059 x = *p << 7;
4060 while (--len >= 0)
4061 x = (1000003*x) ^ *p++;
4062 x ^= PyUnicode_GET_SIZE(self);
4063 if (x == -1)
4064 x = -2;
4065 self->hash = x;
4066 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067}
4068
4069static char index__doc__[] =
4070"S.index(sub [,start [,end]]) -> int\n\
4071\n\
4072Like S.find() but raise ValueError when the substring is not found.";
4073
4074static PyObject *
4075unicode_index(PyUnicodeObject *self, PyObject *args)
4076{
4077 int result;
4078 PyUnicodeObject *substring;
4079 int start = 0;
4080 int end = INT_MAX;
4081
Guido van Rossumb8872e62000-05-09 14:14:27 +00004082 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4083 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 return NULL;
4085
4086 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4087 (PyObject *)substring);
4088 if (substring == NULL)
4089 return NULL;
4090
4091 result = findstring(self, substring, start, end, 1);
4092
4093 Py_DECREF(substring);
4094 if (result < 0) {
4095 PyErr_SetString(PyExc_ValueError, "substring not found");
4096 return NULL;
4097 }
4098 return PyInt_FromLong(result);
4099}
4100
4101static char islower__doc__[] =
4102"S.islower() -> int\n\
4103\n\
4104Return 1 if all cased characters in S are lowercase and there is\n\
4105at least one cased character in S, 0 otherwise.";
4106
4107static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004108unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109{
4110 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4111 register const Py_UNICODE *e;
4112 int cased;
4113
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 /* Shortcut for single character strings */
4115 if (PyUnicode_GET_SIZE(self) == 1)
4116 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4117
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004118 /* Special case for empty strings */
4119 if (PyString_GET_SIZE(self) == 0)
4120 return PyInt_FromLong(0);
4121
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 e = p + PyUnicode_GET_SIZE(self);
4123 cased = 0;
4124 for (; p < e; p++) {
4125 register const Py_UNICODE ch = *p;
4126
4127 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4128 return PyInt_FromLong(0);
4129 else if (!cased && Py_UNICODE_ISLOWER(ch))
4130 cased = 1;
4131 }
4132 return PyInt_FromLong(cased);
4133}
4134
4135static char isupper__doc__[] =
4136"S.isupper() -> int\n\
4137\n\
4138Return 1 if all cased characters in S are uppercase and there is\n\
4139at least one cased character in S, 0 otherwise.";
4140
4141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004142unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143{
4144 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4145 register const Py_UNICODE *e;
4146 int cased;
4147
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 /* Shortcut for single character strings */
4149 if (PyUnicode_GET_SIZE(self) == 1)
4150 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4151
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004152 /* Special case for empty strings */
4153 if (PyString_GET_SIZE(self) == 0)
4154 return PyInt_FromLong(0);
4155
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 e = p + PyUnicode_GET_SIZE(self);
4157 cased = 0;
4158 for (; p < e; p++) {
4159 register const Py_UNICODE ch = *p;
4160
4161 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4162 return PyInt_FromLong(0);
4163 else if (!cased && Py_UNICODE_ISUPPER(ch))
4164 cased = 1;
4165 }
4166 return PyInt_FromLong(cased);
4167}
4168
4169static char istitle__doc__[] =
4170"S.istitle() -> int\n\
4171\n\
4172Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4173may only follow uncased characters and lowercase characters only cased\n\
4174ones. Return 0 otherwise.";
4175
4176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004177unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178{
4179 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4180 register const Py_UNICODE *e;
4181 int cased, previous_is_cased;
4182
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 /* Shortcut for single character strings */
4184 if (PyUnicode_GET_SIZE(self) == 1)
4185 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4186 (Py_UNICODE_ISUPPER(*p) != 0));
4187
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004188 /* Special case for empty strings */
4189 if (PyString_GET_SIZE(self) == 0)
4190 return PyInt_FromLong(0);
4191
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 e = p + PyUnicode_GET_SIZE(self);
4193 cased = 0;
4194 previous_is_cased = 0;
4195 for (; p < e; p++) {
4196 register const Py_UNICODE ch = *p;
4197
4198 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4199 if (previous_is_cased)
4200 return PyInt_FromLong(0);
4201 previous_is_cased = 1;
4202 cased = 1;
4203 }
4204 else if (Py_UNICODE_ISLOWER(ch)) {
4205 if (!previous_is_cased)
4206 return PyInt_FromLong(0);
4207 previous_is_cased = 1;
4208 cased = 1;
4209 }
4210 else
4211 previous_is_cased = 0;
4212 }
4213 return PyInt_FromLong(cased);
4214}
4215
4216static char isspace__doc__[] =
4217"S.isspace() -> int\n\
4218\n\
4219Return 1 if there are only whitespace characters in S,\n\
42200 otherwise.";
4221
4222static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004223unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224{
4225 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4226 register const Py_UNICODE *e;
4227
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 /* Shortcut for single character strings */
4229 if (PyUnicode_GET_SIZE(self) == 1 &&
4230 Py_UNICODE_ISSPACE(*p))
4231 return PyInt_FromLong(1);
4232
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004233 /* Special case for empty strings */
4234 if (PyString_GET_SIZE(self) == 0)
4235 return PyInt_FromLong(0);
4236
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 e = p + PyUnicode_GET_SIZE(self);
4238 for (; p < e; p++) {
4239 if (!Py_UNICODE_ISSPACE(*p))
4240 return PyInt_FromLong(0);
4241 }
4242 return PyInt_FromLong(1);
4243}
4244
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004245static char isalpha__doc__[] =
4246"S.isalpha() -> int\n\
4247\n\
4248Return 1 if all characters in S are alphabetic\n\
4249and there is at least one character in S, 0 otherwise.";
4250
4251static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004252unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004253{
4254 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4255 register const Py_UNICODE *e;
4256
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004257 /* Shortcut for single character strings */
4258 if (PyUnicode_GET_SIZE(self) == 1 &&
4259 Py_UNICODE_ISALPHA(*p))
4260 return PyInt_FromLong(1);
4261
4262 /* Special case for empty strings */
4263 if (PyString_GET_SIZE(self) == 0)
4264 return PyInt_FromLong(0);
4265
4266 e = p + PyUnicode_GET_SIZE(self);
4267 for (; p < e; p++) {
4268 if (!Py_UNICODE_ISALPHA(*p))
4269 return PyInt_FromLong(0);
4270 }
4271 return PyInt_FromLong(1);
4272}
4273
4274static char isalnum__doc__[] =
4275"S.isalnum() -> int\n\
4276\n\
4277Return 1 if all characters in S are alphanumeric\n\
4278and there is at least one character in S, 0 otherwise.";
4279
4280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004281unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004282{
4283 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4284 register const Py_UNICODE *e;
4285
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004286 /* Shortcut for single character strings */
4287 if (PyUnicode_GET_SIZE(self) == 1 &&
4288 Py_UNICODE_ISALNUM(*p))
4289 return PyInt_FromLong(1);
4290
4291 /* Special case for empty strings */
4292 if (PyString_GET_SIZE(self) == 0)
4293 return PyInt_FromLong(0);
4294
4295 e = p + PyUnicode_GET_SIZE(self);
4296 for (; p < e; p++) {
4297 if (!Py_UNICODE_ISALNUM(*p))
4298 return PyInt_FromLong(0);
4299 }
4300 return PyInt_FromLong(1);
4301}
4302
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303static char isdecimal__doc__[] =
4304"S.isdecimal() -> int\n\
4305\n\
4306Return 1 if there are only decimal characters in S,\n\
43070 otherwise.";
4308
4309static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004310unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311{
4312 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4313 register const Py_UNICODE *e;
4314
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 /* Shortcut for single character strings */
4316 if (PyUnicode_GET_SIZE(self) == 1 &&
4317 Py_UNICODE_ISDECIMAL(*p))
4318 return PyInt_FromLong(1);
4319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004320 /* Special case for empty strings */
4321 if (PyString_GET_SIZE(self) == 0)
4322 return PyInt_FromLong(0);
4323
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 e = p + PyUnicode_GET_SIZE(self);
4325 for (; p < e; p++) {
4326 if (!Py_UNICODE_ISDECIMAL(*p))
4327 return PyInt_FromLong(0);
4328 }
4329 return PyInt_FromLong(1);
4330}
4331
4332static char isdigit__doc__[] =
4333"S.isdigit() -> int\n\
4334\n\
4335Return 1 if there are only digit characters in S,\n\
43360 otherwise.";
4337
4338static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004339unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340{
4341 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4342 register const Py_UNICODE *e;
4343
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 /* Shortcut for single character strings */
4345 if (PyUnicode_GET_SIZE(self) == 1 &&
4346 Py_UNICODE_ISDIGIT(*p))
4347 return PyInt_FromLong(1);
4348
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004349 /* Special case for empty strings */
4350 if (PyString_GET_SIZE(self) == 0)
4351 return PyInt_FromLong(0);
4352
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 e = p + PyUnicode_GET_SIZE(self);
4354 for (; p < e; p++) {
4355 if (!Py_UNICODE_ISDIGIT(*p))
4356 return PyInt_FromLong(0);
4357 }
4358 return PyInt_FromLong(1);
4359}
4360
4361static char isnumeric__doc__[] =
4362"S.isnumeric() -> int\n\
4363\n\
4364Return 1 if there are only numeric characters in S,\n\
43650 otherwise.";
4366
4367static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004368unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369{
4370 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4371 register const Py_UNICODE *e;
4372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 /* Shortcut for single character strings */
4374 if (PyUnicode_GET_SIZE(self) == 1 &&
4375 Py_UNICODE_ISNUMERIC(*p))
4376 return PyInt_FromLong(1);
4377
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004378 /* Special case for empty strings */
4379 if (PyString_GET_SIZE(self) == 0)
4380 return PyInt_FromLong(0);
4381
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 e = p + PyUnicode_GET_SIZE(self);
4383 for (; p < e; p++) {
4384 if (!Py_UNICODE_ISNUMERIC(*p))
4385 return PyInt_FromLong(0);
4386 }
4387 return PyInt_FromLong(1);
4388}
4389
4390static char join__doc__[] =
4391"S.join(sequence) -> unicode\n\
4392\n\
4393Return a string which is the concatenation of the strings in the\n\
4394sequence. The separator between elements is S.";
4395
4396static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004397unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004399 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400}
4401
4402static int
4403unicode_length(PyUnicodeObject *self)
4404{
4405 return self->length;
4406}
4407
4408static char ljust__doc__[] =
4409"S.ljust(width) -> unicode\n\
4410\n\
4411Return S left justified in a Unicode string of length width. Padding is\n\
4412done using spaces.";
4413
4414static PyObject *
4415unicode_ljust(PyUnicodeObject *self, PyObject *args)
4416{
4417 int width;
4418 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4419 return NULL;
4420
Tim Peters7a29bd52001-09-12 03:03:31 +00004421 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 Py_INCREF(self);
4423 return (PyObject*) self;
4424 }
4425
4426 return (PyObject*) pad(self, 0, width - self->length, ' ');
4427}
4428
4429static char lower__doc__[] =
4430"S.lower() -> unicode\n\
4431\n\
4432Return a copy of the string S converted to lowercase.";
4433
4434static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004435unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 return fixup(self, fixlower);
4438}
4439
4440static char lstrip__doc__[] =
4441"S.lstrip() -> unicode\n\
4442\n\
4443Return a copy of the string S with leading whitespace removed.";
4444
4445static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004446unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 return strip(self, 1, 0);
4449}
4450
4451static PyObject*
4452unicode_repeat(PyUnicodeObject *str, int len)
4453{
4454 PyUnicodeObject *u;
4455 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004456 int nchars;
4457 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458
4459 if (len < 0)
4460 len = 0;
4461
Tim Peters7a29bd52001-09-12 03:03:31 +00004462 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 /* no repeat, return original string */
4464 Py_INCREF(str);
4465 return (PyObject*) str;
4466 }
Tim Peters8f422462000-09-09 06:13:41 +00004467
4468 /* ensure # of chars needed doesn't overflow int and # of bytes
4469 * needed doesn't overflow size_t
4470 */
4471 nchars = len * str->length;
4472 if (len && nchars / len != str->length) {
4473 PyErr_SetString(PyExc_OverflowError,
4474 "repeated string is too long");
4475 return NULL;
4476 }
4477 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4478 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4479 PyErr_SetString(PyExc_OverflowError,
4480 "repeated string is too long");
4481 return NULL;
4482 }
4483 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 if (!u)
4485 return NULL;
4486
4487 p = u->str;
4488
4489 while (len-- > 0) {
4490 Py_UNICODE_COPY(p, str->str, str->length);
4491 p += str->length;
4492 }
4493
4494 return (PyObject*) u;
4495}
4496
4497PyObject *PyUnicode_Replace(PyObject *obj,
4498 PyObject *subobj,
4499 PyObject *replobj,
4500 int maxcount)
4501{
4502 PyObject *self;
4503 PyObject *str1;
4504 PyObject *str2;
4505 PyObject *result;
4506
4507 self = PyUnicode_FromObject(obj);
4508 if (self == NULL)
4509 return NULL;
4510 str1 = PyUnicode_FromObject(subobj);
4511 if (str1 == NULL) {
4512 Py_DECREF(self);
4513 return NULL;
4514 }
4515 str2 = PyUnicode_FromObject(replobj);
4516 if (str2 == NULL) {
4517 Py_DECREF(self);
4518 Py_DECREF(str1);
4519 return NULL;
4520 }
4521 result = replace((PyUnicodeObject *)self,
4522 (PyUnicodeObject *)str1,
4523 (PyUnicodeObject *)str2,
4524 maxcount);
4525 Py_DECREF(self);
4526 Py_DECREF(str1);
4527 Py_DECREF(str2);
4528 return result;
4529}
4530
4531static char replace__doc__[] =
4532"S.replace (old, new[, maxsplit]) -> unicode\n\
4533\n\
4534Return a copy of S with all occurrences of substring\n\
4535old replaced by new. If the optional argument maxsplit is\n\
4536given, only the first maxsplit occurrences are replaced.";
4537
4538static PyObject*
4539unicode_replace(PyUnicodeObject *self, PyObject *args)
4540{
4541 PyUnicodeObject *str1;
4542 PyUnicodeObject *str2;
4543 int maxcount = -1;
4544 PyObject *result;
4545
4546 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4547 return NULL;
4548 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4549 if (str1 == NULL)
4550 return NULL;
4551 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4552 if (str2 == NULL)
4553 return NULL;
4554
4555 result = replace(self, str1, str2, maxcount);
4556
4557 Py_DECREF(str1);
4558 Py_DECREF(str2);
4559 return result;
4560}
4561
4562static
4563PyObject *unicode_repr(PyObject *unicode)
4564{
4565 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4566 PyUnicode_GET_SIZE(unicode),
4567 1);
4568}
4569
4570static char rfind__doc__[] =
4571"S.rfind(sub [,start [,end]]) -> int\n\
4572\n\
4573Return the highest index in S where substring sub is found,\n\
4574such that sub is contained within s[start,end]. Optional\n\
4575arguments start and end are interpreted as in slice notation.\n\
4576\n\
4577Return -1 on failure.";
4578
4579static PyObject *
4580unicode_rfind(PyUnicodeObject *self, PyObject *args)
4581{
4582 PyUnicodeObject *substring;
4583 int start = 0;
4584 int end = INT_MAX;
4585 PyObject *result;
4586
Guido van Rossumb8872e62000-05-09 14:14:27 +00004587 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4588 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 return NULL;
4590 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4591 (PyObject *)substring);
4592 if (substring == NULL)
4593 return NULL;
4594
4595 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4596
4597 Py_DECREF(substring);
4598 return result;
4599}
4600
4601static char rindex__doc__[] =
4602"S.rindex(sub [,start [,end]]) -> int\n\
4603\n\
4604Like S.rfind() but raise ValueError when the substring is not found.";
4605
4606static PyObject *
4607unicode_rindex(PyUnicodeObject *self, PyObject *args)
4608{
4609 int result;
4610 PyUnicodeObject *substring;
4611 int start = 0;
4612 int end = INT_MAX;
4613
Guido van Rossumb8872e62000-05-09 14:14:27 +00004614 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4615 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 return NULL;
4617 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4618 (PyObject *)substring);
4619 if (substring == NULL)
4620 return NULL;
4621
4622 result = findstring(self, substring, start, end, -1);
4623
4624 Py_DECREF(substring);
4625 if (result < 0) {
4626 PyErr_SetString(PyExc_ValueError, "substring not found");
4627 return NULL;
4628 }
4629 return PyInt_FromLong(result);
4630}
4631
4632static char rjust__doc__[] =
4633"S.rjust(width) -> unicode\n\
4634\n\
4635Return S right justified in a Unicode string of length width. Padding is\n\
4636done using spaces.";
4637
4638static PyObject *
4639unicode_rjust(PyUnicodeObject *self, PyObject *args)
4640{
4641 int width;
4642 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4643 return NULL;
4644
Tim Peters7a29bd52001-09-12 03:03:31 +00004645 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 Py_INCREF(self);
4647 return (PyObject*) self;
4648 }
4649
4650 return (PyObject*) pad(self, width - self->length, 0, ' ');
4651}
4652
4653static char rstrip__doc__[] =
4654"S.rstrip() -> unicode\n\
4655\n\
4656Return a copy of the string S with trailing whitespace removed.";
4657
4658static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004659unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 return strip(self, 0, 1);
4662}
4663
4664static PyObject*
4665unicode_slice(PyUnicodeObject *self, int start, int end)
4666{
4667 /* standard clamping */
4668 if (start < 0)
4669 start = 0;
4670 if (end < 0)
4671 end = 0;
4672 if (end > self->length)
4673 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004674 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 /* full slice, return original string */
4676 Py_INCREF(self);
4677 return (PyObject*) self;
4678 }
4679 if (start > end)
4680 start = end;
4681 /* copy slice */
4682 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4683 end - start);
4684}
4685
4686PyObject *PyUnicode_Split(PyObject *s,
4687 PyObject *sep,
4688 int maxsplit)
4689{
4690 PyObject *result;
4691
4692 s = PyUnicode_FromObject(s);
4693 if (s == NULL)
4694 return NULL;
4695 if (sep != NULL) {
4696 sep = PyUnicode_FromObject(sep);
4697 if (sep == NULL) {
4698 Py_DECREF(s);
4699 return NULL;
4700 }
4701 }
4702
4703 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4704
4705 Py_DECREF(s);
4706 Py_XDECREF(sep);
4707 return result;
4708}
4709
4710static char split__doc__[] =
4711"S.split([sep [,maxsplit]]) -> list of strings\n\
4712\n\
4713Return a list of the words in S, using sep as the\n\
4714delimiter string. If maxsplit is given, at most maxsplit\n\
4715splits are done. If sep is not specified, any whitespace string\n\
4716is a separator.";
4717
4718static PyObject*
4719unicode_split(PyUnicodeObject *self, PyObject *args)
4720{
4721 PyObject *substring = Py_None;
4722 int maxcount = -1;
4723
4724 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4725 return NULL;
4726
4727 if (substring == Py_None)
4728 return split(self, NULL, maxcount);
4729 else if (PyUnicode_Check(substring))
4730 return split(self, (PyUnicodeObject *)substring, maxcount);
4731 else
4732 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4733}
4734
4735static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004736"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737\n\
4738Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004739Line breaks are not included in the resulting list unless keepends\n\
4740is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741
4742static PyObject*
4743unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4744{
Guido van Rossum86662912000-04-11 15:38:46 +00004745 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
Guido van Rossum86662912000-04-11 15:38:46 +00004747 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 return NULL;
4749
Guido van Rossum86662912000-04-11 15:38:46 +00004750 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751}
4752
4753static
4754PyObject *unicode_str(PyUnicodeObject *self)
4755{
Fred Drakee4315f52000-05-09 19:53:39 +00004756 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757}
4758
4759static char strip__doc__[] =
4760"S.strip() -> unicode\n\
4761\n\
4762Return a copy of S with leading and trailing whitespace removed.";
4763
4764static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004765unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 return strip(self, 1, 1);
4768}
4769
4770static char swapcase__doc__[] =
4771"S.swapcase() -> unicode\n\
4772\n\
4773Return a copy of S with uppercase characters converted to lowercase\n\
4774and vice versa.";
4775
4776static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004777unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 return fixup(self, fixswapcase);
4780}
4781
4782static char translate__doc__[] =
4783"S.translate(table) -> unicode\n\
4784\n\
4785Return a copy of the string S, where all characters have been mapped\n\
4786through the given translation table, which must be a mapping of\n\
4787Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4788are left untouched. Characters mapped to None are deleted.";
4789
4790static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004791unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 return PyUnicode_TranslateCharmap(self->str,
4794 self->length,
4795 table,
4796 "ignore");
4797}
4798
4799static char upper__doc__[] =
4800"S.upper() -> unicode\n\
4801\n\
4802Return a copy of S converted to uppercase.";
4803
4804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004805unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 return fixup(self, fixupper);
4808}
4809
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill() (currently compiled out): left-pad self with '0'
   up to width characters.  If the original string started with '+' or
   '-', that sign is moved in front of the inserted zeros.  A string
   that is already wide enough is returned unchanged with a new
   reference. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* The padded copy is dereferenced below, so a failed pad() must be
       caught first (assumes pad() signals failure by returning NULL --
       TODO confirm). */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4845
#if 0
/* Helper (currently compiled out): report the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
4853
4854static char startswith__doc__[] =
4855"S.startswith(prefix[, start[, end]]) -> int\n\
4856\n\
4857Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4858optional start, test S beginning at that position. With optional end, stop\n\
4859comparing S at that position.";
4860
4861static PyObject *
4862unicode_startswith(PyUnicodeObject *self,
4863 PyObject *args)
4864{
4865 PyUnicodeObject *substring;
4866 int start = 0;
4867 int end = INT_MAX;
4868 PyObject *result;
4869
Guido van Rossumb8872e62000-05-09 14:14:27 +00004870 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4871 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 return NULL;
4873 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4874 (PyObject *)substring);
4875 if (substring == NULL)
4876 return NULL;
4877
4878 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4879
4880 Py_DECREF(substring);
4881 return result;
4882}
4883
4884
4885static char endswith__doc__[] =
4886"S.endswith(suffix[, start[, end]]) -> int\n\
4887\n\
4888Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4889optional start, test S beginning at that position. With optional end, stop\n\
4890comparing S at that position.";
4891
4892static PyObject *
4893unicode_endswith(PyUnicodeObject *self,
4894 PyObject *args)
4895{
4896 PyUnicodeObject *substring;
4897 int start = 0;
4898 int end = INT_MAX;
4899 PyObject *result;
4900
Guido van Rossumb8872e62000-05-09 14:14:27 +00004901 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4902 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 return NULL;
4904 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4905 (PyObject *)substring);
4906 if (substring == NULL)
4907 return NULL;
4908
4909 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4910
4911 Py_DECREF(substring);
4912 return result;
4913}
4914
4915
4916static PyMethodDef unicode_methods[] = {
4917
4918 /* Order is according to common usage: often used methods should
4919 appear first, since lookup is done sequentially. */
4920
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004921 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4922 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4923 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4924 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4925 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4926 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4927 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4928 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4929 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4930 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4931 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4932 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4933 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4934 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4935/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4936 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4937 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4938 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4939 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4940 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4941 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4942 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4943 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4944 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4945 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4946 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4947 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4948 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4949 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4950 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4951 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4952 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4953 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4954 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4955 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004957 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4958 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959#endif
4960
4961#if 0
4962 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004963 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964#endif
4965
4966 {NULL, NULL}
4967};
4968
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969static PySequenceMethods unicode_as_sequence = {
4970 (inquiry) unicode_length, /* sq_length */
4971 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4972 (intargfunc) unicode_repeat, /* sq_repeat */
4973 (intargfunc) unicode_getitem, /* sq_item */
4974 (intintargfunc) unicode_slice, /* sq_slice */
4975 0, /* sq_ass_item */
4976 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004977 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978};
4979
4980static int
4981unicode_buffer_getreadbuf(PyUnicodeObject *self,
4982 int index,
4983 const void **ptr)
4984{
4985 if (index != 0) {
4986 PyErr_SetString(PyExc_SystemError,
4987 "accessing non-existent unicode segment");
4988 return -1;
4989 }
4990 *ptr = (void *) self->str;
4991 return PyUnicode_GET_DATA_SIZE(self);
4992}
4993
4994static int
4995unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4996 const void **ptr)
4997{
4998 PyErr_SetString(PyExc_TypeError,
4999 "cannot use unicode as modifyable buffer");
5000 return -1;
5001}
5002
5003static int
5004unicode_buffer_getsegcount(PyUnicodeObject *self,
5005 int *lenp)
5006{
5007 if (lenp)
5008 *lenp = PyUnicode_GET_DATA_SIZE(self);
5009 return 1;
5010}
5011
5012static int
5013unicode_buffer_getcharbuf(PyUnicodeObject *self,
5014 int index,
5015 const void **ptr)
5016{
5017 PyObject *str;
5018
5019 if (index != 0) {
5020 PyErr_SetString(PyExc_SystemError,
5021 "accessing non-existent unicode segment");
5022 return -1;
5023 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005024 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 if (str == NULL)
5026 return -1;
5027 *ptr = (void *) PyString_AS_STRING(str);
5028 return PyString_GET_SIZE(str);
5029}
5030
5031/* Helpers for PyUnicode_Format() */
5032
5033static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005034getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035{
5036 int argidx = *p_argidx;
5037 if (argidx < arglen) {
5038 (*p_argidx)++;
5039 if (arglen < 0)
5040 return args;
5041 else
5042 return PyTuple_GetItem(args, argidx);
5043 }
5044 PyErr_SetString(PyExc_TypeError,
5045 "not enough arguments for format string");
5046 return NULL;
5047}
5048
/* Bit flags collected from the flag characters of a format spec. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5054
5055static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057{
5058 register int i;
5059 int len;
5060 va_list va;
5061 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063
5064 /* First, format the string as char array, then expand to Py_UNICODE
5065 array. */
5066 charbuffer = (char *)buffer;
5067 len = vsprintf(charbuffer, format, va);
5068 for (i = len - 1; i >= 0; i--)
5069 buffer[i] = (Py_UNICODE) charbuffer[i];
5070
5071 va_end(va);
5072 return len;
5073}
5074
5075static int
5076formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005077 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 int flags,
5079 int prec,
5080 int type,
5081 PyObject *v)
5082{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005083 /* fmt = '%#.' + `prec` + `type`
5084 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 char fmt[20];
5086 double x;
5087
5088 x = PyFloat_AsDouble(v);
5089 if (x == -1.0 && PyErr_Occurred())
5090 return -1;
5091 if (prec < 0)
5092 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5094 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005095 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5096 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005097 /* worst case length calc to ensure no buffer overrun:
5098 fmt = %#.<prec>g
5099 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5100 for any double rep.)
5101 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5102 If prec=0 the effective precision is 1 (the leading digit is
5103 always given), therefore increase by one to 10+prec. */
5104 if (buflen <= (size_t)10 + (size_t)prec) {
5105 PyErr_SetString(PyExc_OverflowError,
5106 "formatted float is too long (precision too long?)");
5107 return -1;
5108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 return usprintf(buf, fmt, x);
5110}
5111
Tim Peters38fd5b62000-09-21 05:43:11 +00005112static PyObject*
5113formatlong(PyObject *val, int flags, int prec, int type)
5114{
5115 char *buf;
5116 int i, len;
5117 PyObject *str; /* temporary string object. */
5118 PyUnicodeObject *result;
5119
5120 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5121 if (!str)
5122 return NULL;
5123 result = _PyUnicode_New(len);
5124 for (i = 0; i < len; i++)
5125 result->str[i] = buf[i];
5126 result->str[len] = 0;
5127 Py_DECREF(str);
5128 return (PyObject*)result;
5129}
5130
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131static int
5132formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005133 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 int flags,
5135 int prec,
5136 int type,
5137 PyObject *v)
5138{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005139 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005140 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5141 + 1 + 1 = 24*/
5142 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005144 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145
5146 x = PyInt_AsLong(v);
5147 if (x == -1 && PyErr_Occurred())
5148 return -1;
5149 if (prec < 0)
5150 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005151 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5152 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5153 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5154 PyErr_SetString(PyExc_OverflowError,
5155 "formatted integer is too long (precision too long?)");
5156 return -1;
5157 }
Tim Petersfff53252001-04-12 18:38:48 +00005158 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5159 * but we want it (for consistency with other %#x conversions, and
5160 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005161 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5162 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5163 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005164 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005165 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5166 /* Only way to know what the platform does is to try it. */
Barry Warsawe5c492d2001-11-28 21:00:41 +00005167 PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005168 if (fmt[1] != (char)type) {
5169 /* Supply our own leading 0x/0X -- needed under std C */
5170 use_native_c_format = 0;
Barry Warsawe5c492d2001-11-28 21:00:41 +00005171 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005172 }
5173 }
5174 if (use_native_c_format)
Barry Warsawe5c492d2001-11-28 21:00:41 +00005175 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5176 (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 return usprintf(buf, fmt, x);
5178}
5179
5180static int
5181formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005182 size_t buflen,
5183 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005185 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005186 if (PyUnicode_Check(v)) {
5187 if (PyUnicode_GET_SIZE(v) != 1)
5188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005192 else if (PyString_Check(v)) {
5193 if (PyString_GET_SIZE(v) != 1)
5194 goto onError;
5195 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198 else {
5199 /* Integer input truncated to a character */
5200 long x;
5201 x = PyInt_AsLong(v);
5202 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005203 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 buf[0] = (char) x;
5205 }
5206 buf[1] = '\0';
5207 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005208
5209 onError:
5210 PyErr_SetString(PyExc_TypeError,
5211 "%c requires int or char");
5212 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213}
5214
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. under sizeof or next to a
   postfix operator).
*/
#define FORMATBUFLEN ((size_t)120)
5224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225PyObject *PyUnicode_Format(PyObject *format,
5226 PyObject *args)
5227{
5228 Py_UNICODE *fmt, *res;
5229 int fmtcnt, rescnt, reslen, arglen, argidx;
5230 int args_owned = 0;
5231 PyUnicodeObject *result = NULL;
5232 PyObject *dict = NULL;
5233 PyObject *uformat;
5234
5235 if (format == NULL || args == NULL) {
5236 PyErr_BadInternalCall();
5237 return NULL;
5238 }
5239 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005240 if (uformat == NULL)
5241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 fmt = PyUnicode_AS_UNICODE(uformat);
5243 fmtcnt = PyUnicode_GET_SIZE(uformat);
5244
5245 reslen = rescnt = fmtcnt + 100;
5246 result = _PyUnicode_New(reslen);
5247 if (result == NULL)
5248 goto onError;
5249 res = PyUnicode_AS_UNICODE(result);
5250
5251 if (PyTuple_Check(args)) {
5252 arglen = PyTuple_Size(args);
5253 argidx = 0;
5254 }
5255 else {
5256 arglen = -1;
5257 argidx = -2;
5258 }
5259 if (args->ob_type->tp_as_mapping)
5260 dict = args;
5261
5262 while (--fmtcnt >= 0) {
5263 if (*fmt != '%') {
5264 if (--rescnt < 0) {
5265 rescnt = fmtcnt + 100;
5266 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005267 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 return NULL;
5269 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5270 --rescnt;
5271 }
5272 *res++ = *fmt++;
5273 }
5274 else {
5275 /* Got a format specifier */
5276 int flags = 0;
5277 int width = -1;
5278 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 Py_UNICODE c = '\0';
5280 Py_UNICODE fill;
5281 PyObject *v = NULL;
5282 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005283 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 Py_UNICODE sign;
5285 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005286 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287
5288 fmt++;
5289 if (*fmt == '(') {
5290 Py_UNICODE *keystart;
5291 int keylen;
5292 PyObject *key;
5293 int pcount = 1;
5294
5295 if (dict == NULL) {
5296 PyErr_SetString(PyExc_TypeError,
5297 "format requires a mapping");
5298 goto onError;
5299 }
5300 ++fmt;
5301 --fmtcnt;
5302 keystart = fmt;
5303 /* Skip over balanced parentheses */
5304 while (pcount > 0 && --fmtcnt >= 0) {
5305 if (*fmt == ')')
5306 --pcount;
5307 else if (*fmt == '(')
5308 ++pcount;
5309 fmt++;
5310 }
5311 keylen = fmt - keystart - 1;
5312 if (fmtcnt < 0 || pcount > 0) {
5313 PyErr_SetString(PyExc_ValueError,
5314 "incomplete format key");
5315 goto onError;
5316 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005317#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005318 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 then looked up since Python uses strings to hold
5320 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005321 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 key = PyUnicode_EncodeUTF8(keystart,
5323 keylen,
5324 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005325#else
5326 key = PyUnicode_FromUnicode(keystart, keylen);
5327#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 if (key == NULL)
5329 goto onError;
5330 if (args_owned) {
5331 Py_DECREF(args);
5332 args_owned = 0;
5333 }
5334 args = PyObject_GetItem(dict, key);
5335 Py_DECREF(key);
5336 if (args == NULL) {
5337 goto onError;
5338 }
5339 args_owned = 1;
5340 arglen = -1;
5341 argidx = -2;
5342 }
5343 while (--fmtcnt >= 0) {
5344 switch (c = *fmt++) {
5345 case '-': flags |= F_LJUST; continue;
5346 case '+': flags |= F_SIGN; continue;
5347 case ' ': flags |= F_BLANK; continue;
5348 case '#': flags |= F_ALT; continue;
5349 case '0': flags |= F_ZERO; continue;
5350 }
5351 break;
5352 }
5353 if (c == '*') {
5354 v = getnextarg(args, arglen, &argidx);
5355 if (v == NULL)
5356 goto onError;
5357 if (!PyInt_Check(v)) {
5358 PyErr_SetString(PyExc_TypeError,
5359 "* wants int");
5360 goto onError;
5361 }
5362 width = PyInt_AsLong(v);
5363 if (width < 0) {
5364 flags |= F_LJUST;
5365 width = -width;
5366 }
5367 if (--fmtcnt >= 0)
5368 c = *fmt++;
5369 }
5370 else if (c >= '0' && c <= '9') {
5371 width = c - '0';
5372 while (--fmtcnt >= 0) {
5373 c = *fmt++;
5374 if (c < '0' || c > '9')
5375 break;
5376 if ((width*10) / 10 != width) {
5377 PyErr_SetString(PyExc_ValueError,
5378 "width too big");
5379 goto onError;
5380 }
5381 width = width*10 + (c - '0');
5382 }
5383 }
5384 if (c == '.') {
5385 prec = 0;
5386 if (--fmtcnt >= 0)
5387 c = *fmt++;
5388 if (c == '*') {
5389 v = getnextarg(args, arglen, &argidx);
5390 if (v == NULL)
5391 goto onError;
5392 if (!PyInt_Check(v)) {
5393 PyErr_SetString(PyExc_TypeError,
5394 "* wants int");
5395 goto onError;
5396 }
5397 prec = PyInt_AsLong(v);
5398 if (prec < 0)
5399 prec = 0;
5400 if (--fmtcnt >= 0)
5401 c = *fmt++;
5402 }
5403 else if (c >= '0' && c <= '9') {
5404 prec = c - '0';
5405 while (--fmtcnt >= 0) {
5406 c = Py_CHARMASK(*fmt++);
5407 if (c < '0' || c > '9')
5408 break;
5409 if ((prec*10) / 10 != prec) {
5410 PyErr_SetString(PyExc_ValueError,
5411 "prec too big");
5412 goto onError;
5413 }
5414 prec = prec*10 + (c - '0');
5415 }
5416 }
5417 } /* prec */
5418 if (fmtcnt >= 0) {
5419 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 if (--fmtcnt >= 0)
5421 c = *fmt++;
5422 }
5423 }
5424 if (fmtcnt < 0) {
5425 PyErr_SetString(PyExc_ValueError,
5426 "incomplete format");
5427 goto onError;
5428 }
5429 if (c != '%') {
5430 v = getnextarg(args, arglen, &argidx);
5431 if (v == NULL)
5432 goto onError;
5433 }
5434 sign = 0;
5435 fill = ' ';
5436 switch (c) {
5437
5438 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005439 pbuf = formatbuf;
5440 /* presume that buffer length is at least 1 */
5441 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 len = 1;
5443 break;
5444
5445 case 's':
5446 case 'r':
5447 if (PyUnicode_Check(v) && c == 's') {
5448 temp = v;
5449 Py_INCREF(temp);
5450 }
5451 else {
5452 PyObject *unicode;
5453 if (c == 's')
5454 temp = PyObject_Str(v);
5455 else
5456 temp = PyObject_Repr(v);
5457 if (temp == NULL)
5458 goto onError;
5459 if (!PyString_Check(temp)) {
5460 /* XXX Note: this should never happen, since
5461 PyObject_Repr() and PyObject_Str() assure
5462 this */
5463 Py_DECREF(temp);
5464 PyErr_SetString(PyExc_TypeError,
5465 "%s argument has non-string str()");
5466 goto onError;
5467 }
Fred Drakee4315f52000-05-09 19:53:39 +00005468 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005470 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 "strict");
5472 Py_DECREF(temp);
5473 temp = unicode;
5474 if (temp == NULL)
5475 goto onError;
5476 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005477 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 len = PyUnicode_GET_SIZE(temp);
5479 if (prec >= 0 && len > prec)
5480 len = prec;
5481 break;
5482
5483 case 'i':
5484 case 'd':
5485 case 'u':
5486 case 'o':
5487 case 'x':
5488 case 'X':
5489 if (c == 'i')
5490 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005491 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005492 temp = formatlong(v, flags, prec, c);
5493 if (!temp)
5494 goto onError;
5495 pbuf = PyUnicode_AS_UNICODE(temp);
5496 len = PyUnicode_GET_SIZE(temp);
5497 /* unbounded ints can always produce
5498 a sign character! */
5499 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005501 else {
5502 pbuf = formatbuf;
5503 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5504 flags, prec, c, v);
5505 if (len < 0)
5506 goto onError;
5507 /* only d conversion is signed */
5508 sign = c == 'd';
5509 }
5510 if (flags & F_ZERO)
5511 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 break;
5513
5514 case 'e':
5515 case 'E':
5516 case 'f':
5517 case 'g':
5518 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005519 pbuf = formatbuf;
5520 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5521 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 if (len < 0)
5523 goto onError;
5524 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005525 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 fill = '0';
5527 break;
5528
5529 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005530 pbuf = formatbuf;
5531 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 if (len < 0)
5533 goto onError;
5534 break;
5535
5536 default:
5537 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005538 "unsupported format character '%c' (0x%x) "
5539 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005540 (31<=c && c<=126) ? c : '?',
5541 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 goto onError;
5543 }
5544 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005545 if (*pbuf == '-' || *pbuf == '+') {
5546 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 len--;
5548 }
5549 else if (flags & F_SIGN)
5550 sign = '+';
5551 else if (flags & F_BLANK)
5552 sign = ' ';
5553 else
5554 sign = 0;
5555 }
5556 if (width < len)
5557 width = len;
5558 if (rescnt < width + (sign != 0)) {
5559 reslen -= rescnt;
5560 rescnt = width + fmtcnt + 100;
5561 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005562 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 return NULL;
5564 res = PyUnicode_AS_UNICODE(result)
5565 + reslen - rescnt;
5566 }
5567 if (sign) {
5568 if (fill != ' ')
5569 *res++ = sign;
5570 rescnt--;
5571 if (width > len)
5572 width--;
5573 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005574 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5575 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005576 assert(pbuf[1] == c);
5577 if (fill != ' ') {
5578 *res++ = *pbuf++;
5579 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005580 }
Tim Petersfff53252001-04-12 18:38:48 +00005581 rescnt -= 2;
5582 width -= 2;
5583 if (width < 0)
5584 width = 0;
5585 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 if (width > len && !(flags & F_LJUST)) {
5588 do {
5589 --rescnt;
5590 *res++ = fill;
5591 } while (--width > len);
5592 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005593 if (fill == ' ') {
5594 if (sign)
5595 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005596 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005597 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005598 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005599 *res++ = *pbuf++;
5600 *res++ = *pbuf++;
5601 }
5602 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005603 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 res += len;
5605 rescnt -= len;
5606 while (--width >= len) {
5607 --rescnt;
5608 *res++ = ' ';
5609 }
5610 if (dict && (argidx < arglen) && c != '%') {
5611 PyErr_SetString(PyExc_TypeError,
5612 "not all arguments converted");
5613 goto onError;
5614 }
5615 Py_XDECREF(temp);
5616 } /* '%' */
5617 } /* until end */
5618 if (argidx < arglen && !dict) {
5619 PyErr_SetString(PyExc_TypeError,
5620 "not all arguments converted");
5621 goto onError;
5622 }
5623
5624 if (args_owned) {
5625 Py_DECREF(args);
5626 }
5627 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005628 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005629 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 return (PyObject *)result;
5631
5632 onError:
5633 Py_XDECREF(result);
5634 Py_DECREF(uformat);
5635 if (args_owned) {
5636 Py_DECREF(args);
5637 }
5638 return NULL;
5639}
5640
5641static PyBufferProcs unicode_as_buffer = {
5642 (getreadbufferproc) unicode_buffer_getreadbuf,
5643 (getwritebufferproc) unicode_buffer_getwritebuf,
5644 (getsegcountproc) unicode_buffer_getsegcount,
5645 (getcharbufferproc) unicode_buffer_getcharbuf,
5646};
5647
Guido van Rossume023fe02001-08-30 03:12:59 +00005648staticforward PyObject *
5649unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5650
Tim Peters6d6c1a32001-08-02 04:15:00 +00005651static PyObject *
5652unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5653{
5654 PyObject *x = NULL;
5655 static char *kwlist[] = {"string", "encoding", "errors", 0};
5656 char *encoding = NULL;
5657 char *errors = NULL;
5658
Guido van Rossume023fe02001-08-30 03:12:59 +00005659 if (type != &PyUnicode_Type)
5660 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005661 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5662 kwlist, &x, &encoding, &errors))
5663 return NULL;
5664 if (x == NULL)
5665 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005666 if (encoding == NULL && errors == NULL)
5667 return PyObject_Unicode(x);
5668 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005669 return PyUnicode_FromEncodedObject(x, encoding, errors);
5670}
5671
Guido van Rossume023fe02001-08-30 03:12:59 +00005672static PyObject *
5673unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5674{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005675 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005676 int n;
5677
5678 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5679 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5680 if (tmp == NULL)
5681 return NULL;
5682 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005683 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5684 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005685 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005686 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5687 if (pnew->str == NULL) {
5688 _Py_ForgetReference((PyObject *)pnew);
5689 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005690 return NULL;
5691 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005692 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5693 pnew->length = n;
5694 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005695 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005696 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005697}
5698
/* Docstring of the unicode type (tp_doc). */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n"
"\n"
"Create a new Unicode object from the given encoded string.\n"
"encoding defaults to the current default string encoding and \n"
"errors, defining the error handling, to 'strict'.";
5705
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706PyTypeObject PyUnicode_Type = {
5707 PyObject_HEAD_INIT(&PyType_Type)
5708 0, /* ob_size */
5709 "unicode", /* tp_name */
5710 sizeof(PyUnicodeObject), /* tp_size */
5711 0, /* tp_itemsize */
5712 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005713 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005715 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 0, /* tp_setattr */
5717 (cmpfunc) unicode_compare, /* tp_compare */
5718 (reprfunc) unicode_repr, /* tp_repr */
5719 0, /* tp_as_number */
5720 &unicode_as_sequence, /* tp_as_sequence */
5721 0, /* tp_as_mapping */
5722 (hashfunc) unicode_hash, /* tp_hash*/
5723 0, /* tp_call*/
5724 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005725 PyObject_GenericGetAttr, /* tp_getattro */
5726 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005728 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005729 unicode_doc, /* tp_doc */
5730 0, /* tp_traverse */
5731 0, /* tp_clear */
5732 0, /* tp_richcompare */
5733 0, /* tp_weaklistoffset */
5734 0, /* tp_iter */
5735 0, /* tp_iternext */
5736 unicode_methods, /* tp_methods */
5737 0, /* tp_members */
5738 0, /* tp_getset */
5739 0, /* tp_base */
5740 0, /* tp_dict */
5741 0, /* tp_descr_get */
5742 0, /* tp_descr_set */
5743 0, /* tp_dictoffset */
5744 0, /* tp_init */
5745 0, /* tp_alloc */
5746 unicode_new, /* tp_new */
Guido van Rossum9475a232001-10-05 20:51:39 +00005747 _PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748};
5749
5750/* Initialize the Unicode implementation */
5751
Thomas Wouters78890102000-07-22 19:25:51 +00005752void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005754 int i;
5755
Fred Drakee4315f52000-05-09 19:53:39 +00005756 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005757 unicode_freelist = NULL;
5758 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005760 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005761 for (i = 0; i < 256; i++)
5762 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763}
5764
5765/* Finalize the Unicode implementation */
5766
5767void
Thomas Wouters78890102000-07-22 19:25:51 +00005768_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005770 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005771 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005773 Py_XDECREF(unicode_empty);
5774 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005775
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005776 for (i = 0; i < 256; i++) {
5777 if (unicode_latin1[i]) {
5778 Py_DECREF(unicode_latin1[i]);
5779 unicode_latin1[i] = NULL;
5780 }
5781 }
5782
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005783 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 PyUnicodeObject *v = u;
5785 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005786 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005787 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005788 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005789 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005791 unicode_freelist = NULL;
5792 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793}