blob: 96cc5f475f31e82b60f0000db466308061b849c8 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "stay alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its length is below this limit; longer
   buffers are released right away.  This saves malloc() calls for
   small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This feature is experimental.  If Unicode objects cause core
   dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless
   WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauerdcc819a2002-03-22 15:33:15 +0000204 unicode = PyMalloc_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauerdcc819a2002-03-22 15:33:15 +0000222 PyMalloc_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal shorthand for PyUnicode_Resize() which casts the address of
   the object variable to PyObject **.  For use in unicodeobject.c
   only ! */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC 2152 for details. */
647
/* Classification of the 7-bit characters for the UTF-7 codec. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True if character c has to be encoded in a shift sequence: c is
   outside the 7-bit range or marked special in utf7_special, or it is
   optional whitespace / a Set O character and the corresponding flag
   is set. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  isalnum() is only
   defined for values representable as unsigned char (and EOF), and
   the base64 alphabet is pure ASCII, so values of 128 and above are
   rejected before isalnum() is consulted. */
#define B64CHAR(c) (((c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')

/* Map a base64 character (see B64CHAR) back to its 6-bit value. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for as long as at least 6 of the bits
   pending bits in ch are available; out and bits are updated. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least 16 of the bits
   pending bits in ch are available.  Expects the variable errmsg and
   the label utf7Error in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high */ \
            /* surrogate so let's not bother seeing if the low */ \
            /* surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't */ \
            /* represent it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence
   it introduces.  Zero means the byte is not a legal lead byte (it is
   either a continuation byte 0x80..0xBF or one of 0xFE/0xFF).  See
   RFC 2279 for details.

   The table is read-only, hence const.  Lengths 5 and 6 are listed so
   the decoder can report them as an unsupported range rather than as a
   stray byte. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six,
       0xFE..0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Disabled: the UTF-8 encoder handles UTF-16 surrogates itself and no
   longer reports errors through this helper. */
#if 0
/* Apply the error policy `errors` to a UTF-8 encoding problem.
   Returns 0 to continue ("ignore", or "replace" which writes a '?' to
   *dest) and -1 with an exception set otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1178{
1179 PyObject *v;
1180 char *p;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001181 unsigned int cbAllocated = 2 * size;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 unsigned int cbWritten = 0;
1183 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001185 /* Short-cut for emtpy strings */
1186 if (size == 0)
1187 return PyString_FromStringAndSize(NULL, 0);
1188
1189 /* We allocate 4 more bytes to have room for at least one full
1190 UTF-8 sequence; saves a few cycles in the loop below */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001191 v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if (v == NULL)
1193 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001195 p = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 while (i < size) {
1197 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001198
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001199 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001201 cbWritten++;
1202 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 else if (ch < 0x0800) {
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001205 *p++ = (char)(0xc0 | (ch >> 6));
1206 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001207 cbWritten += 2;
1208 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001209
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001210 else {
1211
1212 /* Assure that we have enough room for high order Unicode
1213 ordinals */
1214 if (cbWritten >= cbAllocated) {
1215 cbAllocated += 4 * 10;
1216 if (_PyString_Resize(&v, cbAllocated + 4))
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001217 goto onError;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001218 p = PyString_AS_STRING(v) + cbWritten;
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001219 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001221 if (ch < 0x10000) {
1222 /* Check for high surrogate */
1223 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1224 Py_UCS4 ch2 = s[i];
1225 /* Check for low surrogate */
1226 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001227 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001228 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001229 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001230 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1231 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 i++;
1233 cbWritten += 4;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001234 continue;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001235 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001236 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001238 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001239 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1240 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001241 cbWritten += 3;
1242
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001243 } else {
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001244 *p++ = (char)(0xf0 | (ch>>18));
1245 *p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1246 *p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1247 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001248 cbWritten += 4;
1249 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 *p = '\0';
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001253 if (_PyString_Resize(&v, cbWritten))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return v;
1256
1257 onError:
1258 Py_DECREF(v);
1259 return NULL;
1260}
1261
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 if (!PyUnicode_Check(unicode)) {
1265 PyErr_BadArgument();
1266 return NULL;
1267 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001268 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1269 PyUnicode_GET_SIZE(unicode),
1270 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271}
1272
1273/* --- UTF-16 Codec ------------------------------------------------------- */
1274
1275static
Tim Peters772747b2001-08-09 22:21:55 +00001276int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 const char *errors,
1278 const char *details)
1279{
1280 if ((errors == NULL) ||
1281 (strcmp(errors,"strict") == 0)) {
1282 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001283 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 details);
1285 return -1;
1286 }
1287 else if (strcmp(errors,"ignore") == 0) {
1288 return 0;
1289 }
1290 else if (strcmp(errors,"replace") == 0) {
1291 if (dest) {
1292 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1293 (*dest)++;
1294 }
1295 return 0;
1296 }
1297 else {
1298 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001299 "UTF-16 decoding error; "
1300 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301 errors);
1302 return -1;
1303 }
1304}
1305
Tim Peters772747b2001-08-09 22:21:55 +00001306PyObject *
1307PyUnicode_DecodeUTF16(const char *s,
1308 int size,
1309 const char *errors,
1310 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311{
1312 PyUnicodeObject *unicode;
1313 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001314 const unsigned char *q, *e;
1315 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001317 /* Offsets from q for retrieving byte pairs in the right order. */
1318#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1319 int ihi = 1, ilo = 0;
1320#else
1321 int ihi = 0, ilo = 1;
1322#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323
1324 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001325 if (size & 1) {
1326 if (utf16_decoding_error(NULL, errors, "truncated data"))
1327 return NULL;
1328 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 }
1330
1331 /* Note: size will always be longer than the resulting Unicode
1332 character count */
1333 unicode = _PyUnicode_New(size);
1334 if (!unicode)
1335 return NULL;
1336 if (size == 0)
1337 return (PyObject *)unicode;
1338
1339 /* Unpack UTF-16 encoded data */
1340 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001341 q = (unsigned char *)s;
1342 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343
1344 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001345 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001347 /* Check for BOM marks (U+FEFF) in the input and adjust current
1348 byte order setting accordingly. In native mode, the leading BOM
1349 mark is skipped, in all other modes, it is copied to the output
1350 stream as-is (giving a ZWNBSP character). */
1351 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001352 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001353#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001354 if (bom == 0xFEFF) {
1355 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001356 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001357 }
1358 else if (bom == 0xFFFE) {
1359 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001360 bo = 1;
1361 }
1362#else
Tim Peters772747b2001-08-09 22:21:55 +00001363 if (bom == 0xFEFF) {
1364 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001365 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001366 }
1367 else if (bom == 0xFFFE) {
1368 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001369 bo = -1;
1370 }
1371#endif
1372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373
Tim Peters772747b2001-08-09 22:21:55 +00001374 if (bo == -1) {
1375 /* force LE */
1376 ihi = 1;
1377 ilo = 0;
1378 }
1379 else if (bo == 1) {
1380 /* force BE */
1381 ihi = 0;
1382 ilo = 1;
1383 }
1384
1385 while (q < e) {
1386 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1387 q += 2;
1388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (ch < 0xD800 || ch > 0xDFFF) {
1390 *p++ = ch;
1391 continue;
1392 }
1393
1394 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001395 if (q >= e) {
1396 errmsg = "unexpected end of data";
1397 goto utf16Error;
1398 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001399 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001400 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1401 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001402 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001403#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001404 *p++ = ch;
1405 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001406#else
1407 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001409 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001410 }
1411 else {
1412 errmsg = "illegal UTF-16 surrogate";
1413 goto utf16Error;
1414 }
1415
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001417 errmsg = "illegal encoding";
1418 /* Fall through to report the error */
1419
1420 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001421 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423 }
1424
1425 if (byteorder)
1426 *byteorder = bo;
1427
1428 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001429 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 goto onError;
1431
1432 return (PyObject *)unicode;
1433
1434onError:
1435 Py_DECREF(unicode);
1436 return NULL;
1437}
1438
Tim Peters772747b2001-08-09 22:21:55 +00001439PyObject *
1440PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1441 int size,
1442 const char *errors,
1443 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444{
1445 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001446 unsigned char *p;
1447 int i, pairs;
1448 /* Offsets from p for storing byte pairs in the right order. */
1449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1450 int ihi = 1, ilo = 0;
1451#else
1452 int ihi = 0, ilo = 1;
1453#endif
1454
1455#define STORECHAR(CH) \
1456 do { \
1457 p[ihi] = ((CH) >> 8) & 0xff; \
1458 p[ilo] = (CH) & 0xff; \
1459 p += 2; \
1460 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001462 for (i = pairs = 0; i < size; i++)
1463 if (s[i] >= 0x10000)
1464 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001466 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 if (v == NULL)
1468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469
Tim Peters772747b2001-08-09 22:21:55 +00001470 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001471 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001472 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001473 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001474 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001475
1476 if (byteorder == -1) {
1477 /* force LE */
1478 ihi = 1;
1479 ilo = 0;
1480 }
1481 else if (byteorder == 1) {
1482 /* force BE */
1483 ihi = 0;
1484 ilo = 1;
1485 }
1486
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001487 while (size-- > 0) {
1488 Py_UNICODE ch = *s++;
1489 Py_UNICODE ch2 = 0;
1490 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001491 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1492 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 }
Tim Peters772747b2001-08-09 22:21:55 +00001494 STORECHAR(ch);
1495 if (ch2)
1496 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001499#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500}
1501
1502PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 return NULL;
1507 }
1508 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1509 PyUnicode_GET_SIZE(unicode),
1510 NULL,
1511 0);
1512}
1513
1514/* --- Unicode Escape Codec ----------------------------------------------- */
1515
1516static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001517int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 const char *errors,
1519 const char *details)
1520{
1521 if ((errors == NULL) ||
1522 (strcmp(errors,"strict") == 0)) {
1523 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001524 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 details);
1526 return -1;
1527 }
1528 else if (strcmp(errors,"ignore") == 0) {
1529 return 0;
1530 }
1531 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001532 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1533 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 return 0;
1535 }
1536 else {
1537 PyErr_Format(PyExc_ValueError,
1538 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001539 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 errors);
1541 return -1;
1542 }
1543}
1544
Fredrik Lundh06d12682001-01-24 07:59:11 +00001545static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001546
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1548 int size,
1549 const char *errors)
1550{
1551 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001552 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001554 char* message;
1555 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1556
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 /* Escaped strings will always be longer than the resulting
1558 Unicode string, so we start with size here and then reduce the
1559 length after conversion to the true value. */
1560 v = _PyUnicode_New(size);
1561 if (v == NULL)
1562 goto onError;
1563 if (size == 0)
1564 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 p = buf = PyUnicode_AS_UNICODE(v);
1567 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569 while (s < end) {
1570 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001571 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001572 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
1574 /* Non-escape characters are interpreted as Unicode ordinals */
1575 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001576 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 continue;
1578 }
1579
1580 /* \ - Escapes */
1581 s++;
1582 switch (*s++) {
1583
1584 /* \x escapes */
1585 case '\n': break;
1586 case '\\': *p++ = '\\'; break;
1587 case '\'': *p++ = '\''; break;
1588 case '\"': *p++ = '\"'; break;
1589 case 'b': *p++ = '\b'; break;
1590 case 'f': *p++ = '\014'; break; /* FF */
1591 case 't': *p++ = '\t'; break;
1592 case 'n': *p++ = '\n'; break;
1593 case 'r': *p++ = '\r'; break;
1594 case 'v': *p++ = '\013'; break; /* VT */
1595 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1596
1597 /* \OOO (octal) escapes */
1598 case '0': case '1': case '2': case '3':
1599 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001600 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001602 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001604 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001606 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607 break;
1608
Fredrik Lundhccc74732001-02-18 22:13:49 +00001609 /* hex escapes */
1610 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001612 digits = 2;
1613 message = "truncated \\xXX escape";
1614 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 digits = 4;
1619 message = "truncated \\uXXXX escape";
1620 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621
Fredrik Lundhccc74732001-02-18 22:13:49 +00001622 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001623 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001624 digits = 8;
1625 message = "truncated \\UXXXXXXXX escape";
1626 hexescape:
1627 chr = 0;
1628 for (i = 0; i < digits; i++) {
1629 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001630 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001631 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001632 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001633 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001634 i++;
1635 break;
1636 }
1637 chr = (chr<<4) & ~0xF;
1638 if (c >= '0' && c <= '9')
1639 chr += c - '0';
1640 else if (c >= 'a' && c <= 'f')
1641 chr += 10 + c - 'a';
1642 else
1643 chr += 10 + c - 'A';
1644 }
1645 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001646 if (chr == 0xffffffff)
1647 /* _decoding_error will have already written into the
1648 target buffer. */
1649 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001650 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001651 /* when we get here, chr is a 32-bit unicode character */
1652 if (chr <= 0xffff)
1653 /* UCS-2 character */
1654 *p++ = (Py_UNICODE) chr;
1655 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001656 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001657 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001659 *p++ = chr;
1660#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001661 chr -= 0x10000L;
1662 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001663 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001664#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001665 } else {
1666 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001667 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001669 )
1670 goto onError;
1671 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001672 break;
1673
1674 /* \N{name} */
1675 case 'N':
1676 message = "malformed \\N character escape";
1677 if (ucnhash_CAPI == NULL) {
1678 /* load the unicode data module */
1679 PyObject *m, *v;
1680 m = PyImport_ImportModule("unicodedata");
1681 if (m == NULL)
1682 goto ucnhashError;
1683 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1684 Py_DECREF(m);
1685 if (v == NULL)
1686 goto ucnhashError;
1687 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1688 Py_DECREF(v);
1689 if (ucnhash_CAPI == NULL)
1690 goto ucnhashError;
1691 }
1692 if (*s == '{') {
1693 const char *start = s+1;
1694 /* look for the closing brace */
1695 while (*s != '}' && s < end)
1696 s++;
1697 if (s > start && s < end && *s == '}') {
1698 /* found a name. look it up in the unicode database */
1699 message = "unknown Unicode character name";
1700 s++;
1701 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1702 goto store;
1703 }
1704 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001705 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001706 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001707 break;
1708
1709 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001710 if (s > end) {
1711 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1712 goto onError;
1713 }
1714 else {
1715 *p++ = '\\';
1716 *p++ = (unsigned char)s[-1];
1717 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001718 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 }
1720 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001721 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001724
Fredrik Lundhccc74732001-02-18 22:13:49 +00001725ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001726 PyErr_SetString(
1727 PyExc_UnicodeError,
1728 "\\N escapes not supported (can't load unicodedata module)"
1729 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001730 return NULL;
1731
Fredrik Lundhccc74732001-02-18 22:13:49 +00001732onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 Py_XDECREF(v);
1734 return NULL;
1735}
1736
1737/* Return a Unicode-Escape string version of the Unicode object.
1738
1739 If quotes is true, the string is enclosed in u"" or u'' quotes as
1740 appropriate.
1741
1742*/
1743
Barry Warsaw51ac5802000-03-20 16:36:48 +00001744static const Py_UNICODE *findchar(const Py_UNICODE *s,
1745 int size,
1746 Py_UNICODE ch);
1747
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748static
1749PyObject *unicodeescape_string(const Py_UNICODE *s,
1750 int size,
1751 int quotes)
1752{
1753 PyObject *repr;
1754 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001756 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757
1758 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1759 if (repr == NULL)
1760 return NULL;
1761
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001762 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763
1764 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765 *p++ = 'u';
1766 *p++ = (findchar(s, size, '\'') &&
1767 !findchar(s, size, '"')) ? '"' : '\'';
1768 }
1769 while (size-- > 0) {
1770 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001771
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001773 if (quotes &&
1774 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 *p++ = '\\';
1776 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001777 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001779
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001780#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001781 /* Map 21-bit characters to '\U00xxxxxx' */
1782 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001783 int offset = p - PyString_AS_STRING(repr);
1784
1785 /* Resize the string if necessary */
1786 if (offset + 12 > PyString_GET_SIZE(repr)) {
1787 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1788 goto onError;
1789 p = PyString_AS_STRING(repr) + offset;
1790 }
1791
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001792 *p++ = '\\';
1793 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001794 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1795 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1796 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1797 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1798 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1799 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1800 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001801 *p++ = hexdigit[ch & 0x0000000F];
1802 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001803 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001804#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001805 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1806 else if (ch >= 0xD800 && ch < 0xDC00) {
1807 Py_UNICODE ch2;
1808 Py_UCS4 ucs;
1809
1810 ch2 = *s++;
1811 size--;
1812 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1813 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1814 *p++ = '\\';
1815 *p++ = 'U';
1816 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1817 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1818 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1819 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1820 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1821 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1822 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1823 *p++ = hexdigit[ucs & 0x0000000F];
1824 continue;
1825 }
1826 /* Fall through: isolated surrogates are copied as-is */
1827 s--;
1828 size++;
1829 }
1830
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001832 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 *p++ = '\\';
1834 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001835 *p++ = hexdigit[(ch >> 12) & 0x000F];
1836 *p++ = hexdigit[(ch >> 8) & 0x000F];
1837 *p++ = hexdigit[(ch >> 4) & 0x000F];
1838 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001840
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001841 /* Map special whitespace to '\t', \n', '\r' */
1842 else if (ch == '\t') {
1843 *p++ = '\\';
1844 *p++ = 't';
1845 }
1846 else if (ch == '\n') {
1847 *p++ = '\\';
1848 *p++ = 'n';
1849 }
1850 else if (ch == '\r') {
1851 *p++ = '\\';
1852 *p++ = 'r';
1853 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001854
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001855 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001856 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001858 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001859 *p++ = hexdigit[(ch >> 4) & 0x000F];
1860 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001862
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 /* Copy everything else as-is */
1864 else
1865 *p++ = (char) ch;
1866 }
1867 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001868 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869
1870 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001871 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001872 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001875
1876 onError:
1877 Py_DECREF(repr);
1878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879}
1880
1881PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1882 int size)
1883{
1884 return unicodeescape_string(s, size, 0);
1885}
1886
1887PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1888{
1889 if (!PyUnicode_Check(unicode)) {
1890 PyErr_BadArgument();
1891 return NULL;
1892 }
1893 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1894 PyUnicode_GET_SIZE(unicode));
1895}
1896
1897/* --- Raw Unicode Escape Codec ------------------------------------------- */
1898
1899PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1900 int size,
1901 const char *errors)
1902{
1903 PyUnicodeObject *v;
1904 Py_UNICODE *p, *buf;
1905 const char *end;
1906 const char *bs;
1907
1908 /* Escaped strings will always be longer than the resulting
1909 Unicode string, so we start with size here and then reduce the
1910 length after conversion to the true value. */
1911 v = _PyUnicode_New(size);
1912 if (v == NULL)
1913 goto onError;
1914 if (size == 0)
1915 return (PyObject *)v;
1916 p = buf = PyUnicode_AS_UNICODE(v);
1917 end = s + size;
1918 while (s < end) {
1919 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001920 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 int i;
1922
1923 /* Non-escape characters are interpreted as Unicode ordinals */
1924 if (*s != '\\') {
1925 *p++ = (unsigned char)*s++;
1926 continue;
1927 }
1928
1929 /* \u-escapes are only interpreted iff the number of leading
1930 backslashes if odd */
1931 bs = s;
1932 for (;s < end;) {
1933 if (*s != '\\')
1934 break;
1935 *p++ = (unsigned char)*s++;
1936 }
1937 if (((s - bs) & 1) == 0 ||
1938 s >= end ||
1939 *s != 'u') {
1940 continue;
1941 }
1942 p--;
1943 s++;
1944
1945 /* \uXXXX with 4 hex digits */
1946 for (x = 0, i = 0; i < 4; i++) {
1947 c = (unsigned char)s[i];
1948 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001949 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950 "truncated \\uXXXX"))
1951 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001952 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 i++;
1954 break;
1955 }
1956 x = (x<<4) & ~0xF;
1957 if (c >= '0' && c <= '9')
1958 x += c - '0';
1959 else if (c >= 'a' && c <= 'f')
1960 x += 10 + c - 'a';
1961 else
1962 x += 10 + c - 'A';
1963 }
1964 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001965 if (x != 0xffffffff)
1966 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001968 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001969 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 return (PyObject *)v;
1971
1972 onError:
1973 Py_XDECREF(v);
1974 return NULL;
1975}
1976
1977PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1978 int size)
1979{
1980 PyObject *repr;
1981 char *p;
1982 char *q;
1983
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001984 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
1986 repr = PyString_FromStringAndSize(NULL, 6 * size);
1987 if (repr == NULL)
1988 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001989 if (size == 0)
1990 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991
1992 p = q = PyString_AS_STRING(repr);
1993 while (size-- > 0) {
1994 Py_UNICODE ch = *s++;
1995 /* Map 16-bit characters to '\uxxxx' */
1996 if (ch >= 256) {
1997 *p++ = '\\';
1998 *p++ = 'u';
1999 *p++ = hexdigit[(ch >> 12) & 0xf];
2000 *p++ = hexdigit[(ch >> 8) & 0xf];
2001 *p++ = hexdigit[(ch >> 4) & 0xf];
2002 *p++ = hexdigit[ch & 15];
2003 }
2004 /* Copy everything else as-is */
2005 else
2006 *p++ = (char) ch;
2007 }
2008 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002009 if (_PyString_Resize(&repr, p - q))
2010 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011
2012 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002013
2014 onError:
2015 Py_DECREF(repr);
2016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017}
2018
2019PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2020{
2021 if (!PyUnicode_Check(unicode)) {
2022 PyErr_BadArgument();
2023 return NULL;
2024 }
2025 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2026 PyUnicode_GET_SIZE(unicode));
2027}
2028
2029/* --- Latin-1 Codec ------------------------------------------------------ */
2030
2031PyObject *PyUnicode_DecodeLatin1(const char *s,
2032 int size,
2033 const char *errors)
2034{
2035 PyUnicodeObject *v;
2036 Py_UNICODE *p;
2037
2038 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002039 if (size == 1 && *(unsigned char*)s < 256) {
2040 Py_UNICODE r = *(unsigned char*)s;
2041 return PyUnicode_FromUnicode(&r, 1);
2042 }
2043
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 v = _PyUnicode_New(size);
2045 if (v == NULL)
2046 goto onError;
2047 if (size == 0)
2048 return (PyObject *)v;
2049 p = PyUnicode_AS_UNICODE(v);
2050 while (size-- > 0)
2051 *p++ = (unsigned char)*s++;
2052 return (PyObject *)v;
2053
2054 onError:
2055 Py_XDECREF(v);
2056 return NULL;
2057}
2058
2059static
2060int latin1_encoding_error(const Py_UNICODE **source,
2061 char **dest,
2062 const char *errors,
2063 const char *details)
2064{
2065 if ((errors == NULL) ||
2066 (strcmp(errors,"strict") == 0)) {
2067 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002068 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 details);
2070 return -1;
2071 }
2072 else if (strcmp(errors,"ignore") == 0) {
2073 return 0;
2074 }
2075 else if (strcmp(errors,"replace") == 0) {
2076 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002077 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 return 0;
2079 }
2080 else {
2081 PyErr_Format(PyExc_ValueError,
2082 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002083 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 errors);
2085 return -1;
2086 }
2087}
2088
2089PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2090 int size,
2091 const char *errors)
2092{
2093 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002094 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002095
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 repr = PyString_FromStringAndSize(NULL, size);
2097 if (repr == NULL)
2098 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002099 if (size == 0)
2100 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101
2102 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002103 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 while (size-- > 0) {
2105 Py_UNICODE ch = *p++;
2106 if (ch >= 256) {
2107 if (latin1_encoding_error(&p, &s, errors,
2108 "ordinal not in range(256)"))
2109 goto onError;
2110 }
2111 else
2112 *s++ = (char)ch;
2113 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002114 /* Resize if error handling skipped some characters */
2115 if (s - start < PyString_GET_SIZE(repr))
2116 if (_PyString_Resize(&repr, s - start))
2117 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 return repr;
2119
2120 onError:
2121 Py_DECREF(repr);
2122 return NULL;
2123}
2124
2125PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2126{
2127 if (!PyUnicode_Check(unicode)) {
2128 PyErr_BadArgument();
2129 return NULL;
2130 }
2131 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2132 PyUnicode_GET_SIZE(unicode),
2133 NULL);
2134}
2135
2136/* --- 7-bit ASCII Codec -------------------------------------------------- */
2137
2138static
2139int ascii_decoding_error(const char **source,
2140 Py_UNICODE **dest,
2141 const char *errors,
2142 const char *details)
2143{
2144 if ((errors == NULL) ||
2145 (strcmp(errors,"strict") == 0)) {
2146 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002147 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 details);
2149 return -1;
2150 }
2151 else if (strcmp(errors,"ignore") == 0) {
2152 return 0;
2153 }
2154 else if (strcmp(errors,"replace") == 0) {
2155 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2156 (*dest)++;
2157 return 0;
2158 }
2159 else {
2160 PyErr_Format(PyExc_ValueError,
2161 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002162 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 errors);
2164 return -1;
2165 }
2166}
2167
2168PyObject *PyUnicode_DecodeASCII(const char *s,
2169 int size,
2170 const char *errors)
2171{
2172 PyUnicodeObject *v;
2173 Py_UNICODE *p;
2174
2175 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002176 if (size == 1 && *(unsigned char*)s < 128) {
2177 Py_UNICODE r = *(unsigned char*)s;
2178 return PyUnicode_FromUnicode(&r, 1);
2179 }
2180
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 v = _PyUnicode_New(size);
2182 if (v == NULL)
2183 goto onError;
2184 if (size == 0)
2185 return (PyObject *)v;
2186 p = PyUnicode_AS_UNICODE(v);
2187 while (size-- > 0) {
2188 register unsigned char c;
2189
2190 c = (unsigned char)*s++;
2191 if (c < 128)
2192 *p++ = c;
2193 else if (ascii_decoding_error(&s, &p, errors,
2194 "ordinal not in range(128)"))
2195 goto onError;
2196 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002197 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002198 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002199 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 return (PyObject *)v;
2201
2202 onError:
2203 Py_XDECREF(v);
2204 return NULL;
2205}
2206
2207static
2208int ascii_encoding_error(const Py_UNICODE **source,
2209 char **dest,
2210 const char *errors,
2211 const char *details)
2212{
2213 if ((errors == NULL) ||
2214 (strcmp(errors,"strict") == 0)) {
2215 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002216 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 details);
2218 return -1;
2219 }
2220 else if (strcmp(errors,"ignore") == 0) {
2221 return 0;
2222 }
2223 else if (strcmp(errors,"replace") == 0) {
2224 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002225 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 return 0;
2227 }
2228 else {
2229 PyErr_Format(PyExc_ValueError,
2230 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002231 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 errors);
2233 return -1;
2234 }
2235}
2236
2237PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2238 int size,
2239 const char *errors)
2240{
2241 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002242 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002243
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 repr = PyString_FromStringAndSize(NULL, size);
2245 if (repr == NULL)
2246 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002247 if (size == 0)
2248 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249
2250 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002251 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 while (size-- > 0) {
2253 Py_UNICODE ch = *p++;
2254 if (ch >= 128) {
2255 if (ascii_encoding_error(&p, &s, errors,
2256 "ordinal not in range(128)"))
2257 goto onError;
2258 }
2259 else
2260 *s++ = (char)ch;
2261 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002262 /* Resize if error handling skipped some characters */
2263 if (s - start < PyString_GET_SIZE(repr))
2264 if (_PyString_Resize(&repr, s - start))
2265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 return repr;
2267
2268 onError:
2269 Py_DECREF(repr);
2270 return NULL;
2271}
2272
2273PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2274{
2275 if (!PyUnicode_Check(unicode)) {
2276 PyErr_BadArgument();
2277 return NULL;
2278 }
2279 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2280 PyUnicode_GET_SIZE(unicode),
2281 NULL);
2282}
2283
Fredrik Lundh30831632001-06-26 15:11:00 +00002284#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002285
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002286/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002287
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002288PyObject *PyUnicode_DecodeMBCS(const char *s,
2289 int size,
2290 const char *errors)
2291{
2292 PyUnicodeObject *v;
2293 Py_UNICODE *p;
2294
2295 /* First get the size of the result */
2296 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002297 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002298 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2299
2300 v = _PyUnicode_New(usize);
2301 if (v == NULL)
2302 return NULL;
2303 if (usize == 0)
2304 return (PyObject *)v;
2305 p = PyUnicode_AS_UNICODE(v);
2306 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2307 Py_DECREF(v);
2308 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2309 }
2310
2311 return (PyObject *)v;
2312}
2313
2314PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2315 int size,
2316 const char *errors)
2317{
2318 PyObject *repr;
2319 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002320 DWORD mbcssize;
2321
2322 /* If there are no characters, bail now! */
2323 if (size==0)
2324 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002325
2326 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002327 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002328 if (mbcssize==0)
2329 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2330
2331 repr = PyString_FromStringAndSize(NULL, mbcssize);
2332 if (repr == NULL)
2333 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002334 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002335 return repr;
2336
2337 /* Do the conversion */
2338 s = PyString_AS_STRING(repr);
2339 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2340 Py_DECREF(repr);
2341 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2342 }
2343 return repr;
2344}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002345
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002346#endif /* MS_WIN32 */
2347
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348/* --- Character Mapping Codec -------------------------------------------- */
2349
2350static
2351int charmap_decoding_error(const char **source,
2352 Py_UNICODE **dest,
2353 const char *errors,
2354 const char *details)
2355{
2356 if ((errors == NULL) ||
2357 (strcmp(errors,"strict") == 0)) {
2358 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002359 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 details);
2361 return -1;
2362 }
2363 else if (strcmp(errors,"ignore") == 0) {
2364 return 0;
2365 }
2366 else if (strcmp(errors,"replace") == 0) {
2367 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2368 (*dest)++;
2369 return 0;
2370 }
2371 else {
2372 PyErr_Format(PyExc_ValueError,
2373 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002374 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375 errors);
2376 return -1;
2377 }
2378}
2379
2380PyObject *PyUnicode_DecodeCharmap(const char *s,
2381 int size,
2382 PyObject *mapping,
2383 const char *errors)
2384{
2385 PyUnicodeObject *v;
2386 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002387 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388
2389 /* Default to Latin-1 */
2390 if (mapping == NULL)
2391 return PyUnicode_DecodeLatin1(s, size, errors);
2392
2393 v = _PyUnicode_New(size);
2394 if (v == NULL)
2395 goto onError;
2396 if (size == 0)
2397 return (PyObject *)v;
2398 p = PyUnicode_AS_UNICODE(v);
2399 while (size-- > 0) {
2400 unsigned char ch = *s++;
2401 PyObject *w, *x;
2402
2403 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2404 w = PyInt_FromLong((long)ch);
2405 if (w == NULL)
2406 goto onError;
2407 x = PyObject_GetItem(mapping, w);
2408 Py_DECREF(w);
2409 if (x == NULL) {
2410 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002411 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002413 x = Py_None;
2414 Py_INCREF(x);
2415 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417 }
2418
2419 /* Apply mapping */
2420 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002421 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422 if (value < 0 || value > 65535) {
2423 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002424 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 Py_DECREF(x);
2426 goto onError;
2427 }
2428 *p++ = (Py_UNICODE)value;
2429 }
2430 else if (x == Py_None) {
2431 /* undefined mapping */
2432 if (charmap_decoding_error(&s, &p, errors,
2433 "character maps to <undefined>")) {
2434 Py_DECREF(x);
2435 goto onError;
2436 }
2437 }
2438 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002439 int targetsize = PyUnicode_GET_SIZE(x);
2440
2441 if (targetsize == 1)
2442 /* 1-1 mapping */
2443 *p++ = *PyUnicode_AS_UNICODE(x);
2444
2445 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002447 if (targetsize > extrachars) {
2448 /* resize first */
2449 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2450 int needed = (targetsize - extrachars) + \
2451 (targetsize << 2);
2452 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002453 if (_PyUnicode_Resize(&v,
2454 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002455 Py_DECREF(x);
2456 goto onError;
2457 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002458 p = PyUnicode_AS_UNICODE(v) + oldpos;
2459 }
2460 Py_UNICODE_COPY(p,
2461 PyUnicode_AS_UNICODE(x),
2462 targetsize);
2463 p += targetsize;
2464 extrachars -= targetsize;
2465 }
2466 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 }
2468 else {
2469 /* wrong return value */
2470 PyErr_SetString(PyExc_TypeError,
2471 "character mapping must return integer, None or unicode");
2472 Py_DECREF(x);
2473 goto onError;
2474 }
2475 Py_DECREF(x);
2476 }
2477 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002478 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 goto onError;
2480 return (PyObject *)v;
2481
2482 onError:
2483 Py_XDECREF(v);
2484 return NULL;
2485}
2486
2487static
2488int charmap_encoding_error(const Py_UNICODE **source,
2489 char **dest,
2490 const char *errors,
2491 const char *details)
2492{
2493 if ((errors == NULL) ||
2494 (strcmp(errors,"strict") == 0)) {
2495 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002496 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 details);
2498 return -1;
2499 }
2500 else if (strcmp(errors,"ignore") == 0) {
2501 return 0;
2502 }
2503 else if (strcmp(errors,"replace") == 0) {
2504 **dest = '?';
2505 (*dest)++;
2506 return 0;
2507 }
2508 else {
2509 PyErr_Format(PyExc_ValueError,
2510 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002511 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 errors);
2513 return -1;
2514 }
2515}
2516
2517PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2518 int size,
2519 PyObject *mapping,
2520 const char *errors)
2521{
2522 PyObject *v;
2523 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002524 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525
2526 /* Default to Latin-1 */
2527 if (mapping == NULL)
2528 return PyUnicode_EncodeLatin1(p, size, errors);
2529
2530 v = PyString_FromStringAndSize(NULL, size);
2531 if (v == NULL)
2532 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002533 if (size == 0)
2534 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 s = PyString_AS_STRING(v);
2536 while (size-- > 0) {
2537 Py_UNICODE ch = *p++;
2538 PyObject *w, *x;
2539
2540 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2541 w = PyInt_FromLong((long)ch);
2542 if (w == NULL)
2543 goto onError;
2544 x = PyObject_GetItem(mapping, w);
2545 Py_DECREF(w);
2546 if (x == NULL) {
2547 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002548 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002550 x = Py_None;
2551 Py_INCREF(x);
2552 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 }
2555
2556 /* Apply mapping */
2557 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002558 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 if (value < 0 || value > 255) {
2560 PyErr_SetString(PyExc_TypeError,
2561 "character mapping must be in range(256)");
2562 Py_DECREF(x);
2563 goto onError;
2564 }
2565 *s++ = (char)value;
2566 }
2567 else if (x == Py_None) {
2568 /* undefined mapping */
2569 if (charmap_encoding_error(&p, &s, errors,
2570 "character maps to <undefined>")) {
2571 Py_DECREF(x);
2572 goto onError;
2573 }
2574 }
2575 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002576 int targetsize = PyString_GET_SIZE(x);
2577
2578 if (targetsize == 1)
2579 /* 1-1 mapping */
2580 *s++ = *PyString_AS_STRING(x);
2581
2582 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002584 if (targetsize > extrachars) {
2585 /* resize first */
2586 int oldpos = (int)(s - PyString_AS_STRING(v));
2587 int needed = (targetsize - extrachars) + \
2588 (targetsize << 2);
2589 extrachars += needed;
2590 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002591 Py_DECREF(x);
2592 goto onError;
2593 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002594 s = PyString_AS_STRING(v) + oldpos;
2595 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002596 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002597 s += targetsize;
2598 extrachars -= targetsize;
2599 }
2600 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 }
2602 else {
2603 /* wrong return value */
2604 PyErr_SetString(PyExc_TypeError,
2605 "character mapping must return integer, None or unicode");
2606 Py_DECREF(x);
2607 goto onError;
2608 }
2609 Py_DECREF(x);
2610 }
2611 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2612 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2613 goto onError;
2614 return v;
2615
2616 onError:
2617 Py_DECREF(v);
2618 return NULL;
2619}
2620
2621PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2622 PyObject *mapping)
2623{
2624 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2625 PyErr_BadArgument();
2626 return NULL;
2627 }
2628 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2629 PyUnicode_GET_SIZE(unicode),
2630 mapping,
2631 NULL);
2632}
2633
2634static
2635int translate_error(const Py_UNICODE **source,
2636 Py_UNICODE **dest,
2637 const char *errors,
2638 const char *details)
2639{
2640 if ((errors == NULL) ||
2641 (strcmp(errors,"strict") == 0)) {
2642 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002643 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 details);
2645 return -1;
2646 }
2647 else if (strcmp(errors,"ignore") == 0) {
2648 return 0;
2649 }
2650 else if (strcmp(errors,"replace") == 0) {
2651 **dest = '?';
2652 (*dest)++;
2653 return 0;
2654 }
2655 else {
2656 PyErr_Format(PyExc_ValueError,
2657 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002658 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 errors);
2660 return -1;
2661 }
2662}
2663
2664PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2665 int size,
2666 PyObject *mapping,
2667 const char *errors)
2668{
2669 PyUnicodeObject *v;
2670 Py_UNICODE *p;
2671
2672 if (mapping == NULL) {
2673 PyErr_BadArgument();
2674 return NULL;
2675 }
2676
2677 /* Output will never be longer than input */
2678 v = _PyUnicode_New(size);
2679 if (v == NULL)
2680 goto onError;
2681 if (size == 0)
2682 goto done;
2683 p = PyUnicode_AS_UNICODE(v);
2684 while (size-- > 0) {
2685 Py_UNICODE ch = *s++;
2686 PyObject *w, *x;
2687
2688 /* Get mapping */
2689 w = PyInt_FromLong(ch);
2690 if (w == NULL)
2691 goto onError;
2692 x = PyObject_GetItem(mapping, w);
2693 Py_DECREF(w);
2694 if (x == NULL) {
2695 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2696 /* No mapping found: default to 1-1 mapping */
2697 PyErr_Clear();
2698 *p++ = ch;
2699 continue;
2700 }
2701 goto onError;
2702 }
2703
2704 /* Apply mapping */
2705 if (PyInt_Check(x))
2706 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2707 else if (x == Py_None) {
2708 /* undefined mapping */
2709 if (translate_error(&s, &p, errors,
2710 "character maps to <undefined>")) {
2711 Py_DECREF(x);
2712 goto onError;
2713 }
2714 }
2715 else if (PyUnicode_Check(x)) {
2716 if (PyUnicode_GET_SIZE(x) != 1) {
2717 /* 1-n mapping */
2718 PyErr_SetString(PyExc_NotImplementedError,
2719 "1-n mappings are currently not implemented");
2720 Py_DECREF(x);
2721 goto onError;
2722 }
2723 *p++ = *PyUnicode_AS_UNICODE(x);
2724 }
2725 else {
2726 /* wrong return value */
2727 PyErr_SetString(PyExc_TypeError,
2728 "translate mapping must return integer, None or unicode");
2729 Py_DECREF(x);
2730 goto onError;
2731 }
2732 Py_DECREF(x);
2733 }
2734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002735 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
2738 done:
2739 return (PyObject *)v;
2740
2741 onError:
2742 Py_XDECREF(v);
2743 return NULL;
2744}
2745
2746PyObject *PyUnicode_Translate(PyObject *str,
2747 PyObject *mapping,
2748 const char *errors)
2749{
2750 PyObject *result;
2751
2752 str = PyUnicode_FromObject(str);
2753 if (str == NULL)
2754 goto onError;
2755 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2756 PyUnicode_GET_SIZE(str),
2757 mapping,
2758 errors);
2759 Py_DECREF(str);
2760 return result;
2761
2762 onError:
2763 Py_XDECREF(str);
2764 return NULL;
2765}
2766
Guido van Rossum9e896b32000-04-05 20:11:21 +00002767/* --- Decimal Encoder ---------------------------------------------------- */
2768
2769int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2770 int length,
2771 char *output,
2772 const char *errors)
2773{
2774 Py_UNICODE *p, *end;
2775
2776 if (output == NULL) {
2777 PyErr_BadArgument();
2778 return -1;
2779 }
2780
2781 p = s;
2782 end = s + length;
2783 while (p < end) {
2784 register Py_UNICODE ch = *p++;
2785 int decimal;
2786
2787 if (Py_UNICODE_ISSPACE(ch)) {
2788 *output++ = ' ';
2789 continue;
2790 }
2791 decimal = Py_UNICODE_TODECIMAL(ch);
2792 if (decimal >= 0) {
2793 *output++ = '0' + decimal;
2794 continue;
2795 }
Guido van Rossumba477042000-04-06 18:18:10 +00002796 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002797 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002798 continue;
2799 }
2800 /* All other characters are considered invalid */
2801 if (errors == NULL || strcmp(errors, "strict") == 0) {
2802 PyErr_SetString(PyExc_ValueError,
2803 "invalid decimal Unicode string");
2804 goto onError;
2805 }
2806 else if (strcmp(errors, "ignore") == 0)
2807 continue;
2808 else if (strcmp(errors, "replace") == 0) {
2809 *output++ = '?';
2810 continue;
2811 }
2812 }
2813 /* 0-terminate the output string */
2814 *output++ = '\0';
2815 return 0;
2816
2817 onError:
2818 return -1;
2819}
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821/* --- Helpers ------------------------------------------------------------ */
2822
2823static
2824int count(PyUnicodeObject *self,
2825 int start,
2826 int end,
2827 PyUnicodeObject *substring)
2828{
2829 int count = 0;
2830
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002831 if (start < 0)
2832 start += self->length;
2833 if (start < 0)
2834 start = 0;
2835 if (end > self->length)
2836 end = self->length;
2837 if (end < 0)
2838 end += self->length;
2839 if (end < 0)
2840 end = 0;
2841
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002842 if (substring->length == 0)
2843 return (end - start + 1);
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 end -= substring->length;
2846
2847 while (start <= end)
2848 if (Py_UNICODE_MATCH(self, start, substring)) {
2849 count++;
2850 start += substring->length;
2851 } else
2852 start++;
2853
2854 return count;
2855}
2856
2857int PyUnicode_Count(PyObject *str,
2858 PyObject *substr,
2859 int start,
2860 int end)
2861{
2862 int result;
2863
2864 str = PyUnicode_FromObject(str);
2865 if (str == NULL)
2866 return -1;
2867 substr = PyUnicode_FromObject(substr);
2868 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002869 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 return -1;
2871 }
2872
2873 result = count((PyUnicodeObject *)str,
2874 start, end,
2875 (PyUnicodeObject *)substr);
2876
2877 Py_DECREF(str);
2878 Py_DECREF(substr);
2879 return result;
2880}
2881
2882static
2883int findstring(PyUnicodeObject *self,
2884 PyUnicodeObject *substring,
2885 int start,
2886 int end,
2887 int direction)
2888{
2889 if (start < 0)
2890 start += self->length;
2891 if (start < 0)
2892 start = 0;
2893
2894 if (substring->length == 0)
2895 return start;
2896
2897 if (end > self->length)
2898 end = self->length;
2899 if (end < 0)
2900 end += self->length;
2901 if (end < 0)
2902 end = 0;
2903
2904 end -= substring->length;
2905
2906 if (direction < 0) {
2907 for (; end >= start; end--)
2908 if (Py_UNICODE_MATCH(self, end, substring))
2909 return end;
2910 } else {
2911 for (; start <= end; start++)
2912 if (Py_UNICODE_MATCH(self, start, substring))
2913 return start;
2914 }
2915
2916 return -1;
2917}
2918
2919int PyUnicode_Find(PyObject *str,
2920 PyObject *substr,
2921 int start,
2922 int end,
2923 int direction)
2924{
2925 int result;
2926
2927 str = PyUnicode_FromObject(str);
2928 if (str == NULL)
2929 return -1;
2930 substr = PyUnicode_FromObject(substr);
2931 if (substr == NULL) {
2932 Py_DECREF(substr);
2933 return -1;
2934 }
2935
2936 result = findstring((PyUnicodeObject *)str,
2937 (PyUnicodeObject *)substr,
2938 start, end, direction);
2939 Py_DECREF(str);
2940 Py_DECREF(substr);
2941 return result;
2942}
2943
2944static
2945int tailmatch(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int start,
2948 int end,
2949 int direction)
2950{
2951 if (start < 0)
2952 start += self->length;
2953 if (start < 0)
2954 start = 0;
2955
2956 if (substring->length == 0)
2957 return 1;
2958
2959 if (end > self->length)
2960 end = self->length;
2961 if (end < 0)
2962 end += self->length;
2963 if (end < 0)
2964 end = 0;
2965
2966 end -= substring->length;
2967 if (end < start)
2968 return 0;
2969
2970 if (direction > 0) {
2971 if (Py_UNICODE_MATCH(self, end, substring))
2972 return 1;
2973 } else {
2974 if (Py_UNICODE_MATCH(self, start, substring))
2975 return 1;
2976 }
2977
2978 return 0;
2979}
2980
2981int PyUnicode_Tailmatch(PyObject *str,
2982 PyObject *substr,
2983 int start,
2984 int end,
2985 int direction)
2986{
2987 int result;
2988
2989 str = PyUnicode_FromObject(str);
2990 if (str == NULL)
2991 return -1;
2992 substr = PyUnicode_FromObject(substr);
2993 if (substr == NULL) {
2994 Py_DECREF(substr);
2995 return -1;
2996 }
2997
2998 result = tailmatch((PyUnicodeObject *)str,
2999 (PyUnicodeObject *)substr,
3000 start, end, direction);
3001 Py_DECREF(str);
3002 Py_DECREF(substr);
3003 return result;
3004}
3005
3006static
3007const Py_UNICODE *findchar(const Py_UNICODE *s,
3008 int size,
3009 Py_UNICODE ch)
3010{
3011 /* like wcschr, but doesn't stop at NULL characters */
3012
3013 while (size-- > 0) {
3014 if (*s == ch)
3015 return s;
3016 s++;
3017 }
3018
3019 return NULL;
3020}
3021
3022/* Apply fixfct filter to the Unicode object self and return a
3023 reference to the modified object */
3024
3025static
3026PyObject *fixup(PyUnicodeObject *self,
3027 int (*fixfct)(PyUnicodeObject *s))
3028{
3029
3030 PyUnicodeObject *u;
3031
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003032 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 if (u == NULL)
3034 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003035
3036 Py_UNICODE_COPY(u->str, self->str, self->length);
3037
Tim Peters7a29bd52001-09-12 03:03:31 +00003038 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 /* fixfct should return TRUE if it modified the buffer. If
3040 FALSE, return a reference to the original buffer instead
3041 (to save space, not time) */
3042 Py_INCREF(self);
3043 Py_DECREF(u);
3044 return (PyObject*) self;
3045 }
3046 return (PyObject*) u;
3047}
3048
3049static
3050int fixupper(PyUnicodeObject *self)
3051{
3052 int len = self->length;
3053 Py_UNICODE *s = self->str;
3054 int status = 0;
3055
3056 while (len-- > 0) {
3057 register Py_UNICODE ch;
3058
3059 ch = Py_UNICODE_TOUPPER(*s);
3060 if (ch != *s) {
3061 status = 1;
3062 *s = ch;
3063 }
3064 s++;
3065 }
3066
3067 return status;
3068}
3069
3070static
3071int fixlower(PyUnicodeObject *self)
3072{
3073 int len = self->length;
3074 Py_UNICODE *s = self->str;
3075 int status = 0;
3076
3077 while (len-- > 0) {
3078 register Py_UNICODE ch;
3079
3080 ch = Py_UNICODE_TOLOWER(*s);
3081 if (ch != *s) {
3082 status = 1;
3083 *s = ch;
3084 }
3085 s++;
3086 }
3087
3088 return status;
3089}
3090
3091static
3092int fixswapcase(PyUnicodeObject *self)
3093{
3094 int len = self->length;
3095 Py_UNICODE *s = self->str;
3096 int status = 0;
3097
3098 while (len-- > 0) {
3099 if (Py_UNICODE_ISUPPER(*s)) {
3100 *s = Py_UNICODE_TOLOWER(*s);
3101 status = 1;
3102 } else if (Py_UNICODE_ISLOWER(*s)) {
3103 *s = Py_UNICODE_TOUPPER(*s);
3104 status = 1;
3105 }
3106 s++;
3107 }
3108
3109 return status;
3110}
3111
3112static
3113int fixcapitalize(PyUnicodeObject *self)
3114{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003115 int len = self->length;
3116 Py_UNICODE *s = self->str;
3117 int status = 0;
3118
3119 if (len == 0)
3120 return 0;
3121 if (Py_UNICODE_ISLOWER(*s)) {
3122 *s = Py_UNICODE_TOUPPER(*s);
3123 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003125 s++;
3126 while (--len > 0) {
3127 if (Py_UNICODE_ISUPPER(*s)) {
3128 *s = Py_UNICODE_TOLOWER(*s);
3129 status = 1;
3130 }
3131 s++;
3132 }
3133 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134}
3135
3136static
3137int fixtitle(PyUnicodeObject *self)
3138{
3139 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3140 register Py_UNICODE *e;
3141 int previous_is_cased;
3142
3143 /* Shortcut for single character strings */
3144 if (PyUnicode_GET_SIZE(self) == 1) {
3145 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3146 if (*p != ch) {
3147 *p = ch;
3148 return 1;
3149 }
3150 else
3151 return 0;
3152 }
3153
3154 e = p + PyUnicode_GET_SIZE(self);
3155 previous_is_cased = 0;
3156 for (; p < e; p++) {
3157 register const Py_UNICODE ch = *p;
3158
3159 if (previous_is_cased)
3160 *p = Py_UNICODE_TOLOWER(ch);
3161 else
3162 *p = Py_UNICODE_TOTITLE(ch);
3163
3164 if (Py_UNICODE_ISLOWER(ch) ||
3165 Py_UNICODE_ISUPPER(ch) ||
3166 Py_UNICODE_ISTITLE(ch))
3167 previous_is_cased = 1;
3168 else
3169 previous_is_cased = 0;
3170 }
3171 return 1;
3172}
3173
3174PyObject *PyUnicode_Join(PyObject *separator,
3175 PyObject *seq)
3176{
3177 Py_UNICODE *sep;
3178 int seplen;
3179 PyUnicodeObject *res = NULL;
3180 int reslen = 0;
3181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 int sz = 100;
3183 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003184 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
Tim Peters2cfe3682001-05-05 05:36:48 +00003186 it = PyObject_GetIter(seq);
3187 if (it == NULL)
3188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189
3190 if (separator == NULL) {
3191 Py_UNICODE blank = ' ';
3192 sep = &blank;
3193 seplen = 1;
3194 }
3195 else {
3196 separator = PyUnicode_FromObject(separator);
3197 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 sep = PyUnicode_AS_UNICODE(separator);
3200 seplen = PyUnicode_GET_SIZE(separator);
3201 }
3202
3203 res = _PyUnicode_New(sz);
3204 if (res == NULL)
3205 goto onError;
3206 p = PyUnicode_AS_UNICODE(res);
3207 reslen = 0;
3208
Tim Peters2cfe3682001-05-05 05:36:48 +00003209 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003211 PyObject *item = PyIter_Next(it);
3212 if (item == NULL) {
3213 if (PyErr_Occurred())
3214 goto onError;
3215 break;
3216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 if (!PyUnicode_Check(item)) {
3218 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003219 if (!PyString_Check(item)) {
3220 PyErr_Format(PyExc_TypeError,
3221 "sequence item %i: expected string or Unicode,"
3222 " %.80s found",
3223 i, item->ob_type->tp_name);
3224 Py_DECREF(item);
3225 goto onError;
3226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 v = PyUnicode_FromObject(item);
3228 Py_DECREF(item);
3229 item = v;
3230 if (item == NULL)
3231 goto onError;
3232 }
3233 itemlen = PyUnicode_GET_SIZE(item);
3234 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003235 if (_PyUnicode_Resize(&res, sz*2)) {
3236 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 sz *= 2;
3240 p = PyUnicode_AS_UNICODE(res) + reslen;
3241 }
3242 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003243 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 p += seplen;
3245 reslen += seplen;
3246 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003247 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 p += itemlen;
3249 reslen += itemlen;
3250 Py_DECREF(item);
3251 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003252 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 goto onError;
3254
3255 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003256 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 return (PyObject *)res;
3258
3259 onError:
3260 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003261 Py_XDECREF(res);
3262 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 return NULL;
3264}
3265
3266static
3267PyUnicodeObject *pad(PyUnicodeObject *self,
3268 int left,
3269 int right,
3270 Py_UNICODE fill)
3271{
3272 PyUnicodeObject *u;
3273
3274 if (left < 0)
3275 left = 0;
3276 if (right < 0)
3277 right = 0;
3278
Tim Peters7a29bd52001-09-12 03:03:31 +00003279 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 Py_INCREF(self);
3281 return self;
3282 }
3283
3284 u = _PyUnicode_New(left + self->length + right);
3285 if (u) {
3286 if (left)
3287 Py_UNICODE_FILL(u->str, fill, left);
3288 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3289 if (right)
3290 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3291 }
3292
3293 return u;
3294}
3295
/* Append data[left:right] as a new Unicode object to `list`.

   Relies on the expanding function to provide a `PyObject *str`
   scratch variable, a `PyObject *list` and an `onError` label, which is
   jumped to when the object cannot be created or appended.
   Wrapped in do { } while (0) so that the macro behaves as a single
   statement; use it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3306
3307static
3308PyObject *split_whitespace(PyUnicodeObject *self,
3309 PyObject *list,
3310 int maxcount)
3311{
3312 register int i;
3313 register int j;
3314 int len = self->length;
3315 PyObject *str;
3316
3317 for (i = j = 0; i < len; ) {
3318 /* find a token */
3319 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3320 i++;
3321 j = i;
3322 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3323 i++;
3324 if (j < i) {
3325 if (maxcount-- <= 0)
3326 break;
3327 SPLIT_APPEND(self->str, j, i);
3328 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3329 i++;
3330 j = i;
3331 }
3332 }
3333 if (j < len) {
3334 SPLIT_APPEND(self->str, j, len);
3335 }
3336 return list;
3337
3338 onError:
3339 Py_DECREF(list);
3340 return NULL;
3341}
3342
3343PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003344 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345{
3346 register int i;
3347 register int j;
3348 int len;
3349 PyObject *list;
3350 PyObject *str;
3351 Py_UNICODE *data;
3352
3353 string = PyUnicode_FromObject(string);
3354 if (string == NULL)
3355 return NULL;
3356 data = PyUnicode_AS_UNICODE(string);
3357 len = PyUnicode_GET_SIZE(string);
3358
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 list = PyList_New(0);
3360 if (!list)
3361 goto onError;
3362
3363 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003364 int eol;
3365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 /* Find a line and append it */
3367 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3368 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369
3370 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003371 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 if (i < len) {
3373 if (data[i] == '\r' && i + 1 < len &&
3374 data[i+1] == '\n')
3375 i += 2;
3376 else
3377 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003378 if (keepends)
3379 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
Guido van Rossum86662912000-04-11 15:38:46 +00003381 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 j = i;
3383 }
3384 if (j < len) {
3385 SPLIT_APPEND(data, j, len);
3386 }
3387
3388 Py_DECREF(string);
3389 return list;
3390
3391 onError:
3392 Py_DECREF(list);
3393 Py_DECREF(string);
3394 return NULL;
3395}
3396
3397static
3398PyObject *split_char(PyUnicodeObject *self,
3399 PyObject *list,
3400 Py_UNICODE ch,
3401 int maxcount)
3402{
3403 register int i;
3404 register int j;
3405 int len = self->length;
3406 PyObject *str;
3407
3408 for (i = j = 0; i < len; ) {
3409 if (self->str[i] == ch) {
3410 if (maxcount-- <= 0)
3411 break;
3412 SPLIT_APPEND(self->str, j, i);
3413 i = j = i + 1;
3414 } else
3415 i++;
3416 }
3417 if (j <= len) {
3418 SPLIT_APPEND(self->str, j, len);
3419 }
3420 return list;
3421
3422 onError:
3423 Py_DECREF(list);
3424 return NULL;
3425}
3426
3427static
3428PyObject *split_substring(PyUnicodeObject *self,
3429 PyObject *list,
3430 PyUnicodeObject *substring,
3431 int maxcount)
3432{
3433 register int i;
3434 register int j;
3435 int len = self->length;
3436 int sublen = substring->length;
3437 PyObject *str;
3438
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003439 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 if (Py_UNICODE_MATCH(self, i, substring)) {
3441 if (maxcount-- <= 0)
3442 break;
3443 SPLIT_APPEND(self->str, j, i);
3444 i = j = i + sublen;
3445 } else
3446 i++;
3447 }
3448 if (j <= len) {
3449 SPLIT_APPEND(self->str, j, len);
3450 }
3451 return list;
3452
3453 onError:
3454 Py_DECREF(list);
3455 return NULL;
3456}
3457
3458#undef SPLIT_APPEND
3459
3460static
3461PyObject *split(PyUnicodeObject *self,
3462 PyUnicodeObject *substring,
3463 int maxcount)
3464{
3465 PyObject *list;
3466
3467 if (maxcount < 0)
3468 maxcount = INT_MAX;
3469
3470 list = PyList_New(0);
3471 if (!list)
3472 return NULL;
3473
3474 if (substring == NULL)
3475 return split_whitespace(self,list,maxcount);
3476
3477 else if (substring->length == 1)
3478 return split_char(self,list,substring->str[0],maxcount);
3479
3480 else if (substring->length == 0) {
3481 Py_DECREF(list);
3482 PyErr_SetString(PyExc_ValueError, "empty separator");
3483 return NULL;
3484 }
3485 else
3486 return split_substring(self,list,substring,maxcount);
3487}
3488
3489static
3490PyObject *strip(PyUnicodeObject *self,
3491 int left,
3492 int right)
3493{
3494 Py_UNICODE *p = self->str;
3495 int start = 0;
3496 int end = self->length;
3497
3498 if (left)
3499 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3500 start++;
3501
3502 if (right)
3503 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3504 end--;
3505
Tim Peters7a29bd52001-09-12 03:03:31 +00003506 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 /* couldn't strip anything off, return original string */
3508 Py_INCREF(self);
3509 return (PyObject*) self;
3510 }
3511
3512 return (PyObject*) PyUnicode_FromUnicode(
3513 self->str + start,
3514 end - start
3515 );
3516}
3517
3518static
3519PyObject *replace(PyUnicodeObject *self,
3520 PyUnicodeObject *str1,
3521 PyUnicodeObject *str2,
3522 int maxcount)
3523{
3524 PyUnicodeObject *u;
3525
3526 if (maxcount < 0)
3527 maxcount = INT_MAX;
3528
3529 if (str1->length == 1 && str2->length == 1) {
3530 int i;
3531
3532 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003533 if (!findchar(self->str, self->length, str1->str[0]) &&
3534 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 /* nothing to replace, return original string */
3536 Py_INCREF(self);
3537 u = self;
3538 } else {
3539 Py_UNICODE u1 = str1->str[0];
3540 Py_UNICODE u2 = str2->str[0];
3541
3542 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003543 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 self->length
3545 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003546 if (u != NULL) {
3547 Py_UNICODE_COPY(u->str, self->str,
3548 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 for (i = 0; i < u->length; i++)
3550 if (u->str[i] == u1) {
3551 if (--maxcount < 0)
3552 break;
3553 u->str[i] = u2;
3554 }
3555 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557
3558 } else {
3559 int n, i;
3560 Py_UNICODE *p;
3561
3562 /* replace strings */
3563 n = count(self, 0, self->length, str1);
3564 if (n > maxcount)
3565 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003566 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 /* nothing to replace, return original string */
3568 Py_INCREF(self);
3569 u = self;
3570 } else {
3571 u = _PyUnicode_New(
3572 self->length + n * (str2->length - str1->length));
3573 if (u) {
3574 i = 0;
3575 p = u->str;
3576 while (i <= self->length - str1->length)
3577 if (Py_UNICODE_MATCH(self, i, str1)) {
3578 /* replace string segment */
3579 Py_UNICODE_COPY(p, str2->str, str2->length);
3580 p += str2->length;
3581 i += str1->length;
3582 if (--n <= 0) {
3583 /* copy remaining part */
3584 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3585 break;
3586 }
3587 } else
3588 *p++ = self->str[i++];
3589 }
3590 }
3591 }
3592
3593 return (PyObject *) u;
3594}
3595
3596/* --- Unicode Object Methods --------------------------------------------- */
3597
3598static char title__doc__[] =
3599"S.title() -> unicode\n\
3600\n\
3601Return a titlecased version of S, i.e. words start with title case\n\
3602characters, all remaining cased characters have lower case.";
3603
3604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003605unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 return fixup(self, fixtitle);
3608}
3609
3610static char capitalize__doc__[] =
3611"S.capitalize() -> unicode\n\
3612\n\
3613Return a capitalized version of S, i.e. make the first character\n\
3614have upper case.";
3615
3616static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003617unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 return fixup(self, fixcapitalize);
3620}
3621
3622#if 0
3623static char capwords__doc__[] =
3624"S.capwords() -> unicode\n\
3625\n\
3626Apply .capitalize() to all words in S and return the result with\n\
3627normalized whitespace (all whitespace strings are replaced by ' ').";
3628
3629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003630unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631{
3632 PyObject *list;
3633 PyObject *item;
3634 int i;
3635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 /* Split into words */
3637 list = split(self, NULL, -1);
3638 if (!list)
3639 return NULL;
3640
3641 /* Capitalize each word */
3642 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3643 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3644 fixcapitalize);
3645 if (item == NULL)
3646 goto onError;
3647 Py_DECREF(PyList_GET_ITEM(list, i));
3648 PyList_SET_ITEM(list, i, item);
3649 }
3650
3651 /* Join the words to form a new string */
3652 item = PyUnicode_Join(NULL, list);
3653
3654onError:
3655 Py_DECREF(list);
3656 return (PyObject *)item;
3657}
3658#endif
3659
3660static char center__doc__[] =
3661"S.center(width) -> unicode\n\
3662\n\
3663Return S centered in a Unicode string of length width. Padding is done\n\
3664using spaces.";
3665
3666static PyObject *
3667unicode_center(PyUnicodeObject *self, PyObject *args)
3668{
3669 int marg, left;
3670 int width;
3671
3672 if (!PyArg_ParseTuple(args, "i:center", &width))
3673 return NULL;
3674
Tim Peters7a29bd52001-09-12 03:03:31 +00003675 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 Py_INCREF(self);
3677 return (PyObject*) self;
3678 }
3679
3680 marg = width - self->length;
3681 left = marg / 2 + (marg & width & 1);
3682
3683 return (PyObject*) pad(self, left, marg - left, ' ');
3684}
3685
Marc-André Lemburge5034372000-08-08 08:04:29 +00003686#if 0
3687
3688/* This code should go into some future Unicode collation support
3689 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003690 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003691
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003692/* speedy UTF-16 code point order comparison */
3693/* gleaned from: */
3694/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3695
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003696static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003697{
3698 0, 0, 0, 0, 0, 0, 0, 0,
3699 0, 0, 0, 0, 0, 0, 0, 0,
3700 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003701 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003702};
3703
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704static int
3705unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3706{
3707 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003708
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 Py_UNICODE *s1 = str1->str;
3710 Py_UNICODE *s2 = str2->str;
3711
3712 len1 = str1->length;
3713 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003714
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003716 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003717
3718 c1 = *s1++;
3719 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003720
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003721 if (c1 > (1<<11) * 26)
3722 c1 += utf16Fixup[c1>>11];
3723 if (c2 > (1<<11) * 26)
3724 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003725 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003726
3727 if (c1 != c2)
3728 return (c1 < c2) ? -1 : 1;
3729
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003730 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 }
3732
3733 return (len1 < len2) ? -1 : (len1 != len2);
3734}
3735
Marc-André Lemburge5034372000-08-08 08:04:29 +00003736#else
3737
3738static int
3739unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3740{
3741 register int len1, len2;
3742
3743 Py_UNICODE *s1 = str1->str;
3744 Py_UNICODE *s2 = str2->str;
3745
3746 len1 = str1->length;
3747 len2 = str2->length;
3748
3749 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003750 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003751
Fredrik Lundh45714e92001-06-26 16:39:36 +00003752 c1 = *s1++;
3753 c2 = *s2++;
3754
3755 if (c1 != c2)
3756 return (c1 < c2) ? -1 : 1;
3757
Marc-André Lemburge5034372000-08-08 08:04:29 +00003758 len1--; len2--;
3759 }
3760
3761 return (len1 < len2) ? -1 : (len1 != len2);
3762}
3763
3764#endif
3765
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766int PyUnicode_Compare(PyObject *left,
3767 PyObject *right)
3768{
3769 PyUnicodeObject *u = NULL, *v = NULL;
3770 int result;
3771
3772 /* Coerce the two arguments */
3773 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3774 if (u == NULL)
3775 goto onError;
3776 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3777 if (v == NULL)
3778 goto onError;
3779
Thomas Wouters7e474022000-07-16 12:04:32 +00003780 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 if (v == u) {
3782 Py_DECREF(u);
3783 Py_DECREF(v);
3784 return 0;
3785 }
3786
3787 result = unicode_compare(u, v);
3788
3789 Py_DECREF(u);
3790 Py_DECREF(v);
3791 return result;
3792
3793onError:
3794 Py_XDECREF(u);
3795 Py_XDECREF(v);
3796 return -1;
3797}
3798
Guido van Rossum403d68b2000-03-13 15:55:09 +00003799int PyUnicode_Contains(PyObject *container,
3800 PyObject *element)
3801{
3802 PyUnicodeObject *u = NULL, *v = NULL;
3803 int result;
3804 register const Py_UNICODE *p, *e;
3805 register Py_UNICODE ch;
3806
3807 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003808 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003809 if (v == NULL) {
3810 PyErr_SetString(PyExc_TypeError,
3811 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003812 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003813 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003814 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3815 if (u == NULL) {
3816 Py_DECREF(v);
3817 goto onError;
3818 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003819
3820 /* Check v in u */
3821 if (PyUnicode_GET_SIZE(v) != 1) {
3822 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003823 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003824 goto onError;
3825 }
3826 ch = *PyUnicode_AS_UNICODE(v);
3827 p = PyUnicode_AS_UNICODE(u);
3828 e = p + PyUnicode_GET_SIZE(u);
3829 result = 0;
3830 while (p < e) {
3831 if (*p++ == ch) {
3832 result = 1;
3833 break;
3834 }
3835 }
3836
3837 Py_DECREF(u);
3838 Py_DECREF(v);
3839 return result;
3840
3841onError:
3842 Py_XDECREF(u);
3843 Py_XDECREF(v);
3844 return -1;
3845}
3846
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847/* Concat to string or Unicode object giving a new Unicode object. */
3848
3849PyObject *PyUnicode_Concat(PyObject *left,
3850 PyObject *right)
3851{
3852 PyUnicodeObject *u = NULL, *v = NULL, *w;
3853
3854 /* Coerce the two arguments */
3855 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3856 if (u == NULL)
3857 goto onError;
3858 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3859 if (v == NULL)
3860 goto onError;
3861
3862 /* Shortcuts */
3863 if (v == unicode_empty) {
3864 Py_DECREF(v);
3865 return (PyObject *)u;
3866 }
3867 if (u == unicode_empty) {
3868 Py_DECREF(u);
3869 return (PyObject *)v;
3870 }
3871
3872 /* Concat the two Unicode strings */
3873 w = _PyUnicode_New(u->length + v->length);
3874 if (w == NULL)
3875 goto onError;
3876 Py_UNICODE_COPY(w->str, u->str, u->length);
3877 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3878
3879 Py_DECREF(u);
3880 Py_DECREF(v);
3881 return (PyObject *)w;
3882
3883onError:
3884 Py_XDECREF(u);
3885 Py_XDECREF(v);
3886 return NULL;
3887}
3888
3889static char count__doc__[] =
3890"S.count(sub[, start[, end]]) -> int\n\
3891\n\
3892Return the number of occurrences of substring sub in Unicode string\n\
3893S[start:end]. Optional arguments start and end are\n\
3894interpreted as in slice notation.";
3895
3896static PyObject *
3897unicode_count(PyUnicodeObject *self, PyObject *args)
3898{
3899 PyUnicodeObject *substring;
3900 int start = 0;
3901 int end = INT_MAX;
3902 PyObject *result;
3903
Guido van Rossumb8872e62000-05-09 14:14:27 +00003904 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3905 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 return NULL;
3907
3908 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3909 (PyObject *)substring);
3910 if (substring == NULL)
3911 return NULL;
3912
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 if (start < 0)
3914 start += self->length;
3915 if (start < 0)
3916 start = 0;
3917 if (end > self->length)
3918 end = self->length;
3919 if (end < 0)
3920 end += self->length;
3921 if (end < 0)
3922 end = 0;
3923
3924 result = PyInt_FromLong((long) count(self, start, end, substring));
3925
3926 Py_DECREF(substring);
3927 return result;
3928}
3929
3930static char encode__doc__[] =
3931"S.encode([encoding[,errors]]) -> string\n\
3932\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003933Return an encoded string version of S. Default encoding is the current\n\
3934default string encoding. errors may be given to set a different error\n\
3935handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3936a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937
3938static PyObject *
3939unicode_encode(PyUnicodeObject *self, PyObject *args)
3940{
3941 char *encoding = NULL;
3942 char *errors = NULL;
3943 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3944 return NULL;
3945 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3946}
3947
3948static char expandtabs__doc__[] =
3949"S.expandtabs([tabsize]) -> unicode\n\
3950\n\
3951Return a copy of S where all tab characters are expanded using spaces.\n\
3952If tabsize is not given, a tab size of 8 characters is assumed.";
3953
3954static PyObject*
3955unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3956{
3957 Py_UNICODE *e;
3958 Py_UNICODE *p;
3959 Py_UNICODE *q;
3960 int i, j;
3961 PyUnicodeObject *u;
3962 int tabsize = 8;
3963
3964 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3965 return NULL;
3966
Thomas Wouters7e474022000-07-16 12:04:32 +00003967 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 i = j = 0;
3969 e = self->str + self->length;
3970 for (p = self->str; p < e; p++)
3971 if (*p == '\t') {
3972 if (tabsize > 0)
3973 j += tabsize - (j % tabsize);
3974 }
3975 else {
3976 j++;
3977 if (*p == '\n' || *p == '\r') {
3978 i += j;
3979 j = 0;
3980 }
3981 }
3982
3983 /* Second pass: create output string and fill it */
3984 u = _PyUnicode_New(i + j);
3985 if (!u)
3986 return NULL;
3987
3988 j = 0;
3989 q = u->str;
3990
3991 for (p = self->str; p < e; p++)
3992 if (*p == '\t') {
3993 if (tabsize > 0) {
3994 i = tabsize - (j % tabsize);
3995 j += i;
3996 while (i--)
3997 *q++ = ' ';
3998 }
3999 }
4000 else {
4001 j++;
4002 *q++ = *p;
4003 if (*p == '\n' || *p == '\r')
4004 j = 0;
4005 }
4006
4007 return (PyObject*) u;
4008}
4009
/* Docstring for S.find().  The searched range is written in slice
   notation, S[start:end], matching the wording used by count__doc__. */
static char find__doc__[] =
"S.find(sub [,start [,end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4018
4019static PyObject *
4020unicode_find(PyUnicodeObject *self, PyObject *args)
4021{
4022 PyUnicodeObject *substring;
4023 int start = 0;
4024 int end = INT_MAX;
4025 PyObject *result;
4026
Guido van Rossumb8872e62000-05-09 14:14:27 +00004027 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4028 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 return NULL;
4030 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4031 (PyObject *)substring);
4032 if (substring == NULL)
4033 return NULL;
4034
4035 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4036
4037 Py_DECREF(substring);
4038 return result;
4039}
4040
4041static PyObject *
4042unicode_getitem(PyUnicodeObject *self, int index)
4043{
4044 if (index < 0 || index >= self->length) {
4045 PyErr_SetString(PyExc_IndexError, "string index out of range");
4046 return NULL;
4047 }
4048
4049 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4050}
4051
4052static long
4053unicode_hash(PyUnicodeObject *self)
4054{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004055 /* Since Unicode objects compare equal to their ASCII string
4056 counterparts, they should use the individual character values
4057 as basis for their hash value. This is needed to assure that
4058 strings and Unicode objects behave in the same way as
4059 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060
Fredrik Lundhdde61642000-07-10 18:27:47 +00004061 register int len;
4062 register Py_UNICODE *p;
4063 register long x;
4064
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 if (self->hash != -1)
4066 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004067 len = PyUnicode_GET_SIZE(self);
4068 p = PyUnicode_AS_UNICODE(self);
4069 x = *p << 7;
4070 while (--len >= 0)
4071 x = (1000003*x) ^ *p++;
4072 x ^= PyUnicode_GET_SIZE(self);
4073 if (x == -1)
4074 x = -2;
4075 self->hash = x;
4076 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077}
4078
4079static char index__doc__[] =
4080"S.index(sub [,start [,end]]) -> int\n\
4081\n\
4082Like S.find() but raise ValueError when the substring is not found.";
4083
4084static PyObject *
4085unicode_index(PyUnicodeObject *self, PyObject *args)
4086{
4087 int result;
4088 PyUnicodeObject *substring;
4089 int start = 0;
4090 int end = INT_MAX;
4091
Guido van Rossumb8872e62000-05-09 14:14:27 +00004092 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4093 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 return NULL;
4095
4096 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4097 (PyObject *)substring);
4098 if (substring == NULL)
4099 return NULL;
4100
4101 result = findstring(self, substring, start, end, 1);
4102
4103 Py_DECREF(substring);
4104 if (result < 0) {
4105 PyErr_SetString(PyExc_ValueError, "substring not found");
4106 return NULL;
4107 }
4108 return PyInt_FromLong(result);
4109}
4110
4111static char islower__doc__[] =
4112"S.islower() -> int\n\
4113\n\
4114Return 1 if all cased characters in S are lowercase and there is\n\
4115at least one cased character in S, 0 otherwise.";
4116
4117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004118unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119{
4120 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4121 register const Py_UNICODE *e;
4122 int cased;
4123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 /* Shortcut for single character strings */
4125 if (PyUnicode_GET_SIZE(self) == 1)
4126 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4127
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004128 /* Special case for empty strings */
4129 if (PyString_GET_SIZE(self) == 0)
4130 return PyInt_FromLong(0);
4131
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 e = p + PyUnicode_GET_SIZE(self);
4133 cased = 0;
4134 for (; p < e; p++) {
4135 register const Py_UNICODE ch = *p;
4136
4137 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4138 return PyInt_FromLong(0);
4139 else if (!cased && Py_UNICODE_ISLOWER(ch))
4140 cased = 1;
4141 }
4142 return PyInt_FromLong(cased);
4143}
4144
4145static char isupper__doc__[] =
4146"S.isupper() -> int\n\
4147\n\
4148Return 1 if all cased characters in S are uppercase and there is\n\
4149at least one cased character in S, 0 otherwise.";
4150
4151static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004152unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153{
4154 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4155 register const Py_UNICODE *e;
4156 int cased;
4157
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 /* Shortcut for single character strings */
4159 if (PyUnicode_GET_SIZE(self) == 1)
4160 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4161
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004162 /* Special case for empty strings */
4163 if (PyString_GET_SIZE(self) == 0)
4164 return PyInt_FromLong(0);
4165
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 e = p + PyUnicode_GET_SIZE(self);
4167 cased = 0;
4168 for (; p < e; p++) {
4169 register const Py_UNICODE ch = *p;
4170
4171 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4172 return PyInt_FromLong(0);
4173 else if (!cased && Py_UNICODE_ISUPPER(ch))
4174 cased = 1;
4175 }
4176 return PyInt_FromLong(cased);
4177}
4178
4179static char istitle__doc__[] =
4180"S.istitle() -> int\n\
4181\n\
4182Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4183may only follow uncased characters and lowercase characters only cased\n\
4184ones. Return 0 otherwise.";
4185
4186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004187unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188{
4189 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4190 register const Py_UNICODE *e;
4191 int cased, previous_is_cased;
4192
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 /* Shortcut for single character strings */
4194 if (PyUnicode_GET_SIZE(self) == 1)
4195 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4196 (Py_UNICODE_ISUPPER(*p) != 0));
4197
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004198 /* Special case for empty strings */
4199 if (PyString_GET_SIZE(self) == 0)
4200 return PyInt_FromLong(0);
4201
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 e = p + PyUnicode_GET_SIZE(self);
4203 cased = 0;
4204 previous_is_cased = 0;
4205 for (; p < e; p++) {
4206 register const Py_UNICODE ch = *p;
4207
4208 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4209 if (previous_is_cased)
4210 return PyInt_FromLong(0);
4211 previous_is_cased = 1;
4212 cased = 1;
4213 }
4214 else if (Py_UNICODE_ISLOWER(ch)) {
4215 if (!previous_is_cased)
4216 return PyInt_FromLong(0);
4217 previous_is_cased = 1;
4218 cased = 1;
4219 }
4220 else
4221 previous_is_cased = 0;
4222 }
4223 return PyInt_FromLong(cased);
4224}
4225
4226static char isspace__doc__[] =
4227"S.isspace() -> int\n\
4228\n\
4229Return 1 if there are only whitespace characters in S,\n\
42300 otherwise.";
4231
4232static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004233unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
4235 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4236 register const Py_UNICODE *e;
4237
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 /* Shortcut for single character strings */
4239 if (PyUnicode_GET_SIZE(self) == 1 &&
4240 Py_UNICODE_ISSPACE(*p))
4241 return PyInt_FromLong(1);
4242
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004243 /* Special case for empty strings */
4244 if (PyString_GET_SIZE(self) == 0)
4245 return PyInt_FromLong(0);
4246
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 e = p + PyUnicode_GET_SIZE(self);
4248 for (; p < e; p++) {
4249 if (!Py_UNICODE_ISSPACE(*p))
4250 return PyInt_FromLong(0);
4251 }
4252 return PyInt_FromLong(1);
4253}
4254
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004255static char isalpha__doc__[] =
4256"S.isalpha() -> int\n\
4257\n\
4258Return 1 if all characters in S are alphabetic\n\
4259and there is at least one character in S, 0 otherwise.";
4260
4261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004262unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004263{
4264 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4265 register const Py_UNICODE *e;
4266
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004267 /* Shortcut for single character strings */
4268 if (PyUnicode_GET_SIZE(self) == 1 &&
4269 Py_UNICODE_ISALPHA(*p))
4270 return PyInt_FromLong(1);
4271
4272 /* Special case for empty strings */
4273 if (PyString_GET_SIZE(self) == 0)
4274 return PyInt_FromLong(0);
4275
4276 e = p + PyUnicode_GET_SIZE(self);
4277 for (; p < e; p++) {
4278 if (!Py_UNICODE_ISALPHA(*p))
4279 return PyInt_FromLong(0);
4280 }
4281 return PyInt_FromLong(1);
4282}
4283
4284static char isalnum__doc__[] =
4285"S.isalnum() -> int\n\
4286\n\
4287Return 1 if all characters in S are alphanumeric\n\
4288and there is at least one character in S, 0 otherwise.";
4289
4290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004291unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004292{
4293 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4294 register const Py_UNICODE *e;
4295
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004296 /* Shortcut for single character strings */
4297 if (PyUnicode_GET_SIZE(self) == 1 &&
4298 Py_UNICODE_ISALNUM(*p))
4299 return PyInt_FromLong(1);
4300
4301 /* Special case for empty strings */
4302 if (PyString_GET_SIZE(self) == 0)
4303 return PyInt_FromLong(0);
4304
4305 e = p + PyUnicode_GET_SIZE(self);
4306 for (; p < e; p++) {
4307 if (!Py_UNICODE_ISALNUM(*p))
4308 return PyInt_FromLong(0);
4309 }
4310 return PyInt_FromLong(1);
4311}
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313static char isdecimal__doc__[] =
4314"S.isdecimal() -> int\n\
4315\n\
4316Return 1 if there are only decimal characters in S,\n\
43170 otherwise.";
4318
4319static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004320unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321{
4322 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4323 register const Py_UNICODE *e;
4324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 /* Shortcut for single character strings */
4326 if (PyUnicode_GET_SIZE(self) == 1 &&
4327 Py_UNICODE_ISDECIMAL(*p))
4328 return PyInt_FromLong(1);
4329
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004330 /* Special case for empty strings */
4331 if (PyString_GET_SIZE(self) == 0)
4332 return PyInt_FromLong(0);
4333
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 e = p + PyUnicode_GET_SIZE(self);
4335 for (; p < e; p++) {
4336 if (!Py_UNICODE_ISDECIMAL(*p))
4337 return PyInt_FromLong(0);
4338 }
4339 return PyInt_FromLong(1);
4340}
4341
4342static char isdigit__doc__[] =
4343"S.isdigit() -> int\n\
4344\n\
4345Return 1 if there are only digit characters in S,\n\
43460 otherwise.";
4347
4348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004349unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350{
4351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4352 register const Py_UNICODE *e;
4353
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 /* Shortcut for single character strings */
4355 if (PyUnicode_GET_SIZE(self) == 1 &&
4356 Py_UNICODE_ISDIGIT(*p))
4357 return PyInt_FromLong(1);
4358
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004359 /* Special case for empty strings */
4360 if (PyString_GET_SIZE(self) == 0)
4361 return PyInt_FromLong(0);
4362
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 e = p + PyUnicode_GET_SIZE(self);
4364 for (; p < e; p++) {
4365 if (!Py_UNICODE_ISDIGIT(*p))
4366 return PyInt_FromLong(0);
4367 }
4368 return PyInt_FromLong(1);
4369}
4370
4371static char isnumeric__doc__[] =
4372"S.isnumeric() -> int\n\
4373\n\
4374Return 1 if there are only numeric characters in S,\n\
43750 otherwise.";
4376
4377static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004378unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379{
4380 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4381 register const Py_UNICODE *e;
4382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 /* Shortcut for single character strings */
4384 if (PyUnicode_GET_SIZE(self) == 1 &&
4385 Py_UNICODE_ISNUMERIC(*p))
4386 return PyInt_FromLong(1);
4387
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004388 /* Special case for empty strings */
4389 if (PyString_GET_SIZE(self) == 0)
4390 return PyInt_FromLong(0);
4391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 e = p + PyUnicode_GET_SIZE(self);
4393 for (; p < e; p++) {
4394 if (!Py_UNICODE_ISNUMERIC(*p))
4395 return PyInt_FromLong(0);
4396 }
4397 return PyInt_FromLong(1);
4398}
4399
4400static char join__doc__[] =
4401"S.join(sequence) -> unicode\n\
4402\n\
4403Return a string which is the concatenation of the strings in the\n\
4404sequence. The separator between elements is S.";
4405
4406static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004407unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004409 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410}
4411
4412static int
4413unicode_length(PyUnicodeObject *self)
4414{
4415 return self->length;
4416}
4417
4418static char ljust__doc__[] =
4419"S.ljust(width) -> unicode\n\
4420\n\
4421Return S left justified in a Unicode string of length width. Padding is\n\
4422done using spaces.";
4423
4424static PyObject *
4425unicode_ljust(PyUnicodeObject *self, PyObject *args)
4426{
4427 int width;
4428 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4429 return NULL;
4430
Tim Peters7a29bd52001-09-12 03:03:31 +00004431 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 Py_INCREF(self);
4433 return (PyObject*) self;
4434 }
4435
4436 return (PyObject*) pad(self, 0, width - self->length, ' ');
4437}
4438
4439static char lower__doc__[] =
4440"S.lower() -> unicode\n\
4441\n\
4442Return a copy of the string S converted to lowercase.";
4443
4444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004445unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 return fixup(self, fixlower);
4448}
4449
4450static char lstrip__doc__[] =
4451"S.lstrip() -> unicode\n\
4452\n\
4453Return a copy of the string S with leading whitespace removed.";
4454
4455static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004456unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 return strip(self, 1, 0);
4459}
4460
4461static PyObject*
4462unicode_repeat(PyUnicodeObject *str, int len)
4463{
4464 PyUnicodeObject *u;
4465 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004466 int nchars;
4467 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
4469 if (len < 0)
4470 len = 0;
4471
Tim Peters7a29bd52001-09-12 03:03:31 +00004472 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 /* no repeat, return original string */
4474 Py_INCREF(str);
4475 return (PyObject*) str;
4476 }
Tim Peters8f422462000-09-09 06:13:41 +00004477
4478 /* ensure # of chars needed doesn't overflow int and # of bytes
4479 * needed doesn't overflow size_t
4480 */
4481 nchars = len * str->length;
4482 if (len && nchars / len != str->length) {
4483 PyErr_SetString(PyExc_OverflowError,
4484 "repeated string is too long");
4485 return NULL;
4486 }
4487 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4488 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4489 PyErr_SetString(PyExc_OverflowError,
4490 "repeated string is too long");
4491 return NULL;
4492 }
4493 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 if (!u)
4495 return NULL;
4496
4497 p = u->str;
4498
4499 while (len-- > 0) {
4500 Py_UNICODE_COPY(p, str->str, str->length);
4501 p += str->length;
4502 }
4503
4504 return (PyObject*) u;
4505}
4506
4507PyObject *PyUnicode_Replace(PyObject *obj,
4508 PyObject *subobj,
4509 PyObject *replobj,
4510 int maxcount)
4511{
4512 PyObject *self;
4513 PyObject *str1;
4514 PyObject *str2;
4515 PyObject *result;
4516
4517 self = PyUnicode_FromObject(obj);
4518 if (self == NULL)
4519 return NULL;
4520 str1 = PyUnicode_FromObject(subobj);
4521 if (str1 == NULL) {
4522 Py_DECREF(self);
4523 return NULL;
4524 }
4525 str2 = PyUnicode_FromObject(replobj);
4526 if (str2 == NULL) {
4527 Py_DECREF(self);
4528 Py_DECREF(str1);
4529 return NULL;
4530 }
4531 result = replace((PyUnicodeObject *)self,
4532 (PyUnicodeObject *)str1,
4533 (PyUnicodeObject *)str2,
4534 maxcount);
4535 Py_DECREF(self);
4536 Py_DECREF(str1);
4537 Py_DECREF(str2);
4538 return result;
4539}
4540
4541static char replace__doc__[] =
4542"S.replace (old, new[, maxsplit]) -> unicode\n\
4543\n\
4544Return a copy of S with all occurrences of substring\n\
4545old replaced by new. If the optional argument maxsplit is\n\
4546given, only the first maxsplit occurrences are replaced.";
4547
4548static PyObject*
4549unicode_replace(PyUnicodeObject *self, PyObject *args)
4550{
4551 PyUnicodeObject *str1;
4552 PyUnicodeObject *str2;
4553 int maxcount = -1;
4554 PyObject *result;
4555
4556 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4557 return NULL;
4558 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4559 if (str1 == NULL)
4560 return NULL;
4561 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4562 if (str2 == NULL)
4563 return NULL;
4564
4565 result = replace(self, str1, str2, maxcount);
4566
4567 Py_DECREF(str1);
4568 Py_DECREF(str2);
4569 return result;
4570}
4571
4572static
4573PyObject *unicode_repr(PyObject *unicode)
4574{
4575 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4576 PyUnicode_GET_SIZE(unicode),
4577 1);
4578}
4579
/* Docstring for S.rfind().  The searched range is written in slice
   notation, S[start:end], matching the wording used by count__doc__. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4588
4589static PyObject *
4590unicode_rfind(PyUnicodeObject *self, PyObject *args)
4591{
4592 PyUnicodeObject *substring;
4593 int start = 0;
4594 int end = INT_MAX;
4595 PyObject *result;
4596
Guido van Rossumb8872e62000-05-09 14:14:27 +00004597 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4598 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 return NULL;
4600 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4601 (PyObject *)substring);
4602 if (substring == NULL)
4603 return NULL;
4604
4605 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4606
4607 Py_DECREF(substring);
4608 return result;
4609}
4610
4611static char rindex__doc__[] =
4612"S.rindex(sub [,start [,end]]) -> int\n\
4613\n\
4614Like S.rfind() but raise ValueError when the substring is not found.";
4615
4616static PyObject *
4617unicode_rindex(PyUnicodeObject *self, PyObject *args)
4618{
4619 int result;
4620 PyUnicodeObject *substring;
4621 int start = 0;
4622 int end = INT_MAX;
4623
Guido van Rossumb8872e62000-05-09 14:14:27 +00004624 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4625 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 return NULL;
4627 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4628 (PyObject *)substring);
4629 if (substring == NULL)
4630 return NULL;
4631
4632 result = findstring(self, substring, start, end, -1);
4633
4634 Py_DECREF(substring);
4635 if (result < 0) {
4636 PyErr_SetString(PyExc_ValueError, "substring not found");
4637 return NULL;
4638 }
4639 return PyInt_FromLong(result);
4640}
4641
4642static char rjust__doc__[] =
4643"S.rjust(width) -> unicode\n\
4644\n\
4645Return S right justified in a Unicode string of length width. Padding is\n\
4646done using spaces.";
4647
4648static PyObject *
4649unicode_rjust(PyUnicodeObject *self, PyObject *args)
4650{
4651 int width;
4652 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4653 return NULL;
4654
Tim Peters7a29bd52001-09-12 03:03:31 +00004655 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 Py_INCREF(self);
4657 return (PyObject*) self;
4658 }
4659
4660 return (PyObject*) pad(self, width - self->length, 0, ' ');
4661}
4662
4663static char rstrip__doc__[] =
4664"S.rstrip() -> unicode\n\
4665\n\
4666Return a copy of the string S with trailing whitespace removed.";
4667
4668static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004669unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 return strip(self, 0, 1);
4672}
4673
4674static PyObject*
4675unicode_slice(PyUnicodeObject *self, int start, int end)
4676{
4677 /* standard clamping */
4678 if (start < 0)
4679 start = 0;
4680 if (end < 0)
4681 end = 0;
4682 if (end > self->length)
4683 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004684 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 /* full slice, return original string */
4686 Py_INCREF(self);
4687 return (PyObject*) self;
4688 }
4689 if (start > end)
4690 start = end;
4691 /* copy slice */
4692 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4693 end - start);
4694}
4695
4696PyObject *PyUnicode_Split(PyObject *s,
4697 PyObject *sep,
4698 int maxsplit)
4699{
4700 PyObject *result;
4701
4702 s = PyUnicode_FromObject(s);
4703 if (s == NULL)
4704 return NULL;
4705 if (sep != NULL) {
4706 sep = PyUnicode_FromObject(sep);
4707 if (sep == NULL) {
4708 Py_DECREF(s);
4709 return NULL;
4710 }
4711 }
4712
4713 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4714
4715 Py_DECREF(s);
4716 Py_XDECREF(sep);
4717 return result;
4718}
4719
4720static char split__doc__[] =
4721"S.split([sep [,maxsplit]]) -> list of strings\n\
4722\n\
4723Return a list of the words in S, using sep as the\n\
4724delimiter string. If maxsplit is given, at most maxsplit\n\
4725splits are done. If sep is not specified, any whitespace string\n\
4726is a separator.";
4727
4728static PyObject*
4729unicode_split(PyUnicodeObject *self, PyObject *args)
4730{
4731 PyObject *substring = Py_None;
4732 int maxcount = -1;
4733
4734 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4735 return NULL;
4736
4737 if (substring == Py_None)
4738 return split(self, NULL, maxcount);
4739 else if (PyUnicode_Check(substring))
4740 return split(self, (PyUnicodeObject *)substring, maxcount);
4741 else
4742 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4743}
4744
/* Docstring for S.splitlines().  The signature line uses balanced
   brackets around the single optional argument. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751
4752static PyObject*
4753unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4754{
Guido van Rossum86662912000-04-11 15:38:46 +00004755 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756
Guido van Rossum86662912000-04-11 15:38:46 +00004757 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 return NULL;
4759
Guido van Rossum86662912000-04-11 15:38:46 +00004760 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761}
4762
4763static
4764PyObject *unicode_str(PyUnicodeObject *self)
4765{
Fred Drakee4315f52000-05-09 19:53:39 +00004766 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767}
4768
4769static char strip__doc__[] =
4770"S.strip() -> unicode\n\
4771\n\
4772Return a copy of S with leading and trailing whitespace removed.";
4773
4774static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004775unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 return strip(self, 1, 1);
4778}
4779
4780static char swapcase__doc__[] =
4781"S.swapcase() -> unicode\n\
4782\n\
4783Return a copy of S with uppercase characters converted to lowercase\n\
4784and vice versa.";
4785
4786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004787unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 return fixup(self, fixswapcase);
4790}
4791
4792static char translate__doc__[] =
4793"S.translate(table) -> unicode\n\
4794\n\
4795Return a copy of the string S, where all characters have been mapped\n\
4796through the given translation table, which must be a mapping of\n\
4797Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4798are left untouched. Characters mapped to None are deleted.";
4799
4800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004801unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 return PyUnicode_TranslateCharmap(self->str,
4804 self->length,
4805 table,
4806 "ignore");
4807}
4808
4809static char upper__doc__[] =
4810"S.upper() -> unicode\n\
4811\n\
4812Return a copy of S converted to uppercase.";
4813
4814static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004815unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 return fixup(self, fixupper);
4818}
4819
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) (currently compiled out).  Pads self on the
   left with '0' up to width; a leading '+' or '-' is moved in front of
   the padding.  A string already at least width long is returned
   unchanged with a new reference.  Returns NULL on argument errors or
   when pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad() can fail; its result must not be dereferenced then. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4855
#if 0
/* Helper (currently compiled out): return the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
4863
/* Docstring registered for the "startswith" entry of unicode_methods. */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4870
4871static PyObject *
4872unicode_startswith(PyUnicodeObject *self,
4873 PyObject *args)
4874{
4875 PyUnicodeObject *substring;
4876 int start = 0;
4877 int end = INT_MAX;
4878 PyObject *result;
4879
Guido van Rossumb8872e62000-05-09 14:14:27 +00004880 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4881 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 return NULL;
4883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4884 (PyObject *)substring);
4885 if (substring == NULL)
4886 return NULL;
4887
4888 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4889
4890 Py_DECREF(substring);
4891 return result;
4892}
4893
4894
/* Docstring registered for the "endswith" entry of unicode_methods. */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4901
4902static PyObject *
4903unicode_endswith(PyUnicodeObject *self,
4904 PyObject *args)
4905{
4906 PyUnicodeObject *substring;
4907 int start = 0;
4908 int end = INT_MAX;
4909 PyObject *result;
4910
Guido van Rossumb8872e62000-05-09 14:14:27 +00004911 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4912 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 return NULL;
4914 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4915 (PyObject *)substring);
4916 if (substring == NULL)
4917 return NULL;
4918
4919 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4920
4921 Py_DECREF(substring);
4922 return result;
4923}
4924
4925
4926static PyMethodDef unicode_methods[] = {
4927
4928 /* Order is according to common usage: often used methods should
4929 appear first, since lookup is done sequentially. */
4930
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004931 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4932 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4933 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4934 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4935 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4936 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4937 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4938 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4939 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4940 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4941 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4942 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4943 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4944 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4945/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4946 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4947 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4948 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4949 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4950 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4951 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4952 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4953 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4954 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4955 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4956 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4957 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4958 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4959 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4960 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4961 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4962 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4963 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4964 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4965 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004967 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4968 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969#endif
4970
4971#if 0
4972 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004973 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974#endif
4975
4976 {NULL, NULL}
4977};
4978
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979static PySequenceMethods unicode_as_sequence = {
4980 (inquiry) unicode_length, /* sq_length */
4981 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4982 (intargfunc) unicode_repeat, /* sq_repeat */
4983 (intargfunc) unicode_getitem, /* sq_item */
4984 (intintargfunc) unicode_slice, /* sq_slice */
4985 0, /* sq_ass_item */
4986 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004987 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988};
4989
4990static int
4991unicode_buffer_getreadbuf(PyUnicodeObject *self,
4992 int index,
4993 const void **ptr)
4994{
4995 if (index != 0) {
4996 PyErr_SetString(PyExc_SystemError,
4997 "accessing non-existent unicode segment");
4998 return -1;
4999 }
5000 *ptr = (void *) self->str;
5001 return PyUnicode_GET_DATA_SIZE(self);
5002}
5003
5004static int
5005unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5006 const void **ptr)
5007{
5008 PyErr_SetString(PyExc_TypeError,
5009 "cannot use unicode as modifyable buffer");
5010 return -1;
5011}
5012
5013static int
5014unicode_buffer_getsegcount(PyUnicodeObject *self,
5015 int *lenp)
5016{
5017 if (lenp)
5018 *lenp = PyUnicode_GET_DATA_SIZE(self);
5019 return 1;
5020}
5021
5022static int
5023unicode_buffer_getcharbuf(PyUnicodeObject *self,
5024 int index,
5025 const void **ptr)
5026{
5027 PyObject *str;
5028
5029 if (index != 0) {
5030 PyErr_SetString(PyExc_SystemError,
5031 "accessing non-existent unicode segment");
5032 return -1;
5033 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005034 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 if (str == NULL)
5036 return -1;
5037 *ptr = (void *) PyString_AS_STRING(str);
5038 return PyString_GET_SIZE(str);
5039}
5040
5041/* Helpers for PyUnicode_Format() */
5042
5043static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005044getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045{
5046 int argidx = *p_argidx;
5047 if (argidx < arglen) {
5048 (*p_argidx)++;
5049 if (arglen < 0)
5050 return args;
5051 else
5052 return PyTuple_GetItem(args, argidx);
5053 }
5054 PyErr_SetString(PyExc_TypeError,
5055 "not enough arguments for format string");
5056 return NULL;
5057}
5058
/* Conversion-flag bits OR-ed together by PyUnicode_Format() while it
   parses a specifier; the comment names the character that sets it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5064
5065static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067{
5068 register int i;
5069 int len;
5070 va_list va;
5071 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073
5074 /* First, format the string as char array, then expand to Py_UNICODE
5075 array. */
5076 charbuffer = (char *)buffer;
5077 len = vsprintf(charbuffer, format, va);
5078 for (i = len - 1; i >= 0; i--)
5079 buffer[i] = (Py_UNICODE) charbuffer[i];
5080
5081 va_end(va);
5082 return len;
5083}
5084
5085static int
5086formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005087 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 int flags,
5089 int prec,
5090 int type,
5091 PyObject *v)
5092{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005093 /* fmt = '%#.' + `prec` + `type`
5094 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 char fmt[20];
5096 double x;
5097
5098 x = PyFloat_AsDouble(v);
5099 if (x == -1.0 && PyErr_Occurred())
5100 return -1;
5101 if (prec < 0)
5102 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5104 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005105 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5106 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005107 /* worst case length calc to ensure no buffer overrun:
5108 fmt = %#.<prec>g
5109 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5110 for any double rep.)
5111 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5112 If prec=0 the effective precision is 1 (the leading digit is
5113 always given), therefore increase by one to 10+prec. */
5114 if (buflen <= (size_t)10 + (size_t)prec) {
5115 PyErr_SetString(PyExc_OverflowError,
5116 "formatted float is too long (precision too long?)");
5117 return -1;
5118 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 return usprintf(buf, fmt, x);
5120}
5121
Tim Peters38fd5b62000-09-21 05:43:11 +00005122static PyObject*
5123formatlong(PyObject *val, int flags, int prec, int type)
5124{
5125 char *buf;
5126 int i, len;
5127 PyObject *str; /* temporary string object. */
5128 PyUnicodeObject *result;
5129
5130 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5131 if (!str)
5132 return NULL;
5133 result = _PyUnicode_New(len);
5134 for (i = 0; i < len; i++)
5135 result->str[i] = buf[i];
5136 result->str[len] = 0;
5137 Py_DECREF(str);
5138 return (PyObject*)result;
5139}
5140
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141static int
5142formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005143 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 int flags,
5145 int prec,
5146 int type,
5147 PyObject *v)
5148{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005149 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005150 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5151 * + 1 + 1
5152 * = 24
5153 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005154 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 long x;
5156
5157 x = PyInt_AsLong(v);
5158 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005159 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005161 prec = 1;
5162
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005163 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005164 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5165 */
5166 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005167 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005168 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005169 return -1;
5170 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005171
5172 if ((flags & F_ALT) &&
5173 (type == 'x' || type == 'X')) {
5174 /* When converting under %#x or %#X, there are a number
5175 * of issues that cause pain:
5176 * - when 0 is being converted, the C standard leaves off
5177 * the '0x' or '0X', which is inconsistent with other
5178 * %#x/%#X conversions and inconsistent with Python's
5179 * hex() function
5180 * - there are platforms that violate the standard and
5181 * convert 0 with the '0x' or '0X'
5182 * (Metrowerks, Compaq Tru64)
5183 * - there are platforms that give '0x' when converting
5184 * under %#X, but convert 0 in accordance with the
5185 * standard (OS/2 EMX)
5186 *
5187 * We can achieve the desired consistency by inserting our
5188 * own '0x' or '0X' prefix, and substituting %x/%X in place
5189 * of %#x/%#X.
5190 *
5191 * Note that this is the same approach as used in
5192 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005193 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005194 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5195 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005196 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005197 else {
5198 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5199 (flags&F_ALT) ? "#" : "",
5200 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 return usprintf(buf, fmt, x);
5203}
5204
5205static int
5206formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005207 size_t buflen,
5208 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005210 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005211 if (PyUnicode_Check(v)) {
5212 if (PyUnicode_GET_SIZE(v) != 1)
5213 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005217 else if (PyString_Check(v)) {
5218 if (PyString_GET_SIZE(v) != 1)
5219 goto onError;
5220 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5221 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
5223 else {
5224 /* Integer input truncated to a character */
5225 long x;
5226 x = PyInt_AsLong(v);
5227 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 buf[0] = (char) x;
5230 }
5231 buf[1] = '\0';
5232 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005233
5234 onError:
5235 PyErr_SetString(PyExc_TypeError,
5236 "%c requires int or char");
5237 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238}
5239
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. after a unary operator
   such as sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
5249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250PyObject *PyUnicode_Format(PyObject *format,
5251 PyObject *args)
5252{
5253 Py_UNICODE *fmt, *res;
5254 int fmtcnt, rescnt, reslen, arglen, argidx;
5255 int args_owned = 0;
5256 PyUnicodeObject *result = NULL;
5257 PyObject *dict = NULL;
5258 PyObject *uformat;
5259
5260 if (format == NULL || args == NULL) {
5261 PyErr_BadInternalCall();
5262 return NULL;
5263 }
5264 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005265 if (uformat == NULL)
5266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 fmt = PyUnicode_AS_UNICODE(uformat);
5268 fmtcnt = PyUnicode_GET_SIZE(uformat);
5269
5270 reslen = rescnt = fmtcnt + 100;
5271 result = _PyUnicode_New(reslen);
5272 if (result == NULL)
5273 goto onError;
5274 res = PyUnicode_AS_UNICODE(result);
5275
5276 if (PyTuple_Check(args)) {
5277 arglen = PyTuple_Size(args);
5278 argidx = 0;
5279 }
5280 else {
5281 arglen = -1;
5282 argidx = -2;
5283 }
5284 if (args->ob_type->tp_as_mapping)
5285 dict = args;
5286
5287 while (--fmtcnt >= 0) {
5288 if (*fmt != '%') {
5289 if (--rescnt < 0) {
5290 rescnt = fmtcnt + 100;
5291 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005292 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 return NULL;
5294 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5295 --rescnt;
5296 }
5297 *res++ = *fmt++;
5298 }
5299 else {
5300 /* Got a format specifier */
5301 int flags = 0;
5302 int width = -1;
5303 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 Py_UNICODE c = '\0';
5305 Py_UNICODE fill;
5306 PyObject *v = NULL;
5307 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005308 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 Py_UNICODE sign;
5310 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005311 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312
5313 fmt++;
5314 if (*fmt == '(') {
5315 Py_UNICODE *keystart;
5316 int keylen;
5317 PyObject *key;
5318 int pcount = 1;
5319
5320 if (dict == NULL) {
5321 PyErr_SetString(PyExc_TypeError,
5322 "format requires a mapping");
5323 goto onError;
5324 }
5325 ++fmt;
5326 --fmtcnt;
5327 keystart = fmt;
5328 /* Skip over balanced parentheses */
5329 while (pcount > 0 && --fmtcnt >= 0) {
5330 if (*fmt == ')')
5331 --pcount;
5332 else if (*fmt == '(')
5333 ++pcount;
5334 fmt++;
5335 }
5336 keylen = fmt - keystart - 1;
5337 if (fmtcnt < 0 || pcount > 0) {
5338 PyErr_SetString(PyExc_ValueError,
5339 "incomplete format key");
5340 goto onError;
5341 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005342#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005343 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 then looked up since Python uses strings to hold
5345 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005346 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 key = PyUnicode_EncodeUTF8(keystart,
5348 keylen,
5349 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005350#else
5351 key = PyUnicode_FromUnicode(keystart, keylen);
5352#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 if (key == NULL)
5354 goto onError;
5355 if (args_owned) {
5356 Py_DECREF(args);
5357 args_owned = 0;
5358 }
5359 args = PyObject_GetItem(dict, key);
5360 Py_DECREF(key);
5361 if (args == NULL) {
5362 goto onError;
5363 }
5364 args_owned = 1;
5365 arglen = -1;
5366 argidx = -2;
5367 }
5368 while (--fmtcnt >= 0) {
5369 switch (c = *fmt++) {
5370 case '-': flags |= F_LJUST; continue;
5371 case '+': flags |= F_SIGN; continue;
5372 case ' ': flags |= F_BLANK; continue;
5373 case '#': flags |= F_ALT; continue;
5374 case '0': flags |= F_ZERO; continue;
5375 }
5376 break;
5377 }
5378 if (c == '*') {
5379 v = getnextarg(args, arglen, &argidx);
5380 if (v == NULL)
5381 goto onError;
5382 if (!PyInt_Check(v)) {
5383 PyErr_SetString(PyExc_TypeError,
5384 "* wants int");
5385 goto onError;
5386 }
5387 width = PyInt_AsLong(v);
5388 if (width < 0) {
5389 flags |= F_LJUST;
5390 width = -width;
5391 }
5392 if (--fmtcnt >= 0)
5393 c = *fmt++;
5394 }
5395 else if (c >= '0' && c <= '9') {
5396 width = c - '0';
5397 while (--fmtcnt >= 0) {
5398 c = *fmt++;
5399 if (c < '0' || c > '9')
5400 break;
5401 if ((width*10) / 10 != width) {
5402 PyErr_SetString(PyExc_ValueError,
5403 "width too big");
5404 goto onError;
5405 }
5406 width = width*10 + (c - '0');
5407 }
5408 }
5409 if (c == '.') {
5410 prec = 0;
5411 if (--fmtcnt >= 0)
5412 c = *fmt++;
5413 if (c == '*') {
5414 v = getnextarg(args, arglen, &argidx);
5415 if (v == NULL)
5416 goto onError;
5417 if (!PyInt_Check(v)) {
5418 PyErr_SetString(PyExc_TypeError,
5419 "* wants int");
5420 goto onError;
5421 }
5422 prec = PyInt_AsLong(v);
5423 if (prec < 0)
5424 prec = 0;
5425 if (--fmtcnt >= 0)
5426 c = *fmt++;
5427 }
5428 else if (c >= '0' && c <= '9') {
5429 prec = c - '0';
5430 while (--fmtcnt >= 0) {
5431 c = Py_CHARMASK(*fmt++);
5432 if (c < '0' || c > '9')
5433 break;
5434 if ((prec*10) / 10 != prec) {
5435 PyErr_SetString(PyExc_ValueError,
5436 "prec too big");
5437 goto onError;
5438 }
5439 prec = prec*10 + (c - '0');
5440 }
5441 }
5442 } /* prec */
5443 if (fmtcnt >= 0) {
5444 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 if (--fmtcnt >= 0)
5446 c = *fmt++;
5447 }
5448 }
5449 if (fmtcnt < 0) {
5450 PyErr_SetString(PyExc_ValueError,
5451 "incomplete format");
5452 goto onError;
5453 }
5454 if (c != '%') {
5455 v = getnextarg(args, arglen, &argidx);
5456 if (v == NULL)
5457 goto onError;
5458 }
5459 sign = 0;
5460 fill = ' ';
5461 switch (c) {
5462
5463 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005464 pbuf = formatbuf;
5465 /* presume that buffer length is at least 1 */
5466 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 len = 1;
5468 break;
5469
5470 case 's':
5471 case 'r':
5472 if (PyUnicode_Check(v) && c == 's') {
5473 temp = v;
5474 Py_INCREF(temp);
5475 }
5476 else {
5477 PyObject *unicode;
5478 if (c == 's')
5479 temp = PyObject_Str(v);
5480 else
5481 temp = PyObject_Repr(v);
5482 if (temp == NULL)
5483 goto onError;
5484 if (!PyString_Check(temp)) {
5485 /* XXX Note: this should never happen, since
5486 PyObject_Repr() and PyObject_Str() assure
5487 this */
5488 Py_DECREF(temp);
5489 PyErr_SetString(PyExc_TypeError,
5490 "%s argument has non-string str()");
5491 goto onError;
5492 }
Fred Drakee4315f52000-05-09 19:53:39 +00005493 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005495 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 "strict");
5497 Py_DECREF(temp);
5498 temp = unicode;
5499 if (temp == NULL)
5500 goto onError;
5501 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005502 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 len = PyUnicode_GET_SIZE(temp);
5504 if (prec >= 0 && len > prec)
5505 len = prec;
5506 break;
5507
5508 case 'i':
5509 case 'd':
5510 case 'u':
5511 case 'o':
5512 case 'x':
5513 case 'X':
5514 if (c == 'i')
5515 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005516 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005517 temp = formatlong(v, flags, prec, c);
5518 if (!temp)
5519 goto onError;
5520 pbuf = PyUnicode_AS_UNICODE(temp);
5521 len = PyUnicode_GET_SIZE(temp);
5522 /* unbounded ints can always produce
5523 a sign character! */
5524 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005526 else {
5527 pbuf = formatbuf;
5528 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5529 flags, prec, c, v);
5530 if (len < 0)
5531 goto onError;
5532 /* only d conversion is signed */
5533 sign = c == 'd';
5534 }
5535 if (flags & F_ZERO)
5536 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 break;
5538
5539 case 'e':
5540 case 'E':
5541 case 'f':
5542 case 'g':
5543 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005544 pbuf = formatbuf;
5545 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5546 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 if (len < 0)
5548 goto onError;
5549 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005550 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 fill = '0';
5552 break;
5553
5554 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005555 pbuf = formatbuf;
5556 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 if (len < 0)
5558 goto onError;
5559 break;
5560
5561 default:
5562 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005563 "unsupported format character '%c' (0x%x) "
5564 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005565 (31<=c && c<=126) ? c : '?',
5566 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 goto onError;
5568 }
5569 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005570 if (*pbuf == '-' || *pbuf == '+') {
5571 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 len--;
5573 }
5574 else if (flags & F_SIGN)
5575 sign = '+';
5576 else if (flags & F_BLANK)
5577 sign = ' ';
5578 else
5579 sign = 0;
5580 }
5581 if (width < len)
5582 width = len;
5583 if (rescnt < width + (sign != 0)) {
5584 reslen -= rescnt;
5585 rescnt = width + fmtcnt + 100;
5586 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005587 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 return NULL;
5589 res = PyUnicode_AS_UNICODE(result)
5590 + reslen - rescnt;
5591 }
5592 if (sign) {
5593 if (fill != ' ')
5594 *res++ = sign;
5595 rescnt--;
5596 if (width > len)
5597 width--;
5598 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005599 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5600 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005601 assert(pbuf[1] == c);
5602 if (fill != ' ') {
5603 *res++ = *pbuf++;
5604 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005605 }
Tim Petersfff53252001-04-12 18:38:48 +00005606 rescnt -= 2;
5607 width -= 2;
5608 if (width < 0)
5609 width = 0;
5610 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 if (width > len && !(flags & F_LJUST)) {
5613 do {
5614 --rescnt;
5615 *res++ = fill;
5616 } while (--width > len);
5617 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005618 if (fill == ' ') {
5619 if (sign)
5620 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005621 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005622 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005623 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005624 *res++ = *pbuf++;
5625 *res++ = *pbuf++;
5626 }
5627 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005628 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 res += len;
5630 rescnt -= len;
5631 while (--width >= len) {
5632 --rescnt;
5633 *res++ = ' ';
5634 }
5635 if (dict && (argidx < arglen) && c != '%') {
5636 PyErr_SetString(PyExc_TypeError,
5637 "not all arguments converted");
5638 goto onError;
5639 }
5640 Py_XDECREF(temp);
5641 } /* '%' */
5642 } /* until end */
5643 if (argidx < arglen && !dict) {
5644 PyErr_SetString(PyExc_TypeError,
5645 "not all arguments converted");
5646 goto onError;
5647 }
5648
5649 if (args_owned) {
5650 Py_DECREF(args);
5651 }
5652 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005653 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 return (PyObject *)result;
5656
5657 onError:
5658 Py_XDECREF(result);
5659 Py_DECREF(uformat);
5660 if (args_owned) {
5661 Py_DECREF(args);
5662 }
5663 return NULL;
5664}
5665
5666static PyBufferProcs unicode_as_buffer = {
5667 (getreadbufferproc) unicode_buffer_getreadbuf,
5668 (getwritebufferproc) unicode_buffer_getwritebuf,
5669 (getsegcountproc) unicode_buffer_getsegcount,
5670 (getcharbufferproc) unicode_buffer_getcharbuf,
5671};
5672
Guido van Rossume023fe02001-08-30 03:12:59 +00005673staticforward PyObject *
5674unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5675
Tim Peters6d6c1a32001-08-02 04:15:00 +00005676static PyObject *
5677unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5678{
5679 PyObject *x = NULL;
5680 static char *kwlist[] = {"string", "encoding", "errors", 0};
5681 char *encoding = NULL;
5682 char *errors = NULL;
5683
Guido van Rossume023fe02001-08-30 03:12:59 +00005684 if (type != &PyUnicode_Type)
5685 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005686 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5687 kwlist, &x, &encoding, &errors))
5688 return NULL;
5689 if (x == NULL)
5690 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005691 if (encoding == NULL && errors == NULL)
5692 return PyObject_Unicode(x);
5693 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005694 return PyUnicode_FromEncodedObject(x, encoding, errors);
5695}
5696
Guido van Rossume023fe02001-08-30 03:12:59 +00005697static PyObject *
5698unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5699{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005700 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005701 int n;
5702
5703 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5704 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5705 if (tmp == NULL)
5706 return NULL;
5707 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005708 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5709 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005710 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005711 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5712 if (pnew->str == NULL) {
5713 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauerdcc819a2002-03-22 15:33:15 +00005714 PyMalloc_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005715 return NULL;
5716 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005717 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5718 pnew->length = n;
5719 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005720 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005721 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005722}
5723
/* Docstring of the unicode type (installed in the tp_doc slot). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5730
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731PyTypeObject PyUnicode_Type = {
5732 PyObject_HEAD_INIT(&PyType_Type)
5733 0, /* ob_size */
5734 "unicode", /* tp_name */
5735 sizeof(PyUnicodeObject), /* tp_size */
5736 0, /* tp_itemsize */
5737 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005738 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005740 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 0, /* tp_setattr */
5742 (cmpfunc) unicode_compare, /* tp_compare */
5743 (reprfunc) unicode_repr, /* tp_repr */
5744 0, /* tp_as_number */
5745 &unicode_as_sequence, /* tp_as_sequence */
5746 0, /* tp_as_mapping */
5747 (hashfunc) unicode_hash, /* tp_hash*/
5748 0, /* tp_call*/
5749 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005750 PyObject_GenericGetAttr, /* tp_getattro */
5751 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005753 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005754 unicode_doc, /* tp_doc */
5755 0, /* tp_traverse */
5756 0, /* tp_clear */
5757 0, /* tp_richcompare */
5758 0, /* tp_weaklistoffset */
5759 0, /* tp_iter */
5760 0, /* tp_iternext */
5761 unicode_methods, /* tp_methods */
5762 0, /* tp_members */
5763 0, /* tp_getset */
5764 0, /* tp_base */
5765 0, /* tp_dict */
5766 0, /* tp_descr_get */
5767 0, /* tp_descr_set */
5768 0, /* tp_dictoffset */
5769 0, /* tp_init */
5770 0, /* tp_alloc */
5771 unicode_new, /* tp_new */
Neil Schemenauerdcc819a2002-03-22 15:33:15 +00005772 _PyMalloc_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773};
5774
5775/* Initialize the Unicode implementation */
5776
Thomas Wouters78890102000-07-22 19:25:51 +00005777void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005779 int i;
5780
Fred Drakee4315f52000-05-09 19:53:39 +00005781 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005782 unicode_freelist = NULL;
5783 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005785 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005786 for (i = 0; i < 256; i++)
5787 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788}
5789
5790/* Finalize the Unicode implementation */
5791
5792void
Thomas Wouters78890102000-07-22 19:25:51 +00005793_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005795 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005796 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005798 Py_XDECREF(unicode_empty);
5799 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005800
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005801 for (i = 0; i < 256; i++) {
5802 if (unicode_latin1[i]) {
5803 Py_DECREF(unicode_latin1[i]);
5804 unicode_latin1[i] = NULL;
5805 }
5806 }
5807
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005808 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 PyUnicodeObject *v = u;
5810 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005811 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005812 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005813 Py_XDECREF(v->defenc);
Neil Schemenauerdcc819a2002-03-22 15:33:15 +00005814 PyMalloc_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005816 unicode_freelist = NULL;
5817 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818}