blob: a67caa30262034eb351862e516153ceb978e0b9b [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Size threshold for the free list "stay alive" optimization.

   Objects put on the free list whose length is below this limit keep
   their character buffer allocated.  This cuts down on malloc()
   overhead for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   i.e. whether a character is special and cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive character codes. */
static
char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* True if character c has to be encoded in a shift sequence: it is
   above 127, or always special (class 1), or whitespace (class 2)
   with encodeWS set, or a Set O character (class 3) with encodeO
   set. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 character.  The range test comes first
   because isalnum() is only defined for values representable as
   unsigned char (or EOF) and is locale dependent above 127; callers
   pass Py_UNICODE values, which can be much larger. */
#define B64CHAR(c) ((unsigned long)(c) < 128 && \
                    (isalnum(c) || (c) == '+' || (c) == '/'))

/* Map a base64 character back to its 6 bit value.  Only meaningful
   for characters accepted by B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to `out` for as long as at least 6 bits are
   pending in `ch`; `bits` is reduced accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract 16 bit characters from the bit buffer `ch` to `out` for as
   long as at least 16 bits are pending.  The expansion relies on a
   variable `errmsg` and a label `utf7Error` in the enclosing
   function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence
   it introduces (see RFC 2279 for details).  Zero marks a byte that is
   illegal as a prefix.  The table is only ever read, so it is const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six,
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001068 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1069 errmsg = "illegal encoding";
1070 goto utf8Error;
1071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001073 *p++ = (Py_UNICODE)ch;
1074 break;
1075
1076 case 4:
1077 if ((s[1] & 0xc0) != 0x80 ||
1078 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001079 (s[3] & 0xc0) != 0x80) {
1080 errmsg = "invalid data";
1081 goto utf8Error;
1082 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001083 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1084 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1085 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001086 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001087 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001088 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001089 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001091 errmsg = "illegal encoding";
1092 goto utf8Error;
1093 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001094#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 *p++ = (Py_UNICODE)ch;
1096#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001097 /* compute and append the two surrogates: */
1098
1099 /* translate from 10000..10FFFF to 0..FFFF */
1100 ch -= 0x10000;
1101
1102 /* high surrogate = top 10 bits added to D800 */
1103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1104
1105 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001107#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 break;
1109
1110 default:
1111 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001112 errmsg = "unsupported Unicode code range";
1113 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
1115 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001116 continue;
1117
1118 utf8Error:
1119 if (utf8_decoding_error(&s, &p, errors, errmsg))
1120 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122
1123 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001124 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 goto onError;
1126
1127 return (PyObject *)unicode;
1128
1129onError:
1130 Py_DECREF(unicode);
1131 return NULL;
1132}
1133
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error policy named by errors to a UTF-8 encoding problem.
   NULL or "strict" raise UnicodeError, "ignore" does nothing,
   "replace" stores '?' at *dest and advances it, and any other name
   raises ValueError.  Returns 0 to continue and -1 when an exception
   has been raised.  source is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        **dest = '?';
        *dest += 1;
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167
1168PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169 int size,
1170 const char *errors)
1171{
1172 PyObject *v;
1173 char *p;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001174 unsigned int cbAllocated = 2 * size;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001175 unsigned int cbWritten = 0;
1176 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001178 v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 if (v == NULL)
1180 return NULL;
1181 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001182 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001184 p = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001185 while (i < size) {
1186 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001187
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001188 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 cbWritten++;
1191 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 else if (ch < 0x0800) {
1194 *p++ = 0xc0 | (ch >> 6);
1195 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 cbWritten += 2;
1197 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001198
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001199 else {
1200
1201 /* Assure that we have enough room for high order Unicode
1202 ordinals */
1203 if (cbWritten >= cbAllocated) {
1204 cbAllocated += 4 * 10;
1205 if (_PyString_Resize(&v, cbAllocated + 4))
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001206 goto onError;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001207 p = PyString_AS_STRING(v) + cbWritten;
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001208 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001209
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001210 if (ch < 0x10000) {
1211 /* Check for high surrogate */
1212 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1213 Py_UCS4 ch2 = s[i];
1214 /* Check for low surrogate */
1215 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001217 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001218 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001219 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1220 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001221 i++;
1222 cbWritten += 4;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001223 continue;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001224 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001225 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001226 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001227 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001228 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1229 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001230 cbWritten += 3;
1231
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001232 } else {
1233 *p++ = 0xf0 | (ch>>18);
1234 *p++ = 0x80 | ((ch>>12) & 0x3f);
1235 *p++ = 0x80 | ((ch>>6) & 0x3f);
1236 *p++ = 0x80 | (ch & 0x3f);
1237 cbWritten += 4;
1238 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241 *p = '\0';
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001242 if (_PyString_Resize(&v, cbWritten))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 return v;
1245
1246 onError:
1247 Py_DECREF(v);
1248 return NULL;
1249}
1250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1252{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 if (!PyUnicode_Check(unicode)) {
1254 PyErr_BadArgument();
1255 return NULL;
1256 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001257 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1258 PyUnicode_GET_SIZE(unicode),
1259 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260}
1261
1262/* --- UTF-16 Codec ------------------------------------------------------- */
1263
1264static
Tim Peters772747b2001-08-09 22:21:55 +00001265int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266 const char *errors,
1267 const char *details)
1268{
1269 if ((errors == NULL) ||
1270 (strcmp(errors,"strict") == 0)) {
1271 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001272 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 details);
1274 return -1;
1275 }
1276 else if (strcmp(errors,"ignore") == 0) {
1277 return 0;
1278 }
1279 else if (strcmp(errors,"replace") == 0) {
1280 if (dest) {
1281 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1282 (*dest)++;
1283 }
1284 return 0;
1285 }
1286 else {
1287 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001288 "UTF-16 decoding error; "
1289 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290 errors);
1291 return -1;
1292 }
1293}
1294
Tim Peters772747b2001-08-09 22:21:55 +00001295PyObject *
1296PyUnicode_DecodeUTF16(const char *s,
1297 int size,
1298 const char *errors,
1299 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 PyUnicodeObject *unicode;
1302 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001303 const unsigned char *q, *e;
1304 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001306 /* Offsets from q for retrieving byte pairs in the right order. */
1307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1308 int ihi = 1, ilo = 0;
1309#else
1310 int ihi = 0, ilo = 1;
1311#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312
1313 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001314 if (size & 1) {
1315 if (utf16_decoding_error(NULL, errors, "truncated data"))
1316 return NULL;
1317 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 }
1319
1320 /* Note: size will always be longer than the resulting Unicode
1321 character count */
1322 unicode = _PyUnicode_New(size);
1323 if (!unicode)
1324 return NULL;
1325 if (size == 0)
1326 return (PyObject *)unicode;
1327
1328 /* Unpack UTF-16 encoded data */
1329 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001330 q = (unsigned char *)s;
1331 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332
1333 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001334 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001336 /* Check for BOM marks (U+FEFF) in the input and adjust current
1337 byte order setting accordingly. In native mode, the leading BOM
1338 mark is skipped, in all other modes, it is copied to the output
1339 stream as-is (giving a ZWNBSP character). */
1340 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001341 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001342#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001343 if (bom == 0xFEFF) {
1344 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001346 }
1347 else if (bom == 0xFFFE) {
1348 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001349 bo = 1;
1350 }
1351#else
Tim Peters772747b2001-08-09 22:21:55 +00001352 if (bom == 0xFEFF) {
1353 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001354 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001355 }
1356 else if (bom == 0xFFFE) {
1357 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001358 bo = -1;
1359 }
1360#endif
1361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362
Tim Peters772747b2001-08-09 22:21:55 +00001363 if (bo == -1) {
1364 /* force LE */
1365 ihi = 1;
1366 ilo = 0;
1367 }
1368 else if (bo == 1) {
1369 /* force BE */
1370 ihi = 0;
1371 ilo = 1;
1372 }
1373
1374 while (q < e) {
1375 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1376 q += 2;
1377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 if (ch < 0xD800 || ch > 0xDFFF) {
1379 *p++ = ch;
1380 continue;
1381 }
1382
1383 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001384 if (q >= e) {
1385 errmsg = "unexpected end of data";
1386 goto utf16Error;
1387 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001388 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001389 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1390 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001391 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001392#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001393 *p++ = ch;
1394 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001395#else
1396 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001398 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 }
1400 else {
1401 errmsg = "illegal UTF-16 surrogate";
1402 goto utf16Error;
1403 }
1404
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001406 errmsg = "illegal encoding";
1407 /* Fall through to report the error */
1408
1409 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001410 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001411 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
1413
1414 if (byteorder)
1415 *byteorder = bo;
1416
1417 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001418 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419 goto onError;
1420
1421 return (PyObject *)unicode;
1422
1423onError:
1424 Py_DECREF(unicode);
1425 return NULL;
1426}
1427
Tim Peters772747b2001-08-09 22:21:55 +00001428PyObject *
1429PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1430 int size,
1431 const char *errors,
1432 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433{
1434 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001435 unsigned char *p;
1436 int i, pairs;
1437 /* Offsets from p for storing byte pairs in the right order. */
1438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1439 int ihi = 1, ilo = 0;
1440#else
1441 int ihi = 0, ilo = 1;
1442#endif
1443
1444#define STORECHAR(CH) \
1445 do { \
1446 p[ihi] = ((CH) >> 8) & 0xff; \
1447 p[ilo] = (CH) & 0xff; \
1448 p += 2; \
1449 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001451 for (i = pairs = 0; i < size; i++)
1452 if (s[i] >= 0x10000)
1453 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001455 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 if (v == NULL)
1457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458
Tim Peters772747b2001-08-09 22:21:55 +00001459 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001460 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001461 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001462 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001463 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001464
1465 if (byteorder == -1) {
1466 /* force LE */
1467 ihi = 1;
1468 ilo = 0;
1469 }
1470 else if (byteorder == 1) {
1471 /* force BE */
1472 ihi = 0;
1473 ilo = 1;
1474 }
1475
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001476 while (size-- > 0) {
1477 Py_UNICODE ch = *s++;
1478 Py_UNICODE ch2 = 0;
1479 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001480 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1481 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 }
Tim Peters772747b2001-08-09 22:21:55 +00001483 STORECHAR(ch);
1484 if (ch2)
1485 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001488#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489}
1490
1491PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1492{
1493 if (!PyUnicode_Check(unicode)) {
1494 PyErr_BadArgument();
1495 return NULL;
1496 }
1497 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1498 PyUnicode_GET_SIZE(unicode),
1499 NULL,
1500 0);
1501}
1502
1503/* --- Unicode Escape Codec ----------------------------------------------- */
1504
1505static
1506int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001507 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508 const char *errors,
1509 const char *details)
1510{
1511 if ((errors == NULL) ||
1512 (strcmp(errors,"strict") == 0)) {
1513 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001514 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 details);
1516 return -1;
1517 }
1518 else if (strcmp(errors,"ignore") == 0) {
1519 return 0;
1520 }
1521 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001522 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 return 0;
1524 }
1525 else {
1526 PyErr_Format(PyExc_ValueError,
1527 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001528 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 errors);
1530 return -1;
1531 }
1532}
1533
Fredrik Lundh06d12682001-01-24 07:59:11 +00001534static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001535
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1537 int size,
1538 const char *errors)
1539{
1540 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001541 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001543 char* message;
1544 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1545
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546 /* Escaped strings will always be longer than the resulting
1547 Unicode string, so we start with size here and then reduce the
1548 length after conversion to the true value. */
1549 v = _PyUnicode_New(size);
1550 if (v == NULL)
1551 goto onError;
1552 if (size == 0)
1553 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 p = buf = PyUnicode_AS_UNICODE(v);
1556 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001557
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 while (s < end) {
1559 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001560 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001561 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562
1563 /* Non-escape characters are interpreted as Unicode ordinals */
1564 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001565 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 continue;
1567 }
1568
1569 /* \ - Escapes */
1570 s++;
1571 switch (*s++) {
1572
1573 /* \x escapes */
1574 case '\n': break;
1575 case '\\': *p++ = '\\'; break;
1576 case '\'': *p++ = '\''; break;
1577 case '\"': *p++ = '\"'; break;
1578 case 'b': *p++ = '\b'; break;
1579 case 'f': *p++ = '\014'; break; /* FF */
1580 case 't': *p++ = '\t'; break;
1581 case 'n': *p++ = '\n'; break;
1582 case 'r': *p++ = '\r'; break;
1583 case 'v': *p++ = '\013'; break; /* VT */
1584 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1585
1586 /* \OOO (octal) escapes */
1587 case '0': case '1': case '2': case '3':
1588 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001589 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001591 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001593 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001595 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 break;
1597
Fredrik Lundhccc74732001-02-18 22:13:49 +00001598 /* hex escapes */
1599 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001601 digits = 2;
1602 message = "truncated \\xXX escape";
1603 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604
Fredrik Lundhccc74732001-02-18 22:13:49 +00001605 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001607 digits = 4;
1608 message = "truncated \\uXXXX escape";
1609 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610
Fredrik Lundhccc74732001-02-18 22:13:49 +00001611 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001612 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001613 digits = 8;
1614 message = "truncated \\UXXXXXXXX escape";
1615 hexescape:
1616 chr = 0;
1617 for (i = 0; i < digits; i++) {
1618 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001619 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001620 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001621 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001622 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001623 i++;
1624 break;
1625 }
1626 chr = (chr<<4) & ~0xF;
1627 if (c >= '0' && c <= '9')
1628 chr += c - '0';
1629 else if (c >= 'a' && c <= 'f')
1630 chr += 10 + c - 'a';
1631 else
1632 chr += 10 + c - 'A';
1633 }
1634 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001635 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001636 /* when we get here, chr is a 32-bit unicode character */
1637 if (chr <= 0xffff)
1638 /* UCS-2 character */
1639 *p++ = (Py_UNICODE) chr;
1640 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001641 /* UCS-4 character. Either store directly, or as
1642 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001643#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 *p++ = chr;
1645#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001646 chr -= 0x10000L;
1647 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001648 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001650 } else {
1651 if (unicodeescape_decoding_error(
1652 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001653 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001654 )
1655 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001658 break;
1659
1660 /* \N{name} */
1661 case 'N':
1662 message = "malformed \\N character escape";
1663 if (ucnhash_CAPI == NULL) {
1664 /* load the unicode data module */
1665 PyObject *m, *v;
1666 m = PyImport_ImportModule("unicodedata");
1667 if (m == NULL)
1668 goto ucnhashError;
1669 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1670 Py_DECREF(m);
1671 if (v == NULL)
1672 goto ucnhashError;
1673 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1674 Py_DECREF(v);
1675 if (ucnhash_CAPI == NULL)
1676 goto ucnhashError;
1677 }
1678 if (*s == '{') {
1679 const char *start = s+1;
1680 /* look for the closing brace */
1681 while (*s != '}' && s < end)
1682 s++;
1683 if (s > start && s < end && *s == '}') {
1684 /* found a name. look it up in the unicode database */
1685 message = "unknown Unicode character name";
1686 s++;
1687 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1688 goto store;
1689 }
1690 }
1691 if (unicodeescape_decoding_error(&s, &x, errors, message))
1692 goto onError;
1693 *p++ = x;
1694 break;
1695
1696 default:
1697 *p++ = '\\';
1698 *p++ = (unsigned char)s[-1];
1699 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 }
1701 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001702 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001703 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 return (PyObject *)v;
1705
Fredrik Lundhccc74732001-02-18 22:13:49 +00001706ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001707 PyErr_SetString(
1708 PyExc_UnicodeError,
1709 "\\N escapes not supported (can't load unicodedata module)"
1710 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001711 return NULL;
1712
Fredrik Lundhccc74732001-02-18 22:13:49 +00001713onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 Py_XDECREF(v);
1715 return NULL;
1716}
1717
1718/* Return a Unicode-Escape string version of the Unicode object.
1719
1720 If quotes is true, the string is enclosed in u"" or u'' quotes as
1721 appropriate.
1722
1723*/
1724
Barry Warsaw51ac5802000-03-20 16:36:48 +00001725static const Py_UNICODE *findchar(const Py_UNICODE *s,
1726 int size,
1727 Py_UNICODE ch);
1728
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729static
1730PyObject *unicodeescape_string(const Py_UNICODE *s,
1731 int size,
1732 int quotes)
1733{
1734 PyObject *repr;
1735 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001737 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738
1739 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1740 if (repr == NULL)
1741 return NULL;
1742
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001743 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744
1745 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 *p++ = 'u';
1747 *p++ = (findchar(s, size, '\'') &&
1748 !findchar(s, size, '"')) ? '"' : '\'';
1749 }
1750 while (size-- > 0) {
1751 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001752
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001754 if (quotes &&
1755 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 *p++ = '\\';
1757 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001758 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001760
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001761#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001762 /* Map 21-bit characters to '\U00xxxxxx' */
1763 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001764 int offset = p - PyString_AS_STRING(repr);
1765
1766 /* Resize the string if necessary */
1767 if (offset + 12 > PyString_GET_SIZE(repr)) {
1768 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1769 goto onError;
1770 p = PyString_AS_STRING(repr) + offset;
1771 }
1772
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001773 *p++ = '\\';
1774 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001775 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1777 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1778 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1780 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1781 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001782 *p++ = hexdigit[ch & 0x0000000F];
1783 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001784 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001785#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001786 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1787 else if (ch >= 0xD800 && ch < 0xDC00) {
1788 Py_UNICODE ch2;
1789 Py_UCS4 ucs;
1790
1791 ch2 = *s++;
1792 size--;
1793 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1794 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1795 *p++ = '\\';
1796 *p++ = 'U';
1797 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1799 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1800 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1802 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1803 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1804 *p++ = hexdigit[ucs & 0x0000000F];
1805 continue;
1806 }
1807 /* Fall through: isolated surrogates are copied as-is */
1808 s--;
1809 size++;
1810 }
1811
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001813 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 *p++ = '\\';
1815 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001816 *p++ = hexdigit[(ch >> 12) & 0x000F];
1817 *p++ = hexdigit[(ch >> 8) & 0x000F];
1818 *p++ = hexdigit[(ch >> 4) & 0x000F];
1819 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001820 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001821
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001822 /* Map special whitespace to '\t', \n', '\r' */
1823 else if (ch == '\t') {
1824 *p++ = '\\';
1825 *p++ = 't';
1826 }
1827 else if (ch == '\n') {
1828 *p++ = '\\';
1829 *p++ = 'n';
1830 }
1831 else if (ch == '\r') {
1832 *p++ = '\\';
1833 *p++ = 'r';
1834 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001835
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001836 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001837 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001839 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001840 *p++ = hexdigit[(ch >> 4) & 0x000F];
1841 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001843
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 /* Copy everything else as-is */
1845 else
1846 *p++ = (char) ch;
1847 }
1848 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001849 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850
1851 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001852 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
1855 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001856
1857 onError:
1858 Py_DECREF(repr);
1859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860}
1861
1862PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1863 int size)
1864{
1865 return unicodeescape_string(s, size, 0);
1866}
1867
1868PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1869{
1870 if (!PyUnicode_Check(unicode)) {
1871 PyErr_BadArgument();
1872 return NULL;
1873 }
1874 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1875 PyUnicode_GET_SIZE(unicode));
1876}
1877
1878/* --- Raw Unicode Escape Codec ------------------------------------------- */
1879
1880PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1881 int size,
1882 const char *errors)
1883{
1884 PyUnicodeObject *v;
1885 Py_UNICODE *p, *buf;
1886 const char *end;
1887 const char *bs;
1888
1889 /* Escaped strings will always be longer than the resulting
1890 Unicode string, so we start with size here and then reduce the
1891 length after conversion to the true value. */
1892 v = _PyUnicode_New(size);
1893 if (v == NULL)
1894 goto onError;
1895 if (size == 0)
1896 return (PyObject *)v;
1897 p = buf = PyUnicode_AS_UNICODE(v);
1898 end = s + size;
1899 while (s < end) {
1900 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001901 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 int i;
1903
1904 /* Non-escape characters are interpreted as Unicode ordinals */
1905 if (*s != '\\') {
1906 *p++ = (unsigned char)*s++;
1907 continue;
1908 }
1909
1910 /* \u-escapes are only interpreted iff the number of leading
1911 backslashes if odd */
1912 bs = s;
1913 for (;s < end;) {
1914 if (*s != '\\')
1915 break;
1916 *p++ = (unsigned char)*s++;
1917 }
1918 if (((s - bs) & 1) == 0 ||
1919 s >= end ||
1920 *s != 'u') {
1921 continue;
1922 }
1923 p--;
1924 s++;
1925
1926 /* \uXXXX with 4 hex digits */
1927 for (x = 0, i = 0; i < 4; i++) {
1928 c = (unsigned char)s[i];
1929 if (!isxdigit(c)) {
1930 if (unicodeescape_decoding_error(&s, &x, errors,
1931 "truncated \\uXXXX"))
1932 goto onError;
1933 i++;
1934 break;
1935 }
1936 x = (x<<4) & ~0xF;
1937 if (c >= '0' && c <= '9')
1938 x += c - '0';
1939 else if (c >= 'a' && c <= 'f')
1940 x += 10 + c - 'a';
1941 else
1942 x += 10 + c - 'A';
1943 }
1944 s += i;
1945 *p++ = x;
1946 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001947 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 return (PyObject *)v;
1950
1951 onError:
1952 Py_XDECREF(v);
1953 return NULL;
1954}
1955
1956PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1957 int size)
1958{
1959 PyObject *repr;
1960 char *p;
1961 char *q;
1962
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001963 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964
1965 repr = PyString_FromStringAndSize(NULL, 6 * size);
1966 if (repr == NULL)
1967 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001968 if (size == 0)
1969 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970
1971 p = q = PyString_AS_STRING(repr);
1972 while (size-- > 0) {
1973 Py_UNICODE ch = *s++;
1974 /* Map 16-bit characters to '\uxxxx' */
1975 if (ch >= 256) {
1976 *p++ = '\\';
1977 *p++ = 'u';
1978 *p++ = hexdigit[(ch >> 12) & 0xf];
1979 *p++ = hexdigit[(ch >> 8) & 0xf];
1980 *p++ = hexdigit[(ch >> 4) & 0xf];
1981 *p++ = hexdigit[ch & 15];
1982 }
1983 /* Copy everything else as-is */
1984 else
1985 *p++ = (char) ch;
1986 }
1987 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001988 if (_PyString_Resize(&repr, p - q))
1989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990
1991 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001992
1993 onError:
1994 Py_DECREF(repr);
1995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996}
1997
1998PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1999{
2000 if (!PyUnicode_Check(unicode)) {
2001 PyErr_BadArgument();
2002 return NULL;
2003 }
2004 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2005 PyUnicode_GET_SIZE(unicode));
2006}
2007
2008/* --- Latin-1 Codec ------------------------------------------------------ */
2009
2010PyObject *PyUnicode_DecodeLatin1(const char *s,
2011 int size,
2012 const char *errors)
2013{
2014 PyUnicodeObject *v;
2015 Py_UNICODE *p;
2016
2017 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002018 if (size == 1 && *(unsigned char*)s < 256) {
2019 Py_UNICODE r = *(unsigned char*)s;
2020 return PyUnicode_FromUnicode(&r, 1);
2021 }
2022
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 v = _PyUnicode_New(size);
2024 if (v == NULL)
2025 goto onError;
2026 if (size == 0)
2027 return (PyObject *)v;
2028 p = PyUnicode_AS_UNICODE(v);
2029 while (size-- > 0)
2030 *p++ = (unsigned char)*s++;
2031 return (PyObject *)v;
2032
2033 onError:
2034 Py_XDECREF(v);
2035 return NULL;
2036}
2037
2038static
2039int latin1_encoding_error(const Py_UNICODE **source,
2040 char **dest,
2041 const char *errors,
2042 const char *details)
2043{
2044 if ((errors == NULL) ||
2045 (strcmp(errors,"strict") == 0)) {
2046 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002047 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 details);
2049 return -1;
2050 }
2051 else if (strcmp(errors,"ignore") == 0) {
2052 return 0;
2053 }
2054 else if (strcmp(errors,"replace") == 0) {
2055 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002056 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 return 0;
2058 }
2059 else {
2060 PyErr_Format(PyExc_ValueError,
2061 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002062 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 errors);
2064 return -1;
2065 }
2066}
2067
2068PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2069 int size,
2070 const char *errors)
2071{
2072 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002073 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002074
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 repr = PyString_FromStringAndSize(NULL, size);
2076 if (repr == NULL)
2077 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002078 if (size == 0)
2079 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080
2081 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002082 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 while (size-- > 0) {
2084 Py_UNICODE ch = *p++;
2085 if (ch >= 256) {
2086 if (latin1_encoding_error(&p, &s, errors,
2087 "ordinal not in range(256)"))
2088 goto onError;
2089 }
2090 else
2091 *s++ = (char)ch;
2092 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002093 /* Resize if error handling skipped some characters */
2094 if (s - start < PyString_GET_SIZE(repr))
2095 if (_PyString_Resize(&repr, s - start))
2096 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 return repr;
2098
2099 onError:
2100 Py_DECREF(repr);
2101 return NULL;
2102}
2103
2104PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2105{
2106 if (!PyUnicode_Check(unicode)) {
2107 PyErr_BadArgument();
2108 return NULL;
2109 }
2110 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2111 PyUnicode_GET_SIZE(unicode),
2112 NULL);
2113}
2114
2115/* --- 7-bit ASCII Codec -------------------------------------------------- */
2116
2117static
2118int ascii_decoding_error(const char **source,
2119 Py_UNICODE **dest,
2120 const char *errors,
2121 const char *details)
2122{
2123 if ((errors == NULL) ||
2124 (strcmp(errors,"strict") == 0)) {
2125 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002126 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 details);
2128 return -1;
2129 }
2130 else if (strcmp(errors,"ignore") == 0) {
2131 return 0;
2132 }
2133 else if (strcmp(errors,"replace") == 0) {
2134 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2135 (*dest)++;
2136 return 0;
2137 }
2138 else {
2139 PyErr_Format(PyExc_ValueError,
2140 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002141 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 errors);
2143 return -1;
2144 }
2145}
2146
2147PyObject *PyUnicode_DecodeASCII(const char *s,
2148 int size,
2149 const char *errors)
2150{
2151 PyUnicodeObject *v;
2152 Py_UNICODE *p;
2153
2154 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002155 if (size == 1 && *(unsigned char*)s < 128) {
2156 Py_UNICODE r = *(unsigned char*)s;
2157 return PyUnicode_FromUnicode(&r, 1);
2158 }
2159
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 v = _PyUnicode_New(size);
2161 if (v == NULL)
2162 goto onError;
2163 if (size == 0)
2164 return (PyObject *)v;
2165 p = PyUnicode_AS_UNICODE(v);
2166 while (size-- > 0) {
2167 register unsigned char c;
2168
2169 c = (unsigned char)*s++;
2170 if (c < 128)
2171 *p++ = c;
2172 else if (ascii_decoding_error(&s, &p, errors,
2173 "ordinal not in range(128)"))
2174 goto onError;
2175 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002176 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002177 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 return (PyObject *)v;
2180
2181 onError:
2182 Py_XDECREF(v);
2183 return NULL;
2184}
2185
2186static
2187int ascii_encoding_error(const Py_UNICODE **source,
2188 char **dest,
2189 const char *errors,
2190 const char *details)
2191{
2192 if ((errors == NULL) ||
2193 (strcmp(errors,"strict") == 0)) {
2194 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002195 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 details);
2197 return -1;
2198 }
2199 else if (strcmp(errors,"ignore") == 0) {
2200 return 0;
2201 }
2202 else if (strcmp(errors,"replace") == 0) {
2203 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002204 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 return 0;
2206 }
2207 else {
2208 PyErr_Format(PyExc_ValueError,
2209 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002210 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 errors);
2212 return -1;
2213 }
2214}
2215
2216PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2217 int size,
2218 const char *errors)
2219{
2220 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002221 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002222
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 repr = PyString_FromStringAndSize(NULL, size);
2224 if (repr == NULL)
2225 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002226 if (size == 0)
2227 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
2229 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002230 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 while (size-- > 0) {
2232 Py_UNICODE ch = *p++;
2233 if (ch >= 128) {
2234 if (ascii_encoding_error(&p, &s, errors,
2235 "ordinal not in range(128)"))
2236 goto onError;
2237 }
2238 else
2239 *s++ = (char)ch;
2240 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002241 /* Resize if error handling skipped some characters */
2242 if (s - start < PyString_GET_SIZE(repr))
2243 if (_PyString_Resize(&repr, s - start))
2244 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 return repr;
2246
2247 onError:
2248 Py_DECREF(repr);
2249 return NULL;
2250}
2251
2252PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2253{
2254 if (!PyUnicode_Check(unicode)) {
2255 PyErr_BadArgument();
2256 return NULL;
2257 }
2258 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2259 PyUnicode_GET_SIZE(unicode),
2260 NULL);
2261}
2262
Fredrik Lundh30831632001-06-26 15:11:00 +00002263#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002264
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002265/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002266
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002267PyObject *PyUnicode_DecodeMBCS(const char *s,
2268 int size,
2269 const char *errors)
2270{
2271 PyUnicodeObject *v;
2272 Py_UNICODE *p;
2273
2274 /* First get the size of the result */
2275 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002276 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002277 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2278
2279 v = _PyUnicode_New(usize);
2280 if (v == NULL)
2281 return NULL;
2282 if (usize == 0)
2283 return (PyObject *)v;
2284 p = PyUnicode_AS_UNICODE(v);
2285 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2286 Py_DECREF(v);
2287 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2288 }
2289
2290 return (PyObject *)v;
2291}
2292
2293PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2294 int size,
2295 const char *errors)
2296{
2297 PyObject *repr;
2298 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002299 DWORD mbcssize;
2300
2301 /* If there are no characters, bail now! */
2302 if (size==0)
2303 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002304
2305 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002306 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002307 if (mbcssize==0)
2308 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2309
2310 repr = PyString_FromStringAndSize(NULL, mbcssize);
2311 if (repr == NULL)
2312 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002313 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002314 return repr;
2315
2316 /* Do the conversion */
2317 s = PyString_AS_STRING(repr);
2318 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2319 Py_DECREF(repr);
2320 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2321 }
2322 return repr;
2323}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002324
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002325#endif /* MS_WIN32 */
2326
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327/* --- Character Mapping Codec -------------------------------------------- */
2328
2329static
2330int charmap_decoding_error(const char **source,
2331 Py_UNICODE **dest,
2332 const char *errors,
2333 const char *details)
2334{
2335 if ((errors == NULL) ||
2336 (strcmp(errors,"strict") == 0)) {
2337 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002338 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002339 details);
2340 return -1;
2341 }
2342 else if (strcmp(errors,"ignore") == 0) {
2343 return 0;
2344 }
2345 else if (strcmp(errors,"replace") == 0) {
2346 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2347 (*dest)++;
2348 return 0;
2349 }
2350 else {
2351 PyErr_Format(PyExc_ValueError,
2352 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002353 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 errors);
2355 return -1;
2356 }
2357}
2358
2359PyObject *PyUnicode_DecodeCharmap(const char *s,
2360 int size,
2361 PyObject *mapping,
2362 const char *errors)
2363{
2364 PyUnicodeObject *v;
2365 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002366 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367
2368 /* Default to Latin-1 */
2369 if (mapping == NULL)
2370 return PyUnicode_DecodeLatin1(s, size, errors);
2371
2372 v = _PyUnicode_New(size);
2373 if (v == NULL)
2374 goto onError;
2375 if (size == 0)
2376 return (PyObject *)v;
2377 p = PyUnicode_AS_UNICODE(v);
2378 while (size-- > 0) {
2379 unsigned char ch = *s++;
2380 PyObject *w, *x;
2381
2382 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2383 w = PyInt_FromLong((long)ch);
2384 if (w == NULL)
2385 goto onError;
2386 x = PyObject_GetItem(mapping, w);
2387 Py_DECREF(w);
2388 if (x == NULL) {
2389 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002390 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002392 x = Py_None;
2393 Py_INCREF(x);
2394 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002395 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 }
2397
2398 /* Apply mapping */
2399 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002400 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 if (value < 0 || value > 65535) {
2402 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002403 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 Py_DECREF(x);
2405 goto onError;
2406 }
2407 *p++ = (Py_UNICODE)value;
2408 }
2409 else if (x == Py_None) {
2410 /* undefined mapping */
2411 if (charmap_decoding_error(&s, &p, errors,
2412 "character maps to <undefined>")) {
2413 Py_DECREF(x);
2414 goto onError;
2415 }
2416 }
2417 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002418 int targetsize = PyUnicode_GET_SIZE(x);
2419
2420 if (targetsize == 1)
2421 /* 1-1 mapping */
2422 *p++ = *PyUnicode_AS_UNICODE(x);
2423
2424 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002426 if (targetsize > extrachars) {
2427 /* resize first */
2428 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2429 int needed = (targetsize - extrachars) + \
2430 (targetsize << 2);
2431 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002432 if (_PyUnicode_Resize(&v,
2433 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002434 Py_DECREF(x);
2435 goto onError;
2436 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002437 p = PyUnicode_AS_UNICODE(v) + oldpos;
2438 }
2439 Py_UNICODE_COPY(p,
2440 PyUnicode_AS_UNICODE(x),
2441 targetsize);
2442 p += targetsize;
2443 extrachars -= targetsize;
2444 }
2445 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446 }
2447 else {
2448 /* wrong return value */
2449 PyErr_SetString(PyExc_TypeError,
2450 "character mapping must return integer, None or unicode");
2451 Py_DECREF(x);
2452 goto onError;
2453 }
2454 Py_DECREF(x);
2455 }
2456 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002457 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 goto onError;
2459 return (PyObject *)v;
2460
2461 onError:
2462 Py_XDECREF(v);
2463 return NULL;
2464}
2465
2466static
2467int charmap_encoding_error(const Py_UNICODE **source,
2468 char **dest,
2469 const char *errors,
2470 const char *details)
2471{
2472 if ((errors == NULL) ||
2473 (strcmp(errors,"strict") == 0)) {
2474 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002475 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 details);
2477 return -1;
2478 }
2479 else if (strcmp(errors,"ignore") == 0) {
2480 return 0;
2481 }
2482 else if (strcmp(errors,"replace") == 0) {
2483 **dest = '?';
2484 (*dest)++;
2485 return 0;
2486 }
2487 else {
2488 PyErr_Format(PyExc_ValueError,
2489 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002490 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 errors);
2492 return -1;
2493 }
2494}
2495
2496PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2497 int size,
2498 PyObject *mapping,
2499 const char *errors)
2500{
2501 PyObject *v;
2502 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002503 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504
2505 /* Default to Latin-1 */
2506 if (mapping == NULL)
2507 return PyUnicode_EncodeLatin1(p, size, errors);
2508
2509 v = PyString_FromStringAndSize(NULL, size);
2510 if (v == NULL)
2511 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002512 if (size == 0)
2513 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 s = PyString_AS_STRING(v);
2515 while (size-- > 0) {
2516 Py_UNICODE ch = *p++;
2517 PyObject *w, *x;
2518
2519 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2520 w = PyInt_FromLong((long)ch);
2521 if (w == NULL)
2522 goto onError;
2523 x = PyObject_GetItem(mapping, w);
2524 Py_DECREF(w);
2525 if (x == NULL) {
2526 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002527 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002529 x = Py_None;
2530 Py_INCREF(x);
2531 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002532 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 }
2534
2535 /* Apply mapping */
2536 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002537 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 if (value < 0 || value > 255) {
2539 PyErr_SetString(PyExc_TypeError,
2540 "character mapping must be in range(256)");
2541 Py_DECREF(x);
2542 goto onError;
2543 }
2544 *s++ = (char)value;
2545 }
2546 else if (x == Py_None) {
2547 /* undefined mapping */
2548 if (charmap_encoding_error(&p, &s, errors,
2549 "character maps to <undefined>")) {
2550 Py_DECREF(x);
2551 goto onError;
2552 }
2553 }
2554 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002555 int targetsize = PyString_GET_SIZE(x);
2556
2557 if (targetsize == 1)
2558 /* 1-1 mapping */
2559 *s++ = *PyString_AS_STRING(x);
2560
2561 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002563 if (targetsize > extrachars) {
2564 /* resize first */
2565 int oldpos = (int)(s - PyString_AS_STRING(v));
2566 int needed = (targetsize - extrachars) + \
2567 (targetsize << 2);
2568 extrachars += needed;
2569 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002570 Py_DECREF(x);
2571 goto onError;
2572 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002573 s = PyString_AS_STRING(v) + oldpos;
2574 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002575 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002576 s += targetsize;
2577 extrachars -= targetsize;
2578 }
2579 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 }
2581 else {
2582 /* wrong return value */
2583 PyErr_SetString(PyExc_TypeError,
2584 "character mapping must return integer, None or unicode");
2585 Py_DECREF(x);
2586 goto onError;
2587 }
2588 Py_DECREF(x);
2589 }
2590 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2591 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2592 goto onError;
2593 return v;
2594
2595 onError:
2596 Py_DECREF(v);
2597 return NULL;
2598}
2599
2600PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2601 PyObject *mapping)
2602{
2603 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2604 PyErr_BadArgument();
2605 return NULL;
2606 }
2607 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2608 PyUnicode_GET_SIZE(unicode),
2609 mapping,
2610 NULL);
2611}
2612
2613static
2614int translate_error(const Py_UNICODE **source,
2615 Py_UNICODE **dest,
2616 const char *errors,
2617 const char *details)
2618{
2619 if ((errors == NULL) ||
2620 (strcmp(errors,"strict") == 0)) {
2621 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002622 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 details);
2624 return -1;
2625 }
2626 else if (strcmp(errors,"ignore") == 0) {
2627 return 0;
2628 }
2629 else if (strcmp(errors,"replace") == 0) {
2630 **dest = '?';
2631 (*dest)++;
2632 return 0;
2633 }
2634 else {
2635 PyErr_Format(PyExc_ValueError,
2636 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002637 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 errors);
2639 return -1;
2640 }
2641}
2642
2643PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2644 int size,
2645 PyObject *mapping,
2646 const char *errors)
2647{
2648 PyUnicodeObject *v;
2649 Py_UNICODE *p;
2650
2651 if (mapping == NULL) {
2652 PyErr_BadArgument();
2653 return NULL;
2654 }
2655
2656 /* Output will never be longer than input */
2657 v = _PyUnicode_New(size);
2658 if (v == NULL)
2659 goto onError;
2660 if (size == 0)
2661 goto done;
2662 p = PyUnicode_AS_UNICODE(v);
2663 while (size-- > 0) {
2664 Py_UNICODE ch = *s++;
2665 PyObject *w, *x;
2666
2667 /* Get mapping */
2668 w = PyInt_FromLong(ch);
2669 if (w == NULL)
2670 goto onError;
2671 x = PyObject_GetItem(mapping, w);
2672 Py_DECREF(w);
2673 if (x == NULL) {
2674 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2675 /* No mapping found: default to 1-1 mapping */
2676 PyErr_Clear();
2677 *p++ = ch;
2678 continue;
2679 }
2680 goto onError;
2681 }
2682
2683 /* Apply mapping */
2684 if (PyInt_Check(x))
2685 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2686 else if (x == Py_None) {
2687 /* undefined mapping */
2688 if (translate_error(&s, &p, errors,
2689 "character maps to <undefined>")) {
2690 Py_DECREF(x);
2691 goto onError;
2692 }
2693 }
2694 else if (PyUnicode_Check(x)) {
2695 if (PyUnicode_GET_SIZE(x) != 1) {
2696 /* 1-n mapping */
2697 PyErr_SetString(PyExc_NotImplementedError,
2698 "1-n mappings are currently not implemented");
2699 Py_DECREF(x);
2700 goto onError;
2701 }
2702 *p++ = *PyUnicode_AS_UNICODE(x);
2703 }
2704 else {
2705 /* wrong return value */
2706 PyErr_SetString(PyExc_TypeError,
2707 "translate mapping must return integer, None or unicode");
2708 Py_DECREF(x);
2709 goto onError;
2710 }
2711 Py_DECREF(x);
2712 }
2713 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002714 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002715 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716
2717 done:
2718 return (PyObject *)v;
2719
2720 onError:
2721 Py_XDECREF(v);
2722 return NULL;
2723}
2724
2725PyObject *PyUnicode_Translate(PyObject *str,
2726 PyObject *mapping,
2727 const char *errors)
2728{
2729 PyObject *result;
2730
2731 str = PyUnicode_FromObject(str);
2732 if (str == NULL)
2733 goto onError;
2734 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2735 PyUnicode_GET_SIZE(str),
2736 mapping,
2737 errors);
2738 Py_DECREF(str);
2739 return result;
2740
2741 onError:
2742 Py_XDECREF(str);
2743 return NULL;
2744}
2745
Guido van Rossum9e896b32000-04-05 20:11:21 +00002746/* --- Decimal Encoder ---------------------------------------------------- */
2747
2748int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2749 int length,
2750 char *output,
2751 const char *errors)
2752{
2753 Py_UNICODE *p, *end;
2754
2755 if (output == NULL) {
2756 PyErr_BadArgument();
2757 return -1;
2758 }
2759
2760 p = s;
2761 end = s + length;
2762 while (p < end) {
2763 register Py_UNICODE ch = *p++;
2764 int decimal;
2765
2766 if (Py_UNICODE_ISSPACE(ch)) {
2767 *output++ = ' ';
2768 continue;
2769 }
2770 decimal = Py_UNICODE_TODECIMAL(ch);
2771 if (decimal >= 0) {
2772 *output++ = '0' + decimal;
2773 continue;
2774 }
Guido van Rossumba477042000-04-06 18:18:10 +00002775 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002776 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002777 continue;
2778 }
2779 /* All other characters are considered invalid */
2780 if (errors == NULL || strcmp(errors, "strict") == 0) {
2781 PyErr_SetString(PyExc_ValueError,
2782 "invalid decimal Unicode string");
2783 goto onError;
2784 }
2785 else if (strcmp(errors, "ignore") == 0)
2786 continue;
2787 else if (strcmp(errors, "replace") == 0) {
2788 *output++ = '?';
2789 continue;
2790 }
2791 }
2792 /* 0-terminate the output string */
2793 *output++ = '\0';
2794 return 0;
2795
2796 onError:
2797 return -1;
2798}
2799
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800/* --- Helpers ------------------------------------------------------------ */
2801
2802static
2803int count(PyUnicodeObject *self,
2804 int start,
2805 int end,
2806 PyUnicodeObject *substring)
2807{
2808 int count = 0;
2809
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002810 if (start < 0)
2811 start += self->length;
2812 if (start < 0)
2813 start = 0;
2814 if (end > self->length)
2815 end = self->length;
2816 if (end < 0)
2817 end += self->length;
2818 if (end < 0)
2819 end = 0;
2820
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002821 if (substring->length == 0)
2822 return (end - start + 1);
2823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 end -= substring->length;
2825
2826 while (start <= end)
2827 if (Py_UNICODE_MATCH(self, start, substring)) {
2828 count++;
2829 start += substring->length;
2830 } else
2831 start++;
2832
2833 return count;
2834}
2835
2836int PyUnicode_Count(PyObject *str,
2837 PyObject *substr,
2838 int start,
2839 int end)
2840{
2841 int result;
2842
2843 str = PyUnicode_FromObject(str);
2844 if (str == NULL)
2845 return -1;
2846 substr = PyUnicode_FromObject(substr);
2847 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002848 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 return -1;
2850 }
2851
2852 result = count((PyUnicodeObject *)str,
2853 start, end,
2854 (PyUnicodeObject *)substr);
2855
2856 Py_DECREF(str);
2857 Py_DECREF(substr);
2858 return result;
2859}
2860
2861static
2862int findstring(PyUnicodeObject *self,
2863 PyUnicodeObject *substring,
2864 int start,
2865 int end,
2866 int direction)
2867{
2868 if (start < 0)
2869 start += self->length;
2870 if (start < 0)
2871 start = 0;
2872
2873 if (substring->length == 0)
2874 return start;
2875
2876 if (end > self->length)
2877 end = self->length;
2878 if (end < 0)
2879 end += self->length;
2880 if (end < 0)
2881 end = 0;
2882
2883 end -= substring->length;
2884
2885 if (direction < 0) {
2886 for (; end >= start; end--)
2887 if (Py_UNICODE_MATCH(self, end, substring))
2888 return end;
2889 } else {
2890 for (; start <= end; start++)
2891 if (Py_UNICODE_MATCH(self, start, substring))
2892 return start;
2893 }
2894
2895 return -1;
2896}
2897
2898int PyUnicode_Find(PyObject *str,
2899 PyObject *substr,
2900 int start,
2901 int end,
2902 int direction)
2903{
2904 int result;
2905
2906 str = PyUnicode_FromObject(str);
2907 if (str == NULL)
2908 return -1;
2909 substr = PyUnicode_FromObject(substr);
2910 if (substr == NULL) {
2911 Py_DECREF(substr);
2912 return -1;
2913 }
2914
2915 result = findstring((PyUnicodeObject *)str,
2916 (PyUnicodeObject *)substr,
2917 start, end, direction);
2918 Py_DECREF(str);
2919 Py_DECREF(substr);
2920 return result;
2921}
2922
2923static
2924int tailmatch(PyUnicodeObject *self,
2925 PyUnicodeObject *substring,
2926 int start,
2927 int end,
2928 int direction)
2929{
2930 if (start < 0)
2931 start += self->length;
2932 if (start < 0)
2933 start = 0;
2934
2935 if (substring->length == 0)
2936 return 1;
2937
2938 if (end > self->length)
2939 end = self->length;
2940 if (end < 0)
2941 end += self->length;
2942 if (end < 0)
2943 end = 0;
2944
2945 end -= substring->length;
2946 if (end < start)
2947 return 0;
2948
2949 if (direction > 0) {
2950 if (Py_UNICODE_MATCH(self, end, substring))
2951 return 1;
2952 } else {
2953 if (Py_UNICODE_MATCH(self, start, substring))
2954 return 1;
2955 }
2956
2957 return 0;
2958}
2959
2960int PyUnicode_Tailmatch(PyObject *str,
2961 PyObject *substr,
2962 int start,
2963 int end,
2964 int direction)
2965{
2966 int result;
2967
2968 str = PyUnicode_FromObject(str);
2969 if (str == NULL)
2970 return -1;
2971 substr = PyUnicode_FromObject(substr);
2972 if (substr == NULL) {
2973 Py_DECREF(substr);
2974 return -1;
2975 }
2976
2977 result = tailmatch((PyUnicodeObject *)str,
2978 (PyUnicodeObject *)substr,
2979 start, end, direction);
2980 Py_DECREF(str);
2981 Py_DECREF(substr);
2982 return result;
2983}
2984
2985static
2986const Py_UNICODE *findchar(const Py_UNICODE *s,
2987 int size,
2988 Py_UNICODE ch)
2989{
2990 /* like wcschr, but doesn't stop at NULL characters */
2991
2992 while (size-- > 0) {
2993 if (*s == ch)
2994 return s;
2995 s++;
2996 }
2997
2998 return NULL;
2999}
3000
3001/* Apply fixfct filter to the Unicode object self and return a
3002 reference to the modified object */
3003
3004static
3005PyObject *fixup(PyUnicodeObject *self,
3006 int (*fixfct)(PyUnicodeObject *s))
3007{
3008
3009 PyUnicodeObject *u;
3010
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003011 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 if (u == NULL)
3013 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003014
3015 Py_UNICODE_COPY(u->str, self->str, self->length);
3016
Tim Peters7a29bd52001-09-12 03:03:31 +00003017 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 /* fixfct should return TRUE if it modified the buffer. If
3019 FALSE, return a reference to the original buffer instead
3020 (to save space, not time) */
3021 Py_INCREF(self);
3022 Py_DECREF(u);
3023 return (PyObject*) self;
3024 }
3025 return (PyObject*) u;
3026}
3027
3028static
3029int fixupper(PyUnicodeObject *self)
3030{
3031 int len = self->length;
3032 Py_UNICODE *s = self->str;
3033 int status = 0;
3034
3035 while (len-- > 0) {
3036 register Py_UNICODE ch;
3037
3038 ch = Py_UNICODE_TOUPPER(*s);
3039 if (ch != *s) {
3040 status = 1;
3041 *s = ch;
3042 }
3043 s++;
3044 }
3045
3046 return status;
3047}
3048
3049static
3050int fixlower(PyUnicodeObject *self)
3051{
3052 int len = self->length;
3053 Py_UNICODE *s = self->str;
3054 int status = 0;
3055
3056 while (len-- > 0) {
3057 register Py_UNICODE ch;
3058
3059 ch = Py_UNICODE_TOLOWER(*s);
3060 if (ch != *s) {
3061 status = 1;
3062 *s = ch;
3063 }
3064 s++;
3065 }
3066
3067 return status;
3068}
3069
3070static
3071int fixswapcase(PyUnicodeObject *self)
3072{
3073 int len = self->length;
3074 Py_UNICODE *s = self->str;
3075 int status = 0;
3076
3077 while (len-- > 0) {
3078 if (Py_UNICODE_ISUPPER(*s)) {
3079 *s = Py_UNICODE_TOLOWER(*s);
3080 status = 1;
3081 } else if (Py_UNICODE_ISLOWER(*s)) {
3082 *s = Py_UNICODE_TOUPPER(*s);
3083 status = 1;
3084 }
3085 s++;
3086 }
3087
3088 return status;
3089}
3090
3091static
3092int fixcapitalize(PyUnicodeObject *self)
3093{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003094 int len = self->length;
3095 Py_UNICODE *s = self->str;
3096 int status = 0;
3097
3098 if (len == 0)
3099 return 0;
3100 if (Py_UNICODE_ISLOWER(*s)) {
3101 *s = Py_UNICODE_TOUPPER(*s);
3102 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003104 s++;
3105 while (--len > 0) {
3106 if (Py_UNICODE_ISUPPER(*s)) {
3107 *s = Py_UNICODE_TOLOWER(*s);
3108 status = 1;
3109 }
3110 s++;
3111 }
3112 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113}
3114
3115static
3116int fixtitle(PyUnicodeObject *self)
3117{
3118 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3119 register Py_UNICODE *e;
3120 int previous_is_cased;
3121
3122 /* Shortcut for single character strings */
3123 if (PyUnicode_GET_SIZE(self) == 1) {
3124 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3125 if (*p != ch) {
3126 *p = ch;
3127 return 1;
3128 }
3129 else
3130 return 0;
3131 }
3132
3133 e = p + PyUnicode_GET_SIZE(self);
3134 previous_is_cased = 0;
3135 for (; p < e; p++) {
3136 register const Py_UNICODE ch = *p;
3137
3138 if (previous_is_cased)
3139 *p = Py_UNICODE_TOLOWER(ch);
3140 else
3141 *p = Py_UNICODE_TOTITLE(ch);
3142
3143 if (Py_UNICODE_ISLOWER(ch) ||
3144 Py_UNICODE_ISUPPER(ch) ||
3145 Py_UNICODE_ISTITLE(ch))
3146 previous_is_cased = 1;
3147 else
3148 previous_is_cased = 0;
3149 }
3150 return 1;
3151}
3152
3153PyObject *PyUnicode_Join(PyObject *separator,
3154 PyObject *seq)
3155{
3156 Py_UNICODE *sep;
3157 int seplen;
3158 PyUnicodeObject *res = NULL;
3159 int reslen = 0;
3160 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 int sz = 100;
3162 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003163 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164
Tim Peters2cfe3682001-05-05 05:36:48 +00003165 it = PyObject_GetIter(seq);
3166 if (it == NULL)
3167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168
3169 if (separator == NULL) {
3170 Py_UNICODE blank = ' ';
3171 sep = &blank;
3172 seplen = 1;
3173 }
3174 else {
3175 separator = PyUnicode_FromObject(separator);
3176 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003177 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 sep = PyUnicode_AS_UNICODE(separator);
3179 seplen = PyUnicode_GET_SIZE(separator);
3180 }
3181
3182 res = _PyUnicode_New(sz);
3183 if (res == NULL)
3184 goto onError;
3185 p = PyUnicode_AS_UNICODE(res);
3186 reslen = 0;
3187
Tim Peters2cfe3682001-05-05 05:36:48 +00003188 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003190 PyObject *item = PyIter_Next(it);
3191 if (item == NULL) {
3192 if (PyErr_Occurred())
3193 goto onError;
3194 break;
3195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 if (!PyUnicode_Check(item)) {
3197 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003198 if (!PyString_Check(item)) {
3199 PyErr_Format(PyExc_TypeError,
3200 "sequence item %i: expected string or Unicode,"
3201 " %.80s found",
3202 i, item->ob_type->tp_name);
3203 Py_DECREF(item);
3204 goto onError;
3205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 v = PyUnicode_FromObject(item);
3207 Py_DECREF(item);
3208 item = v;
3209 if (item == NULL)
3210 goto onError;
3211 }
3212 itemlen = PyUnicode_GET_SIZE(item);
3213 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003214 if (_PyUnicode_Resize(&res, sz*2)) {
3215 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 sz *= 2;
3219 p = PyUnicode_AS_UNICODE(res) + reslen;
3220 }
3221 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003222 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 p += seplen;
3224 reslen += seplen;
3225 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003226 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 p += itemlen;
3228 reslen += itemlen;
3229 Py_DECREF(item);
3230 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003231 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 goto onError;
3233
3234 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003235 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return (PyObject *)res;
3237
3238 onError:
3239 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003240 Py_XDECREF(res);
3241 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return NULL;
3243}
3244
3245static
3246PyUnicodeObject *pad(PyUnicodeObject *self,
3247 int left,
3248 int right,
3249 Py_UNICODE fill)
3250{
3251 PyUnicodeObject *u;
3252
3253 if (left < 0)
3254 left = 0;
3255 if (right < 0)
3256 right = 0;
3257
Tim Peters7a29bd52001-09-12 03:03:31 +00003258 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 Py_INCREF(self);
3260 return self;
3261 }
3262
3263 u = _PyUnicode_New(left + self->length + right);
3264 if (u) {
3265 if (left)
3266 Py_UNICODE_FILL(u->str, fill, left);
3267 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3268 if (right)
3269 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3270 }
3271
3272 return u;
3273}
3274
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   This relies on names of the function it is expanded in: a PyObject *str
   scratch variable, the PyObject *list being built and an onError label,
   which is jumped to if either the slice cannot be created or the append
   fails.  The reference created here is always released again.

   The body is wrapped in do { } while (0) so that the macro behaves like
   a single statement; use it with a trailing semicolon. */

#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3285
3286static
3287PyObject *split_whitespace(PyUnicodeObject *self,
3288 PyObject *list,
3289 int maxcount)
3290{
3291 register int i;
3292 register int j;
3293 int len = self->length;
3294 PyObject *str;
3295
3296 for (i = j = 0; i < len; ) {
3297 /* find a token */
3298 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3299 i++;
3300 j = i;
3301 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3302 i++;
3303 if (j < i) {
3304 if (maxcount-- <= 0)
3305 break;
3306 SPLIT_APPEND(self->str, j, i);
3307 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3308 i++;
3309 j = i;
3310 }
3311 }
3312 if (j < len) {
3313 SPLIT_APPEND(self->str, j, len);
3314 }
3315 return list;
3316
3317 onError:
3318 Py_DECREF(list);
3319 return NULL;
3320}
3321
3322PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003323 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324{
3325 register int i;
3326 register int j;
3327 int len;
3328 PyObject *list;
3329 PyObject *str;
3330 Py_UNICODE *data;
3331
3332 string = PyUnicode_FromObject(string);
3333 if (string == NULL)
3334 return NULL;
3335 data = PyUnicode_AS_UNICODE(string);
3336 len = PyUnicode_GET_SIZE(string);
3337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 list = PyList_New(0);
3339 if (!list)
3340 goto onError;
3341
3342 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003343 int eol;
3344
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 /* Find a line and append it */
3346 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3347 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348
3349 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003350 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 if (i < len) {
3352 if (data[i] == '\r' && i + 1 < len &&
3353 data[i+1] == '\n')
3354 i += 2;
3355 else
3356 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003357 if (keepends)
3358 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 }
Guido van Rossum86662912000-04-11 15:38:46 +00003360 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 j = i;
3362 }
3363 if (j < len) {
3364 SPLIT_APPEND(data, j, len);
3365 }
3366
3367 Py_DECREF(string);
3368 return list;
3369
3370 onError:
3371 Py_DECREF(list);
3372 Py_DECREF(string);
3373 return NULL;
3374}
3375
3376static
3377PyObject *split_char(PyUnicodeObject *self,
3378 PyObject *list,
3379 Py_UNICODE ch,
3380 int maxcount)
3381{
3382 register int i;
3383 register int j;
3384 int len = self->length;
3385 PyObject *str;
3386
3387 for (i = j = 0; i < len; ) {
3388 if (self->str[i] == ch) {
3389 if (maxcount-- <= 0)
3390 break;
3391 SPLIT_APPEND(self->str, j, i);
3392 i = j = i + 1;
3393 } else
3394 i++;
3395 }
3396 if (j <= len) {
3397 SPLIT_APPEND(self->str, j, len);
3398 }
3399 return list;
3400
3401 onError:
3402 Py_DECREF(list);
3403 return NULL;
3404}
3405
3406static
3407PyObject *split_substring(PyUnicodeObject *self,
3408 PyObject *list,
3409 PyUnicodeObject *substring,
3410 int maxcount)
3411{
3412 register int i;
3413 register int j;
3414 int len = self->length;
3415 int sublen = substring->length;
3416 PyObject *str;
3417
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003418 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 if (Py_UNICODE_MATCH(self, i, substring)) {
3420 if (maxcount-- <= 0)
3421 break;
3422 SPLIT_APPEND(self->str, j, i);
3423 i = j = i + sublen;
3424 } else
3425 i++;
3426 }
3427 if (j <= len) {
3428 SPLIT_APPEND(self->str, j, len);
3429 }
3430 return list;
3431
3432 onError:
3433 Py_DECREF(list);
3434 return NULL;
3435}
3436
3437#undef SPLIT_APPEND
3438
3439static
3440PyObject *split(PyUnicodeObject *self,
3441 PyUnicodeObject *substring,
3442 int maxcount)
3443{
3444 PyObject *list;
3445
3446 if (maxcount < 0)
3447 maxcount = INT_MAX;
3448
3449 list = PyList_New(0);
3450 if (!list)
3451 return NULL;
3452
3453 if (substring == NULL)
3454 return split_whitespace(self,list,maxcount);
3455
3456 else if (substring->length == 1)
3457 return split_char(self,list,substring->str[0],maxcount);
3458
3459 else if (substring->length == 0) {
3460 Py_DECREF(list);
3461 PyErr_SetString(PyExc_ValueError, "empty separator");
3462 return NULL;
3463 }
3464 else
3465 return split_substring(self,list,substring,maxcount);
3466}
3467
3468static
3469PyObject *strip(PyUnicodeObject *self,
3470 int left,
3471 int right)
3472{
3473 Py_UNICODE *p = self->str;
3474 int start = 0;
3475 int end = self->length;
3476
3477 if (left)
3478 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3479 start++;
3480
3481 if (right)
3482 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3483 end--;
3484
Tim Peters7a29bd52001-09-12 03:03:31 +00003485 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 /* couldn't strip anything off, return original string */
3487 Py_INCREF(self);
3488 return (PyObject*) self;
3489 }
3490
3491 return (PyObject*) PyUnicode_FromUnicode(
3492 self->str + start,
3493 end - start
3494 );
3495}
3496
3497static
3498PyObject *replace(PyUnicodeObject *self,
3499 PyUnicodeObject *str1,
3500 PyUnicodeObject *str2,
3501 int maxcount)
3502{
3503 PyUnicodeObject *u;
3504
3505 if (maxcount < 0)
3506 maxcount = INT_MAX;
3507
3508 if (str1->length == 1 && str2->length == 1) {
3509 int i;
3510
3511 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003512 if (!findchar(self->str, self->length, str1->str[0]) &&
3513 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 /* nothing to replace, return original string */
3515 Py_INCREF(self);
3516 u = self;
3517 } else {
3518 Py_UNICODE u1 = str1->str[0];
3519 Py_UNICODE u2 = str2->str[0];
3520
3521 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003522 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 self->length
3524 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003525 if (u != NULL) {
3526 Py_UNICODE_COPY(u->str, self->str,
3527 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528 for (i = 0; i < u->length; i++)
3529 if (u->str[i] == u1) {
3530 if (--maxcount < 0)
3531 break;
3532 u->str[i] = u2;
3533 }
3534 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536
3537 } else {
3538 int n, i;
3539 Py_UNICODE *p;
3540
3541 /* replace strings */
3542 n = count(self, 0, self->length, str1);
3543 if (n > maxcount)
3544 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003545 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 /* nothing to replace, return original string */
3547 Py_INCREF(self);
3548 u = self;
3549 } else {
3550 u = _PyUnicode_New(
3551 self->length + n * (str2->length - str1->length));
3552 if (u) {
3553 i = 0;
3554 p = u->str;
3555 while (i <= self->length - str1->length)
3556 if (Py_UNICODE_MATCH(self, i, str1)) {
3557 /* replace string segment */
3558 Py_UNICODE_COPY(p, str2->str, str2->length);
3559 p += str2->length;
3560 i += str1->length;
3561 if (--n <= 0) {
3562 /* copy remaining part */
3563 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3564 break;
3565 }
3566 } else
3567 *p++ = self->str[i++];
3568 }
3569 }
3570 }
3571
3572 return (PyObject *) u;
3573}
3574
3575/* --- Unicode Object Methods --------------------------------------------- */
3576
3577static char title__doc__[] =
3578"S.title() -> unicode\n\
3579\n\
3580Return a titlecased version of S, i.e. words start with title case\n\
3581characters, all remaining cased characters have lower case.";
3582
3583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003584unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 return fixup(self, fixtitle);
3587}
3588
3589static char capitalize__doc__[] =
3590"S.capitalize() -> unicode\n\
3591\n\
3592Return a capitalized version of S, i.e. make the first character\n\
3593have upper case.";
3594
3595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003596unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 return fixup(self, fixcapitalize);
3599}
3600
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split self at whitespace,
   capitalize every word and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *word = PyList_GET_ITEM(words, i);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
3638
3639static char center__doc__[] =
3640"S.center(width) -> unicode\n\
3641\n\
3642Return S centered in a Unicode string of length width. Padding is done\n\
3643using spaces.";
3644
3645static PyObject *
3646unicode_center(PyUnicodeObject *self, PyObject *args)
3647{
3648 int marg, left;
3649 int width;
3650
3651 if (!PyArg_ParseTuple(args, "i:center", &width))
3652 return NULL;
3653
Tim Peters7a29bd52001-09-12 03:03:31 +00003654 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 Py_INCREF(self);
3656 return (PyObject*) self;
3657 }
3658
3659 marg = width - self->length;
3660 left = marg / 2 + (marg & width & 1);
3661
3662 return (PyObject*) pad(self, left, marg - left, ' ');
3663}
3664
Marc-André Lemburge5034372000-08-08 08:04:29 +00003665#if 0
3666
3667/* This code should go into some future Unicode collation support
3668 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003669 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003670
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003671/* speedy UTF-16 code point order comparison */
3672/* gleaned from: */
3673/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3674
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003675static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003676{
3677 0, 0, 0, 0, 0, 0, 0, 0,
3678 0, 0, 0, 0, 0, 0, 0, 0,
3679 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003680 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003681};
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683static int
3684unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3685{
3686 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003687
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 Py_UNICODE *s1 = str1->str;
3689 Py_UNICODE *s2 = str2->str;
3690
3691 len1 = str1->length;
3692 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003693
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003695 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696
3697 c1 = *s1++;
3698 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003699
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003700 if (c1 > (1<<11) * 26)
3701 c1 += utf16Fixup[c1>>11];
3702 if (c2 > (1<<11) * 26)
3703 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003704 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003705
3706 if (c1 != c2)
3707 return (c1 < c2) ? -1 : 1;
3708
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003709 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 }
3711
3712 return (len1 < len2) ? -1 : (len1 != len2);
3713}
3714
Marc-André Lemburge5034372000-08-08 08:04:29 +00003715#else
3716
3717static int
3718unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3719{
3720 register int len1, len2;
3721
3722 Py_UNICODE *s1 = str1->str;
3723 Py_UNICODE *s2 = str2->str;
3724
3725 len1 = str1->length;
3726 len2 = str2->length;
3727
3728 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003729 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003730
Fredrik Lundh45714e92001-06-26 16:39:36 +00003731 c1 = *s1++;
3732 c2 = *s2++;
3733
3734 if (c1 != c2)
3735 return (c1 < c2) ? -1 : 1;
3736
Marc-André Lemburge5034372000-08-08 08:04:29 +00003737 len1--; len2--;
3738 }
3739
3740 return (len1 < len2) ? -1 : (len1 != len2);
3741}
3742
3743#endif
3744
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745int PyUnicode_Compare(PyObject *left,
3746 PyObject *right)
3747{
3748 PyUnicodeObject *u = NULL, *v = NULL;
3749 int result;
3750
3751 /* Coerce the two arguments */
3752 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3753 if (u == NULL)
3754 goto onError;
3755 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3756 if (v == NULL)
3757 goto onError;
3758
Thomas Wouters7e474022000-07-16 12:04:32 +00003759 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 if (v == u) {
3761 Py_DECREF(u);
3762 Py_DECREF(v);
3763 return 0;
3764 }
3765
3766 result = unicode_compare(u, v);
3767
3768 Py_DECREF(u);
3769 Py_DECREF(v);
3770 return result;
3771
3772onError:
3773 Py_XDECREF(u);
3774 Py_XDECREF(v);
3775 return -1;
3776}
3777
Guido van Rossum403d68b2000-03-13 15:55:09 +00003778int PyUnicode_Contains(PyObject *container,
3779 PyObject *element)
3780{
3781 PyUnicodeObject *u = NULL, *v = NULL;
3782 int result;
3783 register const Py_UNICODE *p, *e;
3784 register Py_UNICODE ch;
3785
3786 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003787 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003788 if (v == NULL) {
3789 PyErr_SetString(PyExc_TypeError,
3790 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003791 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003792 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003793 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3794 if (u == NULL) {
3795 Py_DECREF(v);
3796 goto onError;
3797 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003798
3799 /* Check v in u */
3800 if (PyUnicode_GET_SIZE(v) != 1) {
3801 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003802 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003803 goto onError;
3804 }
3805 ch = *PyUnicode_AS_UNICODE(v);
3806 p = PyUnicode_AS_UNICODE(u);
3807 e = p + PyUnicode_GET_SIZE(u);
3808 result = 0;
3809 while (p < e) {
3810 if (*p++ == ch) {
3811 result = 1;
3812 break;
3813 }
3814 }
3815
3816 Py_DECREF(u);
3817 Py_DECREF(v);
3818 return result;
3819
3820onError:
3821 Py_XDECREF(u);
3822 Py_XDECREF(v);
3823 return -1;
3824}
3825
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826/* Concat to string or Unicode object giving a new Unicode object. */
3827
3828PyObject *PyUnicode_Concat(PyObject *left,
3829 PyObject *right)
3830{
3831 PyUnicodeObject *u = NULL, *v = NULL, *w;
3832
3833 /* Coerce the two arguments */
3834 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3835 if (u == NULL)
3836 goto onError;
3837 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3838 if (v == NULL)
3839 goto onError;
3840
3841 /* Shortcuts */
3842 if (v == unicode_empty) {
3843 Py_DECREF(v);
3844 return (PyObject *)u;
3845 }
3846 if (u == unicode_empty) {
3847 Py_DECREF(u);
3848 return (PyObject *)v;
3849 }
3850
3851 /* Concat the two Unicode strings */
3852 w = _PyUnicode_New(u->length + v->length);
3853 if (w == NULL)
3854 goto onError;
3855 Py_UNICODE_COPY(w->str, u->str, u->length);
3856 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3857
3858 Py_DECREF(u);
3859 Py_DECREF(v);
3860 return (PyObject *)w;
3861
3862onError:
3863 Py_XDECREF(u);
3864 Py_XDECREF(v);
3865 return NULL;
3866}
3867
3868static char count__doc__[] =
3869"S.count(sub[, start[, end]]) -> int\n\
3870\n\
3871Return the number of occurrences of substring sub in Unicode string\n\
3872S[start:end]. Optional arguments start and end are\n\
3873interpreted as in slice notation.";
3874
3875static PyObject *
3876unicode_count(PyUnicodeObject *self, PyObject *args)
3877{
3878 PyUnicodeObject *substring;
3879 int start = 0;
3880 int end = INT_MAX;
3881 PyObject *result;
3882
Guido van Rossumb8872e62000-05-09 14:14:27 +00003883 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3884 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 return NULL;
3886
3887 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3888 (PyObject *)substring);
3889 if (substring == NULL)
3890 return NULL;
3891
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 if (start < 0)
3893 start += self->length;
3894 if (start < 0)
3895 start = 0;
3896 if (end > self->length)
3897 end = self->length;
3898 if (end < 0)
3899 end += self->length;
3900 if (end < 0)
3901 end = 0;
3902
3903 result = PyInt_FromLong((long) count(self, start, end, substring));
3904
3905 Py_DECREF(substring);
3906 return result;
3907}
3908
3909static char encode__doc__[] =
3910"S.encode([encoding[,errors]]) -> string\n\
3911\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003912Return an encoded string version of S. Default encoding is the current\n\
3913default string encoding. errors may be given to set a different error\n\
3914handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3915a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916
3917static PyObject *
3918unicode_encode(PyUnicodeObject *self, PyObject *args)
3919{
3920 char *encoding = NULL;
3921 char *errors = NULL;
3922 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3923 return NULL;
3924 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3925}
3926
3927static char expandtabs__doc__[] =
3928"S.expandtabs([tabsize]) -> unicode\n\
3929\n\
3930Return a copy of S where all tab characters are expanded using spaces.\n\
3931If tabsize is not given, a tab size of 8 characters is assumed.";
3932
3933static PyObject*
3934unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3935{
3936 Py_UNICODE *e;
3937 Py_UNICODE *p;
3938 Py_UNICODE *q;
3939 int i, j;
3940 PyUnicodeObject *u;
3941 int tabsize = 8;
3942
3943 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3944 return NULL;
3945
Thomas Wouters7e474022000-07-16 12:04:32 +00003946 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 i = j = 0;
3948 e = self->str + self->length;
3949 for (p = self->str; p < e; p++)
3950 if (*p == '\t') {
3951 if (tabsize > 0)
3952 j += tabsize - (j % tabsize);
3953 }
3954 else {
3955 j++;
3956 if (*p == '\n' || *p == '\r') {
3957 i += j;
3958 j = 0;
3959 }
3960 }
3961
3962 /* Second pass: create output string and fill it */
3963 u = _PyUnicode_New(i + j);
3964 if (!u)
3965 return NULL;
3966
3967 j = 0;
3968 q = u->str;
3969
3970 for (p = self->str; p < e; p++)
3971 if (*p == '\t') {
3972 if (tabsize > 0) {
3973 i = tabsize - (j % tabsize);
3974 j += i;
3975 while (i--)
3976 *q++ = ' ';
3977 }
3978 }
3979 else {
3980 j++;
3981 *q++ = *p;
3982 if (*p == '\n' || *p == '\r')
3983 j = 0;
3984 }
3985
3986 return (PyObject*) u;
3987}
3988
3989static char find__doc__[] =
3990"S.find(sub [,start [,end]]) -> int\n\
3991\n\
3992Return the lowest index in S where substring sub is found,\n\
3993such that sub is contained within s[start,end]. Optional\n\
3994arguments start and end are interpreted as in slice notation.\n\
3995\n\
3996Return -1 on failure.";
3997
3998static PyObject *
3999unicode_find(PyUnicodeObject *self, PyObject *args)
4000{
4001 PyUnicodeObject *substring;
4002 int start = 0;
4003 int end = INT_MAX;
4004 PyObject *result;
4005
Guido van Rossumb8872e62000-05-09 14:14:27 +00004006 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4007 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 return NULL;
4009 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4010 (PyObject *)substring);
4011 if (substring == NULL)
4012 return NULL;
4013
4014 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4015
4016 Py_DECREF(substring);
4017 return result;
4018}
4019
4020static PyObject *
4021unicode_getitem(PyUnicodeObject *self, int index)
4022{
4023 if (index < 0 || index >= self->length) {
4024 PyErr_SetString(PyExc_IndexError, "string index out of range");
4025 return NULL;
4026 }
4027
4028 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4029}
4030
4031static long
4032unicode_hash(PyUnicodeObject *self)
4033{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004034 /* Since Unicode objects compare equal to their ASCII string
4035 counterparts, they should use the individual character values
4036 as basis for their hash value. This is needed to assure that
4037 strings and Unicode objects behave in the same way as
4038 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039
Fredrik Lundhdde61642000-07-10 18:27:47 +00004040 register int len;
4041 register Py_UNICODE *p;
4042 register long x;
4043
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 if (self->hash != -1)
4045 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004046 len = PyUnicode_GET_SIZE(self);
4047 p = PyUnicode_AS_UNICODE(self);
4048 x = *p << 7;
4049 while (--len >= 0)
4050 x = (1000003*x) ^ *p++;
4051 x ^= PyUnicode_GET_SIZE(self);
4052 if (x == -1)
4053 x = -2;
4054 self->hash = x;
4055 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056}
4057
4058static char index__doc__[] =
4059"S.index(sub [,start [,end]]) -> int\n\
4060\n\
4061Like S.find() but raise ValueError when the substring is not found.";
4062
4063static PyObject *
4064unicode_index(PyUnicodeObject *self, PyObject *args)
4065{
4066 int result;
4067 PyUnicodeObject *substring;
4068 int start = 0;
4069 int end = INT_MAX;
4070
Guido van Rossumb8872e62000-05-09 14:14:27 +00004071 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4072 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 return NULL;
4074
4075 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4076 (PyObject *)substring);
4077 if (substring == NULL)
4078 return NULL;
4079
4080 result = findstring(self, substring, start, end, 1);
4081
4082 Py_DECREF(substring);
4083 if (result < 0) {
4084 PyErr_SetString(PyExc_ValueError, "substring not found");
4085 return NULL;
4086 }
4087 return PyInt_FromLong(result);
4088}
4089
4090static char islower__doc__[] =
4091"S.islower() -> int\n\
4092\n\
4093Return 1 if all cased characters in S are lowercase and there is\n\
4094at least one cased character in S, 0 otherwise.";
4095
4096static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004097unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098{
4099 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4100 register const Py_UNICODE *e;
4101 int cased;
4102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 /* Shortcut for single character strings */
4104 if (PyUnicode_GET_SIZE(self) == 1)
4105 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4106
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004107 /* Special case for empty strings */
4108 if (PyString_GET_SIZE(self) == 0)
4109 return PyInt_FromLong(0);
4110
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 e = p + PyUnicode_GET_SIZE(self);
4112 cased = 0;
4113 for (; p < e; p++) {
4114 register const Py_UNICODE ch = *p;
4115
4116 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4117 return PyInt_FromLong(0);
4118 else if (!cased && Py_UNICODE_ISLOWER(ch))
4119 cased = 1;
4120 }
4121 return PyInt_FromLong(cased);
4122}
4123
4124static char isupper__doc__[] =
4125"S.isupper() -> int\n\
4126\n\
4127Return 1 if all cased characters in S are uppercase and there is\n\
4128at least one cased character in S, 0 otherwise.";
4129
4130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004131unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
4133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4134 register const Py_UNICODE *e;
4135 int cased;
4136
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 /* Shortcut for single character strings */
4138 if (PyUnicode_GET_SIZE(self) == 1)
4139 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004141 /* Special case for empty strings */
4142 if (PyString_GET_SIZE(self) == 0)
4143 return PyInt_FromLong(0);
4144
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 e = p + PyUnicode_GET_SIZE(self);
4146 cased = 0;
4147 for (; p < e; p++) {
4148 register const Py_UNICODE ch = *p;
4149
4150 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4151 return PyInt_FromLong(0);
4152 else if (!cased && Py_UNICODE_ISUPPER(ch))
4153 cased = 1;
4154 }
4155 return PyInt_FromLong(cased);
4156}
4157
4158static char istitle__doc__[] =
4159"S.istitle() -> int\n\
4160\n\
4161Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4162may only follow uncased characters and lowercase characters only cased\n\
4163ones. Return 0 otherwise.";
4164
4165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004166unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167{
4168 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4169 register const Py_UNICODE *e;
4170 int cased, previous_is_cased;
4171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 /* Shortcut for single character strings */
4173 if (PyUnicode_GET_SIZE(self) == 1)
4174 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4175 (Py_UNICODE_ISUPPER(*p) != 0));
4176
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004177 /* Special case for empty strings */
4178 if (PyString_GET_SIZE(self) == 0)
4179 return PyInt_FromLong(0);
4180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 e = p + PyUnicode_GET_SIZE(self);
4182 cased = 0;
4183 previous_is_cased = 0;
4184 for (; p < e; p++) {
4185 register const Py_UNICODE ch = *p;
4186
4187 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4188 if (previous_is_cased)
4189 return PyInt_FromLong(0);
4190 previous_is_cased = 1;
4191 cased = 1;
4192 }
4193 else if (Py_UNICODE_ISLOWER(ch)) {
4194 if (!previous_is_cased)
4195 return PyInt_FromLong(0);
4196 previous_is_cased = 1;
4197 cased = 1;
4198 }
4199 else
4200 previous_is_cased = 0;
4201 }
4202 return PyInt_FromLong(cased);
4203}
4204
4205static char isspace__doc__[] =
4206"S.isspace() -> int\n\
4207\n\
4208Return 1 if there are only whitespace characters in S,\n\
42090 otherwise.";
4210
4211static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004212unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213{
4214 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4215 register const Py_UNICODE *e;
4216
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217 /* Shortcut for single character strings */
4218 if (PyUnicode_GET_SIZE(self) == 1 &&
4219 Py_UNICODE_ISSPACE(*p))
4220 return PyInt_FromLong(1);
4221
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004222 /* Special case for empty strings */
4223 if (PyString_GET_SIZE(self) == 0)
4224 return PyInt_FromLong(0);
4225
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 e = p + PyUnicode_GET_SIZE(self);
4227 for (; p < e; p++) {
4228 if (!Py_UNICODE_ISSPACE(*p))
4229 return PyInt_FromLong(0);
4230 }
4231 return PyInt_FromLong(1);
4232}
4233
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004234static char isalpha__doc__[] =
4235"S.isalpha() -> int\n\
4236\n\
4237Return 1 if all characters in S are alphabetic\n\
4238and there is at least one character in S, 0 otherwise.";
4239
4240static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004241unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004242{
4243 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4244 register const Py_UNICODE *e;
4245
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004246 /* Shortcut for single character strings */
4247 if (PyUnicode_GET_SIZE(self) == 1 &&
4248 Py_UNICODE_ISALPHA(*p))
4249 return PyInt_FromLong(1);
4250
4251 /* Special case for empty strings */
4252 if (PyString_GET_SIZE(self) == 0)
4253 return PyInt_FromLong(0);
4254
4255 e = p + PyUnicode_GET_SIZE(self);
4256 for (; p < e; p++) {
4257 if (!Py_UNICODE_ISALPHA(*p))
4258 return PyInt_FromLong(0);
4259 }
4260 return PyInt_FromLong(1);
4261}
4262
4263static char isalnum__doc__[] =
4264"S.isalnum() -> int\n\
4265\n\
4266Return 1 if all characters in S are alphanumeric\n\
4267and there is at least one character in S, 0 otherwise.";
4268
4269static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004270unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004271{
4272 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4273 register const Py_UNICODE *e;
4274
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004275 /* Shortcut for single character strings */
4276 if (PyUnicode_GET_SIZE(self) == 1 &&
4277 Py_UNICODE_ISALNUM(*p))
4278 return PyInt_FromLong(1);
4279
4280 /* Special case for empty strings */
4281 if (PyString_GET_SIZE(self) == 0)
4282 return PyInt_FromLong(0);
4283
4284 e = p + PyUnicode_GET_SIZE(self);
4285 for (; p < e; p++) {
4286 if (!Py_UNICODE_ISALNUM(*p))
4287 return PyInt_FromLong(0);
4288 }
4289 return PyInt_FromLong(1);
4290}
4291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292static char isdecimal__doc__[] =
4293"S.isdecimal() -> int\n\
4294\n\
4295Return 1 if there are only decimal characters in S,\n\
42960 otherwise.";
4297
4298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004299unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300{
4301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4302 register const Py_UNICODE *e;
4303
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 /* Shortcut for single character strings */
4305 if (PyUnicode_GET_SIZE(self) == 1 &&
4306 Py_UNICODE_ISDECIMAL(*p))
4307 return PyInt_FromLong(1);
4308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004309 /* Special case for empty strings */
4310 if (PyString_GET_SIZE(self) == 0)
4311 return PyInt_FromLong(0);
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 e = p + PyUnicode_GET_SIZE(self);
4314 for (; p < e; p++) {
4315 if (!Py_UNICODE_ISDECIMAL(*p))
4316 return PyInt_FromLong(0);
4317 }
4318 return PyInt_FromLong(1);
4319}
4320
4321static char isdigit__doc__[] =
4322"S.isdigit() -> int\n\
4323\n\
4324Return 1 if there are only digit characters in S,\n\
43250 otherwise.";
4326
4327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004328unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329{
4330 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4331 register const Py_UNICODE *e;
4332
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 /* Shortcut for single character strings */
4334 if (PyUnicode_GET_SIZE(self) == 1 &&
4335 Py_UNICODE_ISDIGIT(*p))
4336 return PyInt_FromLong(1);
4337
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004338 /* Special case for empty strings */
4339 if (PyString_GET_SIZE(self) == 0)
4340 return PyInt_FromLong(0);
4341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 e = p + PyUnicode_GET_SIZE(self);
4343 for (; p < e; p++) {
4344 if (!Py_UNICODE_ISDIGIT(*p))
4345 return PyInt_FromLong(0);
4346 }
4347 return PyInt_FromLong(1);
4348}
4349
4350static char isnumeric__doc__[] =
4351"S.isnumeric() -> int\n\
4352\n\
4353Return 1 if there are only numeric characters in S,\n\
43540 otherwise.";
4355
4356static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004357unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358{
4359 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4360 register const Py_UNICODE *e;
4361
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 /* Shortcut for single character strings */
4363 if (PyUnicode_GET_SIZE(self) == 1 &&
4364 Py_UNICODE_ISNUMERIC(*p))
4365 return PyInt_FromLong(1);
4366
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004367 /* Special case for empty strings */
4368 if (PyString_GET_SIZE(self) == 0)
4369 return PyInt_FromLong(0);
4370
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 e = p + PyUnicode_GET_SIZE(self);
4372 for (; p < e; p++) {
4373 if (!Py_UNICODE_ISNUMERIC(*p))
4374 return PyInt_FromLong(0);
4375 }
4376 return PyInt_FromLong(1);
4377}
4378
4379static char join__doc__[] =
4380"S.join(sequence) -> unicode\n\
4381\n\
4382Return a string which is the concatenation of the strings in the\n\
4383sequence. The separator between elements is S.";
4384
4385static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004386unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004388 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389}
4390
4391static int
4392unicode_length(PyUnicodeObject *self)
4393{
4394 return self->length;
4395}
4396
4397static char ljust__doc__[] =
4398"S.ljust(width) -> unicode\n\
4399\n\
4400Return S left justified in a Unicode string of length width. Padding is\n\
4401done using spaces.";
4402
4403static PyObject *
4404unicode_ljust(PyUnicodeObject *self, PyObject *args)
4405{
4406 int width;
4407 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4408 return NULL;
4409
Tim Peters7a29bd52001-09-12 03:03:31 +00004410 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 Py_INCREF(self);
4412 return (PyObject*) self;
4413 }
4414
4415 return (PyObject*) pad(self, 0, width - self->length, ' ');
4416}
4417
4418static char lower__doc__[] =
4419"S.lower() -> unicode\n\
4420\n\
4421Return a copy of the string S converted to lowercase.";
4422
4423static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004424unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 return fixup(self, fixlower);
4427}
4428
4429static char lstrip__doc__[] =
4430"S.lstrip() -> unicode\n\
4431\n\
4432Return a copy of the string S with leading whitespace removed.";
4433
4434static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004435unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 return strip(self, 1, 0);
4438}
4439
4440static PyObject*
4441unicode_repeat(PyUnicodeObject *str, int len)
4442{
4443 PyUnicodeObject *u;
4444 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004445 int nchars;
4446 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447
4448 if (len < 0)
4449 len = 0;
4450
Tim Peters7a29bd52001-09-12 03:03:31 +00004451 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 /* no repeat, return original string */
4453 Py_INCREF(str);
4454 return (PyObject*) str;
4455 }
Tim Peters8f422462000-09-09 06:13:41 +00004456
4457 /* ensure # of chars needed doesn't overflow int and # of bytes
4458 * needed doesn't overflow size_t
4459 */
4460 nchars = len * str->length;
4461 if (len && nchars / len != str->length) {
4462 PyErr_SetString(PyExc_OverflowError,
4463 "repeated string is too long");
4464 return NULL;
4465 }
4466 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4467 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4468 PyErr_SetString(PyExc_OverflowError,
4469 "repeated string is too long");
4470 return NULL;
4471 }
4472 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 if (!u)
4474 return NULL;
4475
4476 p = u->str;
4477
4478 while (len-- > 0) {
4479 Py_UNICODE_COPY(p, str->str, str->length);
4480 p += str->length;
4481 }
4482
4483 return (PyObject*) u;
4484}
4485
4486PyObject *PyUnicode_Replace(PyObject *obj,
4487 PyObject *subobj,
4488 PyObject *replobj,
4489 int maxcount)
4490{
4491 PyObject *self;
4492 PyObject *str1;
4493 PyObject *str2;
4494 PyObject *result;
4495
4496 self = PyUnicode_FromObject(obj);
4497 if (self == NULL)
4498 return NULL;
4499 str1 = PyUnicode_FromObject(subobj);
4500 if (str1 == NULL) {
4501 Py_DECREF(self);
4502 return NULL;
4503 }
4504 str2 = PyUnicode_FromObject(replobj);
4505 if (str2 == NULL) {
4506 Py_DECREF(self);
4507 Py_DECREF(str1);
4508 return NULL;
4509 }
4510 result = replace((PyUnicodeObject *)self,
4511 (PyUnicodeObject *)str1,
4512 (PyUnicodeObject *)str2,
4513 maxcount);
4514 Py_DECREF(self);
4515 Py_DECREF(str1);
4516 Py_DECREF(str2);
4517 return result;
4518}
4519
4520static char replace__doc__[] =
4521"S.replace (old, new[, maxsplit]) -> unicode\n\
4522\n\
4523Return a copy of S with all occurrences of substring\n\
4524old replaced by new. If the optional argument maxsplit is\n\
4525given, only the first maxsplit occurrences are replaced.";
4526
4527static PyObject*
4528unicode_replace(PyUnicodeObject *self, PyObject *args)
4529{
4530 PyUnicodeObject *str1;
4531 PyUnicodeObject *str2;
4532 int maxcount = -1;
4533 PyObject *result;
4534
4535 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4536 return NULL;
4537 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4538 if (str1 == NULL)
4539 return NULL;
4540 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4541 if (str2 == NULL)
4542 return NULL;
4543
4544 result = replace(self, str1, str2, maxcount);
4545
4546 Py_DECREF(str1);
4547 Py_DECREF(str2);
4548 return result;
4549}
4550
4551static
4552PyObject *unicode_repr(PyObject *unicode)
4553{
4554 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4555 PyUnicode_GET_SIZE(unicode),
4556 1);
4557}
4558
4559static char rfind__doc__[] =
4560"S.rfind(sub [,start [,end]]) -> int\n\
4561\n\
4562Return the highest index in S where substring sub is found,\n\
4563such that sub is contained within s[start,end]. Optional\n\
4564arguments start and end are interpreted as in slice notation.\n\
4565\n\
4566Return -1 on failure.";
4567
4568static PyObject *
4569unicode_rfind(PyUnicodeObject *self, PyObject *args)
4570{
4571 PyUnicodeObject *substring;
4572 int start = 0;
4573 int end = INT_MAX;
4574 PyObject *result;
4575
Guido van Rossumb8872e62000-05-09 14:14:27 +00004576 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4577 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 return NULL;
4579 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4580 (PyObject *)substring);
4581 if (substring == NULL)
4582 return NULL;
4583
4584 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4585
4586 Py_DECREF(substring);
4587 return result;
4588}
4589
4590static char rindex__doc__[] =
4591"S.rindex(sub [,start [,end]]) -> int\n\
4592\n\
4593Like S.rfind() but raise ValueError when the substring is not found.";
4594
4595static PyObject *
4596unicode_rindex(PyUnicodeObject *self, PyObject *args)
4597{
4598 int result;
4599 PyUnicodeObject *substring;
4600 int start = 0;
4601 int end = INT_MAX;
4602
Guido van Rossumb8872e62000-05-09 14:14:27 +00004603 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4604 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 return NULL;
4606 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4607 (PyObject *)substring);
4608 if (substring == NULL)
4609 return NULL;
4610
4611 result = findstring(self, substring, start, end, -1);
4612
4613 Py_DECREF(substring);
4614 if (result < 0) {
4615 PyErr_SetString(PyExc_ValueError, "substring not found");
4616 return NULL;
4617 }
4618 return PyInt_FromLong(result);
4619}
4620
4621static char rjust__doc__[] =
4622"S.rjust(width) -> unicode\n\
4623\n\
4624Return S right justified in a Unicode string of length width. Padding is\n\
4625done using spaces.";
4626
4627static PyObject *
4628unicode_rjust(PyUnicodeObject *self, PyObject *args)
4629{
4630 int width;
4631 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4632 return NULL;
4633
Tim Peters7a29bd52001-09-12 03:03:31 +00004634 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 Py_INCREF(self);
4636 return (PyObject*) self;
4637 }
4638
4639 return (PyObject*) pad(self, width - self->length, 0, ' ');
4640}
4641
4642static char rstrip__doc__[] =
4643"S.rstrip() -> unicode\n\
4644\n\
4645Return a copy of the string S with trailing whitespace removed.";
4646
4647static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004648unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 return strip(self, 0, 1);
4651}
4652
4653static PyObject*
4654unicode_slice(PyUnicodeObject *self, int start, int end)
4655{
4656 /* standard clamping */
4657 if (start < 0)
4658 start = 0;
4659 if (end < 0)
4660 end = 0;
4661 if (end > self->length)
4662 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004663 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 /* full slice, return original string */
4665 Py_INCREF(self);
4666 return (PyObject*) self;
4667 }
4668 if (start > end)
4669 start = end;
4670 /* copy slice */
4671 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4672 end - start);
4673}
4674
4675PyObject *PyUnicode_Split(PyObject *s,
4676 PyObject *sep,
4677 int maxsplit)
4678{
4679 PyObject *result;
4680
4681 s = PyUnicode_FromObject(s);
4682 if (s == NULL)
4683 return NULL;
4684 if (sep != NULL) {
4685 sep = PyUnicode_FromObject(sep);
4686 if (sep == NULL) {
4687 Py_DECREF(s);
4688 return NULL;
4689 }
4690 }
4691
4692 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4693
4694 Py_DECREF(s);
4695 Py_XDECREF(sep);
4696 return result;
4697}
4698
4699static char split__doc__[] =
4700"S.split([sep [,maxsplit]]) -> list of strings\n\
4701\n\
4702Return a list of the words in S, using sep as the\n\
4703delimiter string. If maxsplit is given, at most maxsplit\n\
4704splits are done. If sep is not specified, any whitespace string\n\
4705is a separator.";
4706
4707static PyObject*
4708unicode_split(PyUnicodeObject *self, PyObject *args)
4709{
4710 PyObject *substring = Py_None;
4711 int maxcount = -1;
4712
4713 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4714 return NULL;
4715
4716 if (substring == Py_None)
4717 return split(self, NULL, maxcount);
4718 else if (PyUnicode_Check(substring))
4719 return split(self, (PyUnicodeObject *)substring, maxcount);
4720 else
4721 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4722}
4723
4724static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004725"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726\n\
4727Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004728Line breaks are not included in the resulting list unless keepends\n\
4729is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730
4731static PyObject*
4732unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4733{
Guido van Rossum86662912000-04-11 15:38:46 +00004734 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735
Guido van Rossum86662912000-04-11 15:38:46 +00004736 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 return NULL;
4738
Guido van Rossum86662912000-04-11 15:38:46 +00004739 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740}
4741
4742static
4743PyObject *unicode_str(PyUnicodeObject *self)
4744{
Fred Drakee4315f52000-05-09 19:53:39 +00004745 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746}
4747
4748static char strip__doc__[] =
4749"S.strip() -> unicode\n\
4750\n\
4751Return a copy of S with leading and trailing whitespace removed.";
4752
4753static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004754unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 return strip(self, 1, 1);
4757}
4758
4759static char swapcase__doc__[] =
4760"S.swapcase() -> unicode\n\
4761\n\
4762Return a copy of S with uppercase characters converted to lowercase\n\
4763and vice versa.";
4764
4765static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004766unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 return fixup(self, fixswapcase);
4769}
4770
4771static char translate__doc__[] =
4772"S.translate(table) -> unicode\n\
4773\n\
4774Return a copy of the string S, where all characters have been mapped\n\
4775through the given translation table, which must be a mapping of\n\
4776Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4777are left untouched. Characters mapped to None are deleted.";
4778
4779static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004780unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 return PyUnicode_TranslateCharmap(self->str,
4783 self->length,
4784 table,
4785 "ignore");
4786}
4787
4788static char upper__doc__[] =
4789"S.upper() -> unicode\n\
4790\n\
4791Return a copy of S converted to uppercase.";
4792
4793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004794unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 return fixup(self, fixupper);
4797}
4798
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string S with zeros on the left, to fill a field\n"
"of the specified width. The string S is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).
 *
 * When self is already at least width characters long it is returned
 * as is with an extra reference.  Otherwise pad() is asked for
 * width - len(S) '0' characters on the left, and a leading '+' or '-'
 * of the original string is moved in front of the zeros.
 *
 * Returns NULL when argument parsing or pad() fails.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* the result is dereferenced below, so a failed pad() must stop here */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4834
#if 0
/* Debug helper (currently compiled out): returns the value of
 * unicode_freelist_size wrapped in an int object.
 */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long current = unicode_freelist_size;

    return PyInt_FromLong(current);
}
#endif
4842
4843static char startswith__doc__[] =
4844"S.startswith(prefix[, start[, end]]) -> int\n\
4845\n\
4846Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4847optional start, test S beginning at that position. With optional end, stop\n\
4848comparing S at that position.";
4849
4850static PyObject *
4851unicode_startswith(PyUnicodeObject *self,
4852 PyObject *args)
4853{
4854 PyUnicodeObject *substring;
4855 int start = 0;
4856 int end = INT_MAX;
4857 PyObject *result;
4858
Guido van Rossumb8872e62000-05-09 14:14:27 +00004859 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4860 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 return NULL;
4862 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4863 (PyObject *)substring);
4864 if (substring == NULL)
4865 return NULL;
4866
4867 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4868
4869 Py_DECREF(substring);
4870 return result;
4871}
4872
4873
4874static char endswith__doc__[] =
4875"S.endswith(suffix[, start[, end]]) -> int\n\
4876\n\
4877Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4878optional start, test S beginning at that position. With optional end, stop\n\
4879comparing S at that position.";
4880
4881static PyObject *
4882unicode_endswith(PyUnicodeObject *self,
4883 PyObject *args)
4884{
4885 PyUnicodeObject *substring;
4886 int start = 0;
4887 int end = INT_MAX;
4888 PyObject *result;
4889
Guido van Rossumb8872e62000-05-09 14:14:27 +00004890 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4891 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 return NULL;
4893 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4894 (PyObject *)substring);
4895 if (substring == NULL)
4896 return NULL;
4897
4898 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4899
4900 Py_DECREF(substring);
4901 return result;
4902}
4903
4904
4905static PyMethodDef unicode_methods[] = {
4906
4907 /* Order is according to common usage: often used methods should
4908 appear first, since lookup is done sequentially. */
4909
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004910 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4911 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4912 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4913 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4914 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4915 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4916 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4917 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4918 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4919 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4920 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4921 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4922 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4923 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4924/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4925 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4926 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4927 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4928 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4929 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4930 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4931 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4932 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4933 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4934 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4935 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4936 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4937 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4938 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4939 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4940 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4941 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4942 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4943 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4944 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004946 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4947 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948#endif
4949
4950#if 0
4951 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004952 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953#endif
4954
4955 {NULL, NULL}
4956};
4957
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958static PySequenceMethods unicode_as_sequence = {
4959 (inquiry) unicode_length, /* sq_length */
4960 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4961 (intargfunc) unicode_repeat, /* sq_repeat */
4962 (intargfunc) unicode_getitem, /* sq_item */
4963 (intintargfunc) unicode_slice, /* sq_slice */
4964 0, /* sq_ass_item */
4965 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004966 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967};
4968
4969static int
4970unicode_buffer_getreadbuf(PyUnicodeObject *self,
4971 int index,
4972 const void **ptr)
4973{
4974 if (index != 0) {
4975 PyErr_SetString(PyExc_SystemError,
4976 "accessing non-existent unicode segment");
4977 return -1;
4978 }
4979 *ptr = (void *) self->str;
4980 return PyUnicode_GET_DATA_SIZE(self);
4981}
4982
4983static int
4984unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4985 const void **ptr)
4986{
4987 PyErr_SetString(PyExc_TypeError,
4988 "cannot use unicode as modifyable buffer");
4989 return -1;
4990}
4991
4992static int
4993unicode_buffer_getsegcount(PyUnicodeObject *self,
4994 int *lenp)
4995{
4996 if (lenp)
4997 *lenp = PyUnicode_GET_DATA_SIZE(self);
4998 return 1;
4999}
5000
5001static int
5002unicode_buffer_getcharbuf(PyUnicodeObject *self,
5003 int index,
5004 const void **ptr)
5005{
5006 PyObject *str;
5007
5008 if (index != 0) {
5009 PyErr_SetString(PyExc_SystemError,
5010 "accessing non-existent unicode segment");
5011 return -1;
5012 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005013 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 if (str == NULL)
5015 return -1;
5016 *ptr = (void *) PyString_AS_STRING(str);
5017 return PyString_GET_SIZE(str);
5018}
5019
5020/* Helpers for PyUnicode_Format() */
5021
5022static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005023getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024{
5025 int argidx = *p_argidx;
5026 if (argidx < arglen) {
5027 (*p_argidx)++;
5028 if (arglen < 0)
5029 return args;
5030 else
5031 return PyTuple_GetItem(args, argidx);
5032 }
5033 PyErr_SetString(PyExc_TypeError,
5034 "not enough arguments for format string");
5035 return NULL;
5036}
5037
/* Bit flags collected from the flag characters of a format specifier
   ('-', '+', ' ', '#', '0' respectively). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5043
5044static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046{
5047 register int i;
5048 int len;
5049 va_list va;
5050 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052
5053 /* First, format the string as char array, then expand to Py_UNICODE
5054 array. */
5055 charbuffer = (char *)buffer;
5056 len = vsprintf(charbuffer, format, va);
5057 for (i = len - 1; i >= 0; i--)
5058 buffer[i] = (Py_UNICODE) charbuffer[i];
5059
5060 va_end(va);
5061 return len;
5062}
5063
5064static int
5065formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005066 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 int flags,
5068 int prec,
5069 int type,
5070 PyObject *v)
5071{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005072 /* fmt = '%#.' + `prec` + `type`
5073 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 char fmt[20];
5075 double x;
5076
5077 x = PyFloat_AsDouble(v);
5078 if (x == -1.0 && PyErr_Occurred())
5079 return -1;
5080 if (prec < 0)
5081 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5083 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005084 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5085 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005086 /* worst case length calc to ensure no buffer overrun:
5087 fmt = %#.<prec>g
5088 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5089 for any double rep.)
5090 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5091 If prec=0 the effective precision is 1 (the leading digit is
5092 always given), therefore increase by one to 10+prec. */
5093 if (buflen <= (size_t)10 + (size_t)prec) {
5094 PyErr_SetString(PyExc_OverflowError,
5095 "formatted float is too long (precision too long?)");
5096 return -1;
5097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return usprintf(buf, fmt, x);
5099}
5100
Tim Peters38fd5b62000-09-21 05:43:11 +00005101static PyObject*
5102formatlong(PyObject *val, int flags, int prec, int type)
5103{
5104 char *buf;
5105 int i, len;
5106 PyObject *str; /* temporary string object. */
5107 PyUnicodeObject *result;
5108
5109 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5110 if (!str)
5111 return NULL;
5112 result = _PyUnicode_New(len);
5113 for (i = 0; i < len; i++)
5114 result->str[i] = buf[i];
5115 result->str[len] = 0;
5116 Py_DECREF(str);
5117 return (PyObject*)result;
5118}
5119
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120static int
5121formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005122 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 int flags,
5124 int prec,
5125 int type,
5126 PyObject *v)
5127{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005128 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005129 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5130 + 1 + 1 = 24*/
5131 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005133 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
5135 x = PyInt_AsLong(v);
5136 if (x == -1 && PyErr_Occurred())
5137 return -1;
5138 if (prec < 0)
5139 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005140 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5141 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5142 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5143 PyErr_SetString(PyExc_OverflowError,
5144 "formatted integer is too long (precision too long?)");
5145 return -1;
5146 }
Tim Petersfff53252001-04-12 18:38:48 +00005147 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5148 * but we want it (for consistency with other %#x conversions, and
5149 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005150 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5151 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5152 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005153 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005154 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5155 /* Only way to know what the platform does is to try it. */
Barry Warsawe5c492d2001-11-28 21:00:41 +00005156 PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005157 if (fmt[1] != (char)type) {
5158 /* Supply our own leading 0x/0X -- needed under std C */
5159 use_native_c_format = 0;
Barry Warsawe5c492d2001-11-28 21:00:41 +00005160 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005161 }
5162 }
5163 if (use_native_c_format)
Barry Warsawe5c492d2001-11-28 21:00:41 +00005164 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5165 (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 return usprintf(buf, fmt, x);
5167}
5168
5169static int
5170formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005171 size_t buflen,
5172 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005174 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005175 if (PyUnicode_Check(v)) {
5176 if (PyUnicode_GET_SIZE(v) != 1)
5177 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005181 else if (PyString_Check(v)) {
5182 if (PyString_GET_SIZE(v) != 1)
5183 goto onError;
5184 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
5187 else {
5188 /* Integer input truncated to a character */
5189 long x;
5190 x = PyInt_AsLong(v);
5191 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 buf[0] = (char) x;
5194 }
5195 buf[1] = '\0';
5196 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005197
5198 onError:
5199 PyErr_SetString(PyExc_TypeError,
5200 "%c requires int or char");
5201 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}
5203
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
5213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214PyObject *PyUnicode_Format(PyObject *format,
5215 PyObject *args)
5216{
5217 Py_UNICODE *fmt, *res;
5218 int fmtcnt, rescnt, reslen, arglen, argidx;
5219 int args_owned = 0;
5220 PyUnicodeObject *result = NULL;
5221 PyObject *dict = NULL;
5222 PyObject *uformat;
5223
5224 if (format == NULL || args == NULL) {
5225 PyErr_BadInternalCall();
5226 return NULL;
5227 }
5228 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005229 if (uformat == NULL)
5230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 fmt = PyUnicode_AS_UNICODE(uformat);
5232 fmtcnt = PyUnicode_GET_SIZE(uformat);
5233
5234 reslen = rescnt = fmtcnt + 100;
5235 result = _PyUnicode_New(reslen);
5236 if (result == NULL)
5237 goto onError;
5238 res = PyUnicode_AS_UNICODE(result);
5239
5240 if (PyTuple_Check(args)) {
5241 arglen = PyTuple_Size(args);
5242 argidx = 0;
5243 }
5244 else {
5245 arglen = -1;
5246 argidx = -2;
5247 }
5248 if (args->ob_type->tp_as_mapping)
5249 dict = args;
5250
5251 while (--fmtcnt >= 0) {
5252 if (*fmt != '%') {
5253 if (--rescnt < 0) {
5254 rescnt = fmtcnt + 100;
5255 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005256 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 return NULL;
5258 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5259 --rescnt;
5260 }
5261 *res++ = *fmt++;
5262 }
5263 else {
5264 /* Got a format specifier */
5265 int flags = 0;
5266 int width = -1;
5267 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 Py_UNICODE c = '\0';
5269 Py_UNICODE fill;
5270 PyObject *v = NULL;
5271 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005272 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 Py_UNICODE sign;
5274 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005275 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
5277 fmt++;
5278 if (*fmt == '(') {
5279 Py_UNICODE *keystart;
5280 int keylen;
5281 PyObject *key;
5282 int pcount = 1;
5283
5284 if (dict == NULL) {
5285 PyErr_SetString(PyExc_TypeError,
5286 "format requires a mapping");
5287 goto onError;
5288 }
5289 ++fmt;
5290 --fmtcnt;
5291 keystart = fmt;
5292 /* Skip over balanced parentheses */
5293 while (pcount > 0 && --fmtcnt >= 0) {
5294 if (*fmt == ')')
5295 --pcount;
5296 else if (*fmt == '(')
5297 ++pcount;
5298 fmt++;
5299 }
5300 keylen = fmt - keystart - 1;
5301 if (fmtcnt < 0 || pcount > 0) {
5302 PyErr_SetString(PyExc_ValueError,
5303 "incomplete format key");
5304 goto onError;
5305 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005306#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005307 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 then looked up since Python uses strings to hold
5309 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005310 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 key = PyUnicode_EncodeUTF8(keystart,
5312 keylen,
5313 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005314#else
5315 key = PyUnicode_FromUnicode(keystart, keylen);
5316#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 if (key == NULL)
5318 goto onError;
5319 if (args_owned) {
5320 Py_DECREF(args);
5321 args_owned = 0;
5322 }
5323 args = PyObject_GetItem(dict, key);
5324 Py_DECREF(key);
5325 if (args == NULL) {
5326 goto onError;
5327 }
5328 args_owned = 1;
5329 arglen = -1;
5330 argidx = -2;
5331 }
5332 while (--fmtcnt >= 0) {
5333 switch (c = *fmt++) {
5334 case '-': flags |= F_LJUST; continue;
5335 case '+': flags |= F_SIGN; continue;
5336 case ' ': flags |= F_BLANK; continue;
5337 case '#': flags |= F_ALT; continue;
5338 case '0': flags |= F_ZERO; continue;
5339 }
5340 break;
5341 }
5342 if (c == '*') {
5343 v = getnextarg(args, arglen, &argidx);
5344 if (v == NULL)
5345 goto onError;
5346 if (!PyInt_Check(v)) {
5347 PyErr_SetString(PyExc_TypeError,
5348 "* wants int");
5349 goto onError;
5350 }
5351 width = PyInt_AsLong(v);
5352 if (width < 0) {
5353 flags |= F_LJUST;
5354 width = -width;
5355 }
5356 if (--fmtcnt >= 0)
5357 c = *fmt++;
5358 }
5359 else if (c >= '0' && c <= '9') {
5360 width = c - '0';
5361 while (--fmtcnt >= 0) {
5362 c = *fmt++;
5363 if (c < '0' || c > '9')
5364 break;
5365 if ((width*10) / 10 != width) {
5366 PyErr_SetString(PyExc_ValueError,
5367 "width too big");
5368 goto onError;
5369 }
5370 width = width*10 + (c - '0');
5371 }
5372 }
5373 if (c == '.') {
5374 prec = 0;
5375 if (--fmtcnt >= 0)
5376 c = *fmt++;
5377 if (c == '*') {
5378 v = getnextarg(args, arglen, &argidx);
5379 if (v == NULL)
5380 goto onError;
5381 if (!PyInt_Check(v)) {
5382 PyErr_SetString(PyExc_TypeError,
5383 "* wants int");
5384 goto onError;
5385 }
5386 prec = PyInt_AsLong(v);
5387 if (prec < 0)
5388 prec = 0;
5389 if (--fmtcnt >= 0)
5390 c = *fmt++;
5391 }
5392 else if (c >= '0' && c <= '9') {
5393 prec = c - '0';
5394 while (--fmtcnt >= 0) {
5395 c = Py_CHARMASK(*fmt++);
5396 if (c < '0' || c > '9')
5397 break;
5398 if ((prec*10) / 10 != prec) {
5399 PyErr_SetString(PyExc_ValueError,
5400 "prec too big");
5401 goto onError;
5402 }
5403 prec = prec*10 + (c - '0');
5404 }
5405 }
5406 } /* prec */
5407 if (fmtcnt >= 0) {
5408 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 if (--fmtcnt >= 0)
5410 c = *fmt++;
5411 }
5412 }
5413 if (fmtcnt < 0) {
5414 PyErr_SetString(PyExc_ValueError,
5415 "incomplete format");
5416 goto onError;
5417 }
5418 if (c != '%') {
5419 v = getnextarg(args, arglen, &argidx);
5420 if (v == NULL)
5421 goto onError;
5422 }
5423 sign = 0;
5424 fill = ' ';
5425 switch (c) {
5426
5427 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005428 pbuf = formatbuf;
5429 /* presume that buffer length is at least 1 */
5430 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 len = 1;
5432 break;
5433
5434 case 's':
5435 case 'r':
5436 if (PyUnicode_Check(v) && c == 's') {
5437 temp = v;
5438 Py_INCREF(temp);
5439 }
5440 else {
5441 PyObject *unicode;
5442 if (c == 's')
5443 temp = PyObject_Str(v);
5444 else
5445 temp = PyObject_Repr(v);
5446 if (temp == NULL)
5447 goto onError;
5448 if (!PyString_Check(temp)) {
5449 /* XXX Note: this should never happen, since
5450 PyObject_Repr() and PyObject_Str() assure
5451 this */
5452 Py_DECREF(temp);
5453 PyErr_SetString(PyExc_TypeError,
5454 "%s argument has non-string str()");
5455 goto onError;
5456 }
Fred Drakee4315f52000-05-09 19:53:39 +00005457 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005459 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 "strict");
5461 Py_DECREF(temp);
5462 temp = unicode;
5463 if (temp == NULL)
5464 goto onError;
5465 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005466 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 len = PyUnicode_GET_SIZE(temp);
5468 if (prec >= 0 && len > prec)
5469 len = prec;
5470 break;
5471
5472 case 'i':
5473 case 'd':
5474 case 'u':
5475 case 'o':
5476 case 'x':
5477 case 'X':
5478 if (c == 'i')
5479 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005480 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005481 temp = formatlong(v, flags, prec, c);
5482 if (!temp)
5483 goto onError;
5484 pbuf = PyUnicode_AS_UNICODE(temp);
5485 len = PyUnicode_GET_SIZE(temp);
5486 /* unbounded ints can always produce
5487 a sign character! */
5488 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005490 else {
5491 pbuf = formatbuf;
5492 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5493 flags, prec, c, v);
5494 if (len < 0)
5495 goto onError;
5496 /* only d conversion is signed */
5497 sign = c == 'd';
5498 }
5499 if (flags & F_ZERO)
5500 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 break;
5502
5503 case 'e':
5504 case 'E':
5505 case 'f':
5506 case 'g':
5507 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005508 pbuf = formatbuf;
5509 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5510 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 if (len < 0)
5512 goto onError;
5513 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005514 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 fill = '0';
5516 break;
5517
5518 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005519 pbuf = formatbuf;
5520 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 if (len < 0)
5522 goto onError;
5523 break;
5524
5525 default:
5526 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005527 "unsupported format character '%c' (0x%x) "
5528 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005529 (31<=c && c<=126) ? c : '?',
5530 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 goto onError;
5532 }
5533 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005534 if (*pbuf == '-' || *pbuf == '+') {
5535 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 len--;
5537 }
5538 else if (flags & F_SIGN)
5539 sign = '+';
5540 else if (flags & F_BLANK)
5541 sign = ' ';
5542 else
5543 sign = 0;
5544 }
5545 if (width < len)
5546 width = len;
5547 if (rescnt < width + (sign != 0)) {
5548 reslen -= rescnt;
5549 rescnt = width + fmtcnt + 100;
5550 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005551 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 return NULL;
5553 res = PyUnicode_AS_UNICODE(result)
5554 + reslen - rescnt;
5555 }
5556 if (sign) {
5557 if (fill != ' ')
5558 *res++ = sign;
5559 rescnt--;
5560 if (width > len)
5561 width--;
5562 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005563 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5564 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005565 assert(pbuf[1] == c);
5566 if (fill != ' ') {
5567 *res++ = *pbuf++;
5568 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005569 }
Tim Petersfff53252001-04-12 18:38:48 +00005570 rescnt -= 2;
5571 width -= 2;
5572 if (width < 0)
5573 width = 0;
5574 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 if (width > len && !(flags & F_LJUST)) {
5577 do {
5578 --rescnt;
5579 *res++ = fill;
5580 } while (--width > len);
5581 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005582 if (fill == ' ') {
5583 if (sign)
5584 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005585 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005586 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005587 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005588 *res++ = *pbuf++;
5589 *res++ = *pbuf++;
5590 }
5591 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005592 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 res += len;
5594 rescnt -= len;
5595 while (--width >= len) {
5596 --rescnt;
5597 *res++ = ' ';
5598 }
5599 if (dict && (argidx < arglen) && c != '%') {
5600 PyErr_SetString(PyExc_TypeError,
5601 "not all arguments converted");
5602 goto onError;
5603 }
5604 Py_XDECREF(temp);
5605 } /* '%' */
5606 } /* until end */
5607 if (argidx < arglen && !dict) {
5608 PyErr_SetString(PyExc_TypeError,
5609 "not all arguments converted");
5610 goto onError;
5611 }
5612
5613 if (args_owned) {
5614 Py_DECREF(args);
5615 }
5616 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005617 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 return (PyObject *)result;
5620
5621 onError:
5622 Py_XDECREF(result);
5623 Py_DECREF(uformat);
5624 if (args_owned) {
5625 Py_DECREF(args);
5626 }
5627 return NULL;
5628}
5629
5630static PyBufferProcs unicode_as_buffer = {
5631 (getreadbufferproc) unicode_buffer_getreadbuf,
5632 (getwritebufferproc) unicode_buffer_getwritebuf,
5633 (getsegcountproc) unicode_buffer_getsegcount,
5634 (getcharbufferproc) unicode_buffer_getcharbuf,
5635};
5636
Guido van Rossume023fe02001-08-30 03:12:59 +00005637staticforward PyObject *
5638unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5639
Tim Peters6d6c1a32001-08-02 04:15:00 +00005640static PyObject *
5641unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5642{
5643 PyObject *x = NULL;
5644 static char *kwlist[] = {"string", "encoding", "errors", 0};
5645 char *encoding = NULL;
5646 char *errors = NULL;
5647
Guido van Rossume023fe02001-08-30 03:12:59 +00005648 if (type != &PyUnicode_Type)
5649 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005650 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5651 kwlist, &x, &encoding, &errors))
5652 return NULL;
5653 if (x == NULL)
5654 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005655 if (encoding == NULL && errors == NULL)
5656 return PyObject_Unicode(x);
5657 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005658 return PyUnicode_FromEncodedObject(x, encoding, errors);
5659}
5660
Guido van Rossume023fe02001-08-30 03:12:59 +00005661static PyObject *
5662unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5663{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005664 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005665 int n;
5666
5667 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5668 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5669 if (tmp == NULL)
5670 return NULL;
5671 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005672 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5673 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005674 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005675 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5676 if (pnew->str == NULL) {
5677 _Py_ForgetReference((PyObject *)pnew);
5678 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005679 return NULL;
5680 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005681 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5682 pnew->length = n;
5683 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005684 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005685 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005686}
5687
/* Docstring installed as PyUnicode_Type.tp_doc. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5694
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695PyTypeObject PyUnicode_Type = {
5696 PyObject_HEAD_INIT(&PyType_Type)
5697 0, /* ob_size */
5698 "unicode", /* tp_name */
5699 sizeof(PyUnicodeObject), /* tp_size */
5700 0, /* tp_itemsize */
5701 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005702 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005704 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 0, /* tp_setattr */
5706 (cmpfunc) unicode_compare, /* tp_compare */
5707 (reprfunc) unicode_repr, /* tp_repr */
5708 0, /* tp_as_number */
5709 &unicode_as_sequence, /* tp_as_sequence */
5710 0, /* tp_as_mapping */
5711 (hashfunc) unicode_hash, /* tp_hash*/
5712 0, /* tp_call*/
5713 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005714 PyObject_GenericGetAttr, /* tp_getattro */
5715 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005717 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005718 unicode_doc, /* tp_doc */
5719 0, /* tp_traverse */
5720 0, /* tp_clear */
5721 0, /* tp_richcompare */
5722 0, /* tp_weaklistoffset */
5723 0, /* tp_iter */
5724 0, /* tp_iternext */
5725 unicode_methods, /* tp_methods */
5726 0, /* tp_members */
5727 0, /* tp_getset */
5728 0, /* tp_base */
5729 0, /* tp_dict */
5730 0, /* tp_descr_get */
5731 0, /* tp_descr_set */
5732 0, /* tp_dictoffset */
5733 0, /* tp_init */
5734 0, /* tp_alloc */
5735 unicode_new, /* tp_new */
Guido van Rossum9475a232001-10-05 20:51:39 +00005736 _PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737};
5738
5739/* Initialize the Unicode implementation */
5740
Thomas Wouters78890102000-07-22 19:25:51 +00005741void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005743 int i;
5744
Fred Drakee4315f52000-05-09 19:53:39 +00005745 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005746 unicode_freelist = NULL;
5747 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005749 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005750 for (i = 0; i < 256; i++)
5751 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752}
5753
5754/* Finalize the Unicode implementation */
5755
5756void
Thomas Wouters78890102000-07-22 19:25:51 +00005757_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005759 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005760 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005762 Py_XDECREF(unicode_empty);
5763 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005764
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005765 for (i = 0; i < 256; i++) {
5766 if (unicode_latin1[i]) {
5767 Py_DECREF(unicode_latin1[i]);
5768 unicode_latin1[i] = NULL;
5769 }
5770 }
5771
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005772 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 PyUnicodeObject *v = u;
5774 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005775 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005776 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005777 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005778 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005780 unicode_freelist = NULL;
5781 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}