blob: 68afaa05c85fb25c5e3c1a7932b0878d36f2ee9f [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one extra Py_UNICODE element to make sure the string is
   U+0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* File-private shorthand for PyUnicode_Resize() (for use in
   unicodeobject.c only !): casts the address passed as `unicodevar`
   to PyObject ** so callers can hand in &some_PyUnicodeObject_ptr. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec:
   whether a character is special, i.e. cannot be directly encoded.

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive character codes. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be
   written directly in UTF-7.  Characters above 127 and class-1
   characters always are; whitespace (class 2) and RFC2152 Set O
   (class 3) characters only when the corresponding flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  The range test comes
   first because isalnum() is only defined for values representable
   as unsigned char (or EOF), while c may be any Py_UNICODE value. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* UB64(c): numeric value (0..63) of the base64 digit c. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch,
   write the top 6 pending bits to *out as a base64 digit.  Updates
   out and bits.  Wrapped in do/while(0) so that the macro behaves as
   a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are
   pending in ch, extract the top 16 pending bits as one character
   and store it to *out.  A value in 0xDC00..0xDFFF is not stored:
   surrogate is set, errmsg is assigned and control jumps to the
   utf7Error label of the enclosing function; the 16-bit unit that
   follows is then dropped silently.  Requires `errmsg` and a
   `utf7Error` label in the expanding scope.
   NOTE(review): the comment inherited from the original speaks of
   the high surrogate while the range tested is 0xDC00..0xDFFF --
   confirm which range is intended. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* An error has already been generated for the \
                   preceding surrogate, so do not bother checking \
                   this unit. */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Lookup table indexed by the first byte of a UTF-8 sequence; the value
   is the total number of bytes in that sequence.  Zero marks a byte that
   can never start a sequence (continuation bytes 0x80..0xBF and the
   bytes 0xFE/0xFF).  See RFC 2279 for details.

   Lengths 5 and 6 are listed for completeness; PyUnicode_DecodeUTF8()
   reports them as "unsupported Unicode code range".

   The table is never modified, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001068 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1069 errmsg = "illegal encoding";
1070 goto utf8Error;
1071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001073 *p++ = (Py_UNICODE)ch;
1074 break;
1075
1076 case 4:
1077 if ((s[1] & 0xc0) != 0x80 ||
1078 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001079 (s[3] & 0xc0) != 0x80) {
1080 errmsg = "invalid data";
1081 goto utf8Error;
1082 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001083 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1084 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1085 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001086 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001087 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001088 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001089 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001091 errmsg = "illegal encoding";
1092 goto utf8Error;
1093 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001094#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 *p++ = (Py_UNICODE)ch;
1096#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001097 /* compute and append the two surrogates: */
1098
1099 /* translate from 10000..10FFFF to 0..FFFF */
1100 ch -= 0x10000;
1101
1102 /* high surrogate = top 10 bits added to D800 */
1103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1104
1105 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001107#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 break;
1109
1110 default:
1111 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001112 errmsg = "unsupported Unicode code range";
1113 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
1115 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001116 continue;
1117
1118 utf8Error:
1119 if (utf8_decoding_error(&s, &p, errors, errmsg))
1120 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122
1123 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001124 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 goto onError;
1126
1127 return (PyObject *)unicode;
1128
1129onError:
1130 Py_DECREF(unicode);
1131 return NULL;
1132}
1133
/* Disabled: the encoder handles UTF-16 surrogates itself, so this
   helper has no callers any more. */
#if 0
/* Apply the `errors` policy to a character that cannot be encoded.
   Returns 0 to continue ("ignore", or "replace" which appends '?'),
   -1 with an exception set otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167
1168PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169 int size,
1170 const char *errors)
1171{
1172 PyObject *v;
1173 char *p;
1174 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001175 Py_UCS4 ch2;
1176 unsigned int cbAllocated = 3 * size;
1177 unsigned int cbWritten = 0;
1178 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001180 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 if (v == NULL)
1182 return NULL;
1183 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001184 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185
1186 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001187 while (i < size) {
1188 Py_UCS4 ch = s[i++];
1189 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 cbWritten++;
1192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 else if (ch < 0x0800) {
1194 *p++ = 0xc0 | (ch >> 6);
1195 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 cbWritten += 2;
1197 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001198 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001199 /* Check for high surrogate */
1200 if (0xD800 <= ch && ch <= 0xDBFF) {
1201 if (i != size) {
1202 ch2 = s[i];
1203 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1204
1205 if (cbWritten >= (cbAllocated - 4)) {
1206 /* Provide enough room for some more
1207 surrogates */
1208 cbAllocated += 4*10;
1209 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001211 }
1212
1213 /* combine the two values */
1214 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1215
1216 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001217 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001218 i++;
1219 cbWritten += 4;
1220 }
1221 }
1222 }
1223 else {
1224 *p++ = (char)(0xe0 | (ch >> 12));
1225 cbWritten += 3;
1226 }
1227 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1228 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001229 } else {
1230 *p++ = 0xf0 | (ch>>18);
1231 *p++ = 0x80 | ((ch>>12) & 0x3f);
1232 *p++ = 0x80 | ((ch>>6) & 0x3f);
1233 *p++ = 0x80 | (ch & 0x3f);
1234 cbWritten += 4;
1235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 }
1237 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001238 if (_PyString_Resize(&v, p - q))
1239 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 return v;
1241
1242 onError:
1243 Py_DECREF(v);
1244 return NULL;
1245}
1246
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1248{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 if (!PyUnicode_Check(unicode)) {
1250 PyErr_BadArgument();
1251 return NULL;
1252 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001253 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1254 PyUnicode_GET_SIZE(unicode),
1255 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256}
1257
1258/* --- UTF-16 Codec ------------------------------------------------------- */
1259
1260static
Tim Peters772747b2001-08-09 22:21:55 +00001261int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 const char *errors,
1263 const char *details)
1264{
1265 if ((errors == NULL) ||
1266 (strcmp(errors,"strict") == 0)) {
1267 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001268 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 details);
1270 return -1;
1271 }
1272 else if (strcmp(errors,"ignore") == 0) {
1273 return 0;
1274 }
1275 else if (strcmp(errors,"replace") == 0) {
1276 if (dest) {
1277 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1278 (*dest)++;
1279 }
1280 return 0;
1281 }
1282 else {
1283 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001284 "UTF-16 decoding error; "
1285 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 errors);
1287 return -1;
1288 }
1289}
1290
Tim Peters772747b2001-08-09 22:21:55 +00001291PyObject *
1292PyUnicode_DecodeUTF16(const char *s,
1293 int size,
1294 const char *errors,
1295 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296{
1297 PyUnicodeObject *unicode;
1298 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001299 const unsigned char *q, *e;
1300 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001302 /* Offsets from q for retrieving byte pairs in the right order. */
1303#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1304 int ihi = 1, ilo = 0;
1305#else
1306 int ihi = 0, ilo = 1;
1307#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308
1309 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001310 if (size & 1) {
1311 if (utf16_decoding_error(NULL, errors, "truncated data"))
1312 return NULL;
1313 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 }
1315
1316 /* Note: size will always be longer than the resulting Unicode
1317 character count */
1318 unicode = _PyUnicode_New(size);
1319 if (!unicode)
1320 return NULL;
1321 if (size == 0)
1322 return (PyObject *)unicode;
1323
1324 /* Unpack UTF-16 encoded data */
1325 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001326 q = (unsigned char *)s;
1327 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001330 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001332 /* Check for BOM marks (U+FEFF) in the input and adjust current
1333 byte order setting accordingly. In native mode, the leading BOM
1334 mark is skipped, in all other modes, it is copied to the output
1335 stream as-is (giving a ZWNBSP character). */
1336 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001337 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001338#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001339 if (bom == 0xFEFF) {
1340 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001341 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001342 }
1343 else if (bom == 0xFFFE) {
1344 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345 bo = 1;
1346 }
1347#else
Tim Peters772747b2001-08-09 22:21:55 +00001348 if (bom == 0xFEFF) {
1349 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001350 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001351 }
1352 else if (bom == 0xFFFE) {
1353 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001354 bo = -1;
1355 }
1356#endif
1357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
Tim Peters772747b2001-08-09 22:21:55 +00001359 if (bo == -1) {
1360 /* force LE */
1361 ihi = 1;
1362 ilo = 0;
1363 }
1364 else if (bo == 1) {
1365 /* force BE */
1366 ihi = 0;
1367 ilo = 1;
1368 }
1369
1370 while (q < e) {
1371 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1372 q += 2;
1373
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 if (ch < 0xD800 || ch > 0xDFFF) {
1375 *p++ = ch;
1376 continue;
1377 }
1378
1379 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001380 if (q >= e) {
1381 errmsg = "unexpected end of data";
1382 goto utf16Error;
1383 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001384 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001385 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1386 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001387 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001388#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001389 *p++ = ch;
1390 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001391#else
1392 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001393#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001394 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001395 }
1396 else {
1397 errmsg = "illegal UTF-16 surrogate";
1398 goto utf16Error;
1399 }
1400
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
1403 /* Fall through to report the error */
1404
1405 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001406 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 }
1409
1410 if (byteorder)
1411 *byteorder = bo;
1412
1413 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 goto onError;
1416
1417 return (PyObject *)unicode;
1418
1419onError:
1420 Py_DECREF(unicode);
1421 return NULL;
1422}
1423
Tim Peters772747b2001-08-09 22:21:55 +00001424PyObject *
1425PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1426 int size,
1427 const char *errors,
1428 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
1430 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001431 unsigned char *p;
1432 int i, pairs;
1433 /* Offsets from p for storing byte pairs in the right order. */
1434#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1435 int ihi = 1, ilo = 0;
1436#else
1437 int ihi = 0, ilo = 1;
1438#endif
1439
1440#define STORECHAR(CH) \
1441 do { \
1442 p[ihi] = ((CH) >> 8) & 0xff; \
1443 p[ilo] = (CH) & 0xff; \
1444 p += 2; \
1445 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001447 for (i = pairs = 0; i < size; i++)
1448 if (s[i] >= 0x10000)
1449 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001451 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 if (v == NULL)
1453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454
Tim Peters772747b2001-08-09 22:21:55 +00001455 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001457 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001458 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001459 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001460
1461 if (byteorder == -1) {
1462 /* force LE */
1463 ihi = 1;
1464 ilo = 0;
1465 }
1466 else if (byteorder == 1) {
1467 /* force BE */
1468 ihi = 0;
1469 ilo = 1;
1470 }
1471
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001472 while (size-- > 0) {
1473 Py_UNICODE ch = *s++;
1474 Py_UNICODE ch2 = 0;
1475 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001476 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1477 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478 }
Tim Peters772747b2001-08-09 22:21:55 +00001479 STORECHAR(ch);
1480 if (ch2)
1481 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001484#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485}
1486
1487PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1488{
1489 if (!PyUnicode_Check(unicode)) {
1490 PyErr_BadArgument();
1491 return NULL;
1492 }
1493 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1494 PyUnicode_GET_SIZE(unicode),
1495 NULL,
1496 0);
1497}
1498
1499/* --- Unicode Escape Codec ----------------------------------------------- */
1500
1501static
1502int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001503 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 const char *errors,
1505 const char *details)
1506{
1507 if ((errors == NULL) ||
1508 (strcmp(errors,"strict") == 0)) {
1509 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001510 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 details);
1512 return -1;
1513 }
1514 else if (strcmp(errors,"ignore") == 0) {
1515 return 0;
1516 }
1517 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001518 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 return 0;
1520 }
1521 else {
1522 PyErr_Format(PyExc_ValueError,
1523 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001524 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 errors);
1526 return -1;
1527 }
1528}
1529
Fredrik Lundh06d12682001-01-24 07:59:11 +00001530static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001531
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1533 int size,
1534 const char *errors)
1535{
1536 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001537 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001539 char* message;
1540 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1541
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 /* Escaped strings will always be longer than the resulting
1543 Unicode string, so we start with size here and then reduce the
1544 length after conversion to the true value. */
1545 v = _PyUnicode_New(size);
1546 if (v == NULL)
1547 goto onError;
1548 if (size == 0)
1549 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001550
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551 p = buf = PyUnicode_AS_UNICODE(v);
1552 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 while (s < end) {
1555 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001556 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001557 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
1559 /* Non-escape characters are interpreted as Unicode ordinals */
1560 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001561 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 continue;
1563 }
1564
1565 /* \ - Escapes */
1566 s++;
1567 switch (*s++) {
1568
1569 /* \x escapes */
1570 case '\n': break;
1571 case '\\': *p++ = '\\'; break;
1572 case '\'': *p++ = '\''; break;
1573 case '\"': *p++ = '\"'; break;
1574 case 'b': *p++ = '\b'; break;
1575 case 'f': *p++ = '\014'; break; /* FF */
1576 case 't': *p++ = '\t'; break;
1577 case 'n': *p++ = '\n'; break;
1578 case 'r': *p++ = '\r'; break;
1579 case 'v': *p++ = '\013'; break; /* VT */
1580 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1581
1582 /* \OOO (octal) escapes */
1583 case '0': case '1': case '2': case '3':
1584 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001585 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001587 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001589 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001591 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 break;
1593
Fredrik Lundhccc74732001-02-18 22:13:49 +00001594 /* hex escapes */
1595 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001597 digits = 2;
1598 message = "truncated \\xXX escape";
1599 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
Fredrik Lundhccc74732001-02-18 22:13:49 +00001601 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001603 digits = 4;
1604 message = "truncated \\uXXXX escape";
1605 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
Fredrik Lundhccc74732001-02-18 22:13:49 +00001607 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001608 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001609 digits = 8;
1610 message = "truncated \\UXXXXXXXX escape";
1611 hexescape:
1612 chr = 0;
1613 for (i = 0; i < digits; i++) {
1614 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001615 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001617 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001619 i++;
1620 break;
1621 }
1622 chr = (chr<<4) & ~0xF;
1623 if (c >= '0' && c <= '9')
1624 chr += c - '0';
1625 else if (c >= 'a' && c <= 'f')
1626 chr += 10 + c - 'a';
1627 else
1628 chr += 10 + c - 'A';
1629 }
1630 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001632 /* when we get here, chr is a 32-bit unicode character */
1633 if (chr <= 0xffff)
1634 /* UCS-2 character */
1635 *p++ = (Py_UNICODE) chr;
1636 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001637 /* UCS-4 character. Either store directly, or as
1638 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001639#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640 *p++ = chr;
1641#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001642 chr -= 0x10000L;
1643 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001644 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001645#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001646 } else {
1647 if (unicodeescape_decoding_error(
1648 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001649 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001650 )
1651 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001652 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001653 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001654 break;
1655
1656 /* \N{name} */
1657 case 'N':
1658 message = "malformed \\N character escape";
1659 if (ucnhash_CAPI == NULL) {
1660 /* load the unicode data module */
1661 PyObject *m, *v;
1662 m = PyImport_ImportModule("unicodedata");
1663 if (m == NULL)
1664 goto ucnhashError;
1665 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1666 Py_DECREF(m);
1667 if (v == NULL)
1668 goto ucnhashError;
1669 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1670 Py_DECREF(v);
1671 if (ucnhash_CAPI == NULL)
1672 goto ucnhashError;
1673 }
1674 if (*s == '{') {
1675 const char *start = s+1;
1676 /* look for the closing brace */
1677 while (*s != '}' && s < end)
1678 s++;
1679 if (s > start && s < end && *s == '}') {
1680 /* found a name. look it up in the unicode database */
1681 message = "unknown Unicode character name";
1682 s++;
1683 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1684 goto store;
1685 }
1686 }
1687 if (unicodeescape_decoding_error(&s, &x, errors, message))
1688 goto onError;
1689 *p++ = x;
1690 break;
1691
1692 default:
1693 *p++ = '\\';
1694 *p++ = (unsigned char)s[-1];
1695 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 }
1697 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001698 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 return (PyObject *)v;
1701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001703 PyErr_SetString(
1704 PyExc_UnicodeError,
1705 "\\N escapes not supported (can't load unicodedata module)"
1706 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001707 return NULL;
1708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 Py_XDECREF(v);
1711 return NULL;
1712}
1713
1714/* Return a Unicode-Escape string version of the Unicode object.
1715
1716 If quotes is true, the string is enclosed in u"" or u'' quotes as
1717 appropriate.
1718
1719*/
1720
Barry Warsaw51ac5802000-03-20 16:36:48 +00001721static const Py_UNICODE *findchar(const Py_UNICODE *s,
1722 int size,
1723 Py_UNICODE ch);
1724
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725static
1726PyObject *unicodeescape_string(const Py_UNICODE *s,
1727 int size,
1728 int quotes)
1729{
1730 PyObject *repr;
1731 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001733 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
1735 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1736 if (repr == NULL)
1737 return NULL;
1738
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001739 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740
1741 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 *p++ = 'u';
1743 *p++ = (findchar(s, size, '\'') &&
1744 !findchar(s, size, '"')) ? '"' : '\'';
1745 }
1746 while (size-- > 0) {
1747 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001750 if (quotes &&
1751 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 *p++ = '\\';
1753 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001754 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001756
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001757#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001758 /* Map 21-bit characters to '\U00xxxxxx' */
1759 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001760 int offset = p - PyString_AS_STRING(repr);
1761
1762 /* Resize the string if necessary */
1763 if (offset + 12 > PyString_GET_SIZE(repr)) {
1764 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1765 goto onError;
1766 p = PyString_AS_STRING(repr) + offset;
1767 }
1768
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001769 *p++ = '\\';
1770 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001771 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1772 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1773 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1774 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1777 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001778 *p++ = hexdigit[ch & 0x0000000F];
1779 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001780 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001781#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001782 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1783 else if (ch >= 0xD800 && ch < 0xDC00) {
1784 Py_UNICODE ch2;
1785 Py_UCS4 ucs;
1786
1787 ch2 = *s++;
1788 size--;
1789 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1790 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1791 *p++ = '\\';
1792 *p++ = 'U';
1793 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1794 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1795 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1796 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1799 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1800 *p++ = hexdigit[ucs & 0x0000000F];
1801 continue;
1802 }
1803 /* Fall through: isolated surrogates are copied as-is */
1804 s--;
1805 size++;
1806 }
1807
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001809 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 *p++ = '\\';
1811 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001812 *p++ = hexdigit[(ch >> 12) & 0x000F];
1813 *p++ = hexdigit[(ch >> 8) & 0x000F];
1814 *p++ = hexdigit[(ch >> 4) & 0x000F];
1815 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001817
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001818 /* Map special whitespace to '\t', \n', '\r' */
1819 else if (ch == '\t') {
1820 *p++ = '\\';
1821 *p++ = 't';
1822 }
1823 else if (ch == '\n') {
1824 *p++ = '\\';
1825 *p++ = 'n';
1826 }
1827 else if (ch == '\r') {
1828 *p++ = '\\';
1829 *p++ = 'r';
1830 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001831
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001832 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001833 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001835 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001836 *p++ = hexdigit[(ch >> 4) & 0x000F];
1837 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001839
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 /* Copy everything else as-is */
1841 else
1842 *p++ = (char) ch;
1843 }
1844 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001845 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846
1847 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001848 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001849 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850
1851 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001852
1853 onError:
1854 Py_DECREF(repr);
1855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
1858PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1859 int size)
1860{
1861 return unicodeescape_string(s, size, 0);
1862}
1863
1864PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1865{
1866 if (!PyUnicode_Check(unicode)) {
1867 PyErr_BadArgument();
1868 return NULL;
1869 }
1870 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1871 PyUnicode_GET_SIZE(unicode));
1872}
1873
1874/* --- Raw Unicode Escape Codec ------------------------------------------- */
1875
1876PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1877 int size,
1878 const char *errors)
1879{
1880 PyUnicodeObject *v;
1881 Py_UNICODE *p, *buf;
1882 const char *end;
1883 const char *bs;
1884
1885 /* Escaped strings will always be longer than the resulting
1886 Unicode string, so we start with size here and then reduce the
1887 length after conversion to the true value. */
1888 v = _PyUnicode_New(size);
1889 if (v == NULL)
1890 goto onError;
1891 if (size == 0)
1892 return (PyObject *)v;
1893 p = buf = PyUnicode_AS_UNICODE(v);
1894 end = s + size;
1895 while (s < end) {
1896 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001897 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 int i;
1899
1900 /* Non-escape characters are interpreted as Unicode ordinals */
1901 if (*s != '\\') {
1902 *p++ = (unsigned char)*s++;
1903 continue;
1904 }
1905
1906 /* \u-escapes are only interpreted iff the number of leading
1907 backslashes if odd */
1908 bs = s;
1909 for (;s < end;) {
1910 if (*s != '\\')
1911 break;
1912 *p++ = (unsigned char)*s++;
1913 }
1914 if (((s - bs) & 1) == 0 ||
1915 s >= end ||
1916 *s != 'u') {
1917 continue;
1918 }
1919 p--;
1920 s++;
1921
1922 /* \uXXXX with 4 hex digits */
1923 for (x = 0, i = 0; i < 4; i++) {
1924 c = (unsigned char)s[i];
1925 if (!isxdigit(c)) {
1926 if (unicodeescape_decoding_error(&s, &x, errors,
1927 "truncated \\uXXXX"))
1928 goto onError;
1929 i++;
1930 break;
1931 }
1932 x = (x<<4) & ~0xF;
1933 if (c >= '0' && c <= '9')
1934 x += c - '0';
1935 else if (c >= 'a' && c <= 'f')
1936 x += 10 + c - 'a';
1937 else
1938 x += 10 + c - 'A';
1939 }
1940 s += i;
1941 *p++ = x;
1942 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001943 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001944 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 return (PyObject *)v;
1946
1947 onError:
1948 Py_XDECREF(v);
1949 return NULL;
1950}
1951
1952PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1953 int size)
1954{
1955 PyObject *repr;
1956 char *p;
1957 char *q;
1958
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001959 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960
1961 repr = PyString_FromStringAndSize(NULL, 6 * size);
1962 if (repr == NULL)
1963 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001964 if (size == 0)
1965 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
1967 p = q = PyString_AS_STRING(repr);
1968 while (size-- > 0) {
1969 Py_UNICODE ch = *s++;
1970 /* Map 16-bit characters to '\uxxxx' */
1971 if (ch >= 256) {
1972 *p++ = '\\';
1973 *p++ = 'u';
1974 *p++ = hexdigit[(ch >> 12) & 0xf];
1975 *p++ = hexdigit[(ch >> 8) & 0xf];
1976 *p++ = hexdigit[(ch >> 4) & 0xf];
1977 *p++ = hexdigit[ch & 15];
1978 }
1979 /* Copy everything else as-is */
1980 else
1981 *p++ = (char) ch;
1982 }
1983 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001984 if (_PyString_Resize(&repr, p - q))
1985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986
1987 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001988
1989 onError:
1990 Py_DECREF(repr);
1991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992}
1993
1994PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1995{
1996 if (!PyUnicode_Check(unicode)) {
1997 PyErr_BadArgument();
1998 return NULL;
1999 }
2000 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2001 PyUnicode_GET_SIZE(unicode));
2002}
2003
2004/* --- Latin-1 Codec ------------------------------------------------------ */
2005
2006PyObject *PyUnicode_DecodeLatin1(const char *s,
2007 int size,
2008 const char *errors)
2009{
2010 PyUnicodeObject *v;
2011 Py_UNICODE *p;
2012
2013 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002014 if (size == 1 && *(unsigned char*)s < 256) {
2015 Py_UNICODE r = *(unsigned char*)s;
2016 return PyUnicode_FromUnicode(&r, 1);
2017 }
2018
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 v = _PyUnicode_New(size);
2020 if (v == NULL)
2021 goto onError;
2022 if (size == 0)
2023 return (PyObject *)v;
2024 p = PyUnicode_AS_UNICODE(v);
2025 while (size-- > 0)
2026 *p++ = (unsigned char)*s++;
2027 return (PyObject *)v;
2028
2029 onError:
2030 Py_XDECREF(v);
2031 return NULL;
2032}
2033
2034static
2035int latin1_encoding_error(const Py_UNICODE **source,
2036 char **dest,
2037 const char *errors,
2038 const char *details)
2039{
2040 if ((errors == NULL) ||
2041 (strcmp(errors,"strict") == 0)) {
2042 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002043 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 details);
2045 return -1;
2046 }
2047 else if (strcmp(errors,"ignore") == 0) {
2048 return 0;
2049 }
2050 else if (strcmp(errors,"replace") == 0) {
2051 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002052 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 return 0;
2054 }
2055 else {
2056 PyErr_Format(PyExc_ValueError,
2057 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002058 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 errors);
2060 return -1;
2061 }
2062}
2063
2064PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2065 int size,
2066 const char *errors)
2067{
2068 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002069 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002070
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 repr = PyString_FromStringAndSize(NULL, size);
2072 if (repr == NULL)
2073 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002074 if (size == 0)
2075 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076
2077 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002078 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 while (size-- > 0) {
2080 Py_UNICODE ch = *p++;
2081 if (ch >= 256) {
2082 if (latin1_encoding_error(&p, &s, errors,
2083 "ordinal not in range(256)"))
2084 goto onError;
2085 }
2086 else
2087 *s++ = (char)ch;
2088 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002089 /* Resize if error handling skipped some characters */
2090 if (s - start < PyString_GET_SIZE(repr))
2091 if (_PyString_Resize(&repr, s - start))
2092 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 return repr;
2094
2095 onError:
2096 Py_DECREF(repr);
2097 return NULL;
2098}
2099
2100PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2101{
2102 if (!PyUnicode_Check(unicode)) {
2103 PyErr_BadArgument();
2104 return NULL;
2105 }
2106 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2107 PyUnicode_GET_SIZE(unicode),
2108 NULL);
2109}
2110
2111/* --- 7-bit ASCII Codec -------------------------------------------------- */
2112
2113static
2114int ascii_decoding_error(const char **source,
2115 Py_UNICODE **dest,
2116 const char *errors,
2117 const char *details)
2118{
2119 if ((errors == NULL) ||
2120 (strcmp(errors,"strict") == 0)) {
2121 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002122 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 details);
2124 return -1;
2125 }
2126 else if (strcmp(errors,"ignore") == 0) {
2127 return 0;
2128 }
2129 else if (strcmp(errors,"replace") == 0) {
2130 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2131 (*dest)++;
2132 return 0;
2133 }
2134 else {
2135 PyErr_Format(PyExc_ValueError,
2136 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002137 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 errors);
2139 return -1;
2140 }
2141}
2142
2143PyObject *PyUnicode_DecodeASCII(const char *s,
2144 int size,
2145 const char *errors)
2146{
2147 PyUnicodeObject *v;
2148 Py_UNICODE *p;
2149
2150 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002151 if (size == 1 && *(unsigned char*)s < 128) {
2152 Py_UNICODE r = *(unsigned char*)s;
2153 return PyUnicode_FromUnicode(&r, 1);
2154 }
2155
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 v = _PyUnicode_New(size);
2157 if (v == NULL)
2158 goto onError;
2159 if (size == 0)
2160 return (PyObject *)v;
2161 p = PyUnicode_AS_UNICODE(v);
2162 while (size-- > 0) {
2163 register unsigned char c;
2164
2165 c = (unsigned char)*s++;
2166 if (c < 128)
2167 *p++ = c;
2168 else if (ascii_decoding_error(&s, &p, errors,
2169 "ordinal not in range(128)"))
2170 goto onError;
2171 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002172 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002173 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 return (PyObject *)v;
2176
2177 onError:
2178 Py_XDECREF(v);
2179 return NULL;
2180}
2181
2182static
2183int ascii_encoding_error(const Py_UNICODE **source,
2184 char **dest,
2185 const char *errors,
2186 const char *details)
2187{
2188 if ((errors == NULL) ||
2189 (strcmp(errors,"strict") == 0)) {
2190 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002191 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 details);
2193 return -1;
2194 }
2195 else if (strcmp(errors,"ignore") == 0) {
2196 return 0;
2197 }
2198 else if (strcmp(errors,"replace") == 0) {
2199 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002200 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 return 0;
2202 }
2203 else {
2204 PyErr_Format(PyExc_ValueError,
2205 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002206 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 errors);
2208 return -1;
2209 }
2210}
2211
2212PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2213 int size,
2214 const char *errors)
2215{
2216 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002217 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002218
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 repr = PyString_FromStringAndSize(NULL, size);
2220 if (repr == NULL)
2221 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002222 if (size == 0)
2223 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224
2225 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002226 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 while (size-- > 0) {
2228 Py_UNICODE ch = *p++;
2229 if (ch >= 128) {
2230 if (ascii_encoding_error(&p, &s, errors,
2231 "ordinal not in range(128)"))
2232 goto onError;
2233 }
2234 else
2235 *s++ = (char)ch;
2236 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002237 /* Resize if error handling skipped some characters */
2238 if (s - start < PyString_GET_SIZE(repr))
2239 if (_PyString_Resize(&repr, s - start))
2240 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 return repr;
2242
2243 onError:
2244 Py_DECREF(repr);
2245 return NULL;
2246}
2247
2248PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2249{
2250 if (!PyUnicode_Check(unicode)) {
2251 PyErr_BadArgument();
2252 return NULL;
2253 }
2254 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2255 PyUnicode_GET_SIZE(unicode),
2256 NULL);
2257}
2258
Fredrik Lundh30831632001-06-26 15:11:00 +00002259#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002260
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002261/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002262
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002263PyObject *PyUnicode_DecodeMBCS(const char *s,
2264 int size,
2265 const char *errors)
2266{
2267 PyUnicodeObject *v;
2268 Py_UNICODE *p;
2269
2270 /* First get the size of the result */
2271 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002272 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002273 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2274
2275 v = _PyUnicode_New(usize);
2276 if (v == NULL)
2277 return NULL;
2278 if (usize == 0)
2279 return (PyObject *)v;
2280 p = PyUnicode_AS_UNICODE(v);
2281 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2282 Py_DECREF(v);
2283 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2284 }
2285
2286 return (PyObject *)v;
2287}
2288
2289PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2290 int size,
2291 const char *errors)
2292{
2293 PyObject *repr;
2294 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002295 DWORD mbcssize;
2296
2297 /* If there are no characters, bail now! */
2298 if (size==0)
2299 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002300
2301 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002302 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002303 if (mbcssize==0)
2304 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2305
2306 repr = PyString_FromStringAndSize(NULL, mbcssize);
2307 if (repr == NULL)
2308 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002309 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002310 return repr;
2311
2312 /* Do the conversion */
2313 s = PyString_AS_STRING(repr);
2314 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2315 Py_DECREF(repr);
2316 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2317 }
2318 return repr;
2319}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002320
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002321#endif /* MS_WIN32 */
2322
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323/* --- Character Mapping Codec -------------------------------------------- */
2324
2325static
2326int charmap_decoding_error(const char **source,
2327 Py_UNICODE **dest,
2328 const char *errors,
2329 const char *details)
2330{
2331 if ((errors == NULL) ||
2332 (strcmp(errors,"strict") == 0)) {
2333 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002334 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335 details);
2336 return -1;
2337 }
2338 else if (strcmp(errors,"ignore") == 0) {
2339 return 0;
2340 }
2341 else if (strcmp(errors,"replace") == 0) {
2342 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2343 (*dest)++;
2344 return 0;
2345 }
2346 else {
2347 PyErr_Format(PyExc_ValueError,
2348 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002349 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 errors);
2351 return -1;
2352 }
2353}
2354
2355PyObject *PyUnicode_DecodeCharmap(const char *s,
2356 int size,
2357 PyObject *mapping,
2358 const char *errors)
2359{
2360 PyUnicodeObject *v;
2361 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002362 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363
2364 /* Default to Latin-1 */
2365 if (mapping == NULL)
2366 return PyUnicode_DecodeLatin1(s, size, errors);
2367
2368 v = _PyUnicode_New(size);
2369 if (v == NULL)
2370 goto onError;
2371 if (size == 0)
2372 return (PyObject *)v;
2373 p = PyUnicode_AS_UNICODE(v);
2374 while (size-- > 0) {
2375 unsigned char ch = *s++;
2376 PyObject *w, *x;
2377
2378 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2379 w = PyInt_FromLong((long)ch);
2380 if (w == NULL)
2381 goto onError;
2382 x = PyObject_GetItem(mapping, w);
2383 Py_DECREF(w);
2384 if (x == NULL) {
2385 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002386 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002388 x = Py_None;
2389 Py_INCREF(x);
2390 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392 }
2393
2394 /* Apply mapping */
2395 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002396 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 if (value < 0 || value > 65535) {
2398 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002399 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 Py_DECREF(x);
2401 goto onError;
2402 }
2403 *p++ = (Py_UNICODE)value;
2404 }
2405 else if (x == Py_None) {
2406 /* undefined mapping */
2407 if (charmap_decoding_error(&s, &p, errors,
2408 "character maps to <undefined>")) {
2409 Py_DECREF(x);
2410 goto onError;
2411 }
2412 }
2413 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002414 int targetsize = PyUnicode_GET_SIZE(x);
2415
2416 if (targetsize == 1)
2417 /* 1-1 mapping */
2418 *p++ = *PyUnicode_AS_UNICODE(x);
2419
2420 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002422 if (targetsize > extrachars) {
2423 /* resize first */
2424 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2425 int needed = (targetsize - extrachars) + \
2426 (targetsize << 2);
2427 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002428 if (_PyUnicode_Resize(&v,
2429 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002430 Py_DECREF(x);
2431 goto onError;
2432 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002433 p = PyUnicode_AS_UNICODE(v) + oldpos;
2434 }
2435 Py_UNICODE_COPY(p,
2436 PyUnicode_AS_UNICODE(x),
2437 targetsize);
2438 p += targetsize;
2439 extrachars -= targetsize;
2440 }
2441 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 }
2443 else {
2444 /* wrong return value */
2445 PyErr_SetString(PyExc_TypeError,
2446 "character mapping must return integer, None or unicode");
2447 Py_DECREF(x);
2448 goto onError;
2449 }
2450 Py_DECREF(x);
2451 }
2452 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002453 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 goto onError;
2455 return (PyObject *)v;
2456
2457 onError:
2458 Py_XDECREF(v);
2459 return NULL;
2460}
2461
2462static
2463int charmap_encoding_error(const Py_UNICODE **source,
2464 char **dest,
2465 const char *errors,
2466 const char *details)
2467{
2468 if ((errors == NULL) ||
2469 (strcmp(errors,"strict") == 0)) {
2470 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002471 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 details);
2473 return -1;
2474 }
2475 else if (strcmp(errors,"ignore") == 0) {
2476 return 0;
2477 }
2478 else if (strcmp(errors,"replace") == 0) {
2479 **dest = '?';
2480 (*dest)++;
2481 return 0;
2482 }
2483 else {
2484 PyErr_Format(PyExc_ValueError,
2485 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002486 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 errors);
2488 return -1;
2489 }
2490}
2491
2492PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2493 int size,
2494 PyObject *mapping,
2495 const char *errors)
2496{
2497 PyObject *v;
2498 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002499 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
2501 /* Default to Latin-1 */
2502 if (mapping == NULL)
2503 return PyUnicode_EncodeLatin1(p, size, errors);
2504
2505 v = PyString_FromStringAndSize(NULL, size);
2506 if (v == NULL)
2507 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002508 if (size == 0)
2509 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 s = PyString_AS_STRING(v);
2511 while (size-- > 0) {
2512 Py_UNICODE ch = *p++;
2513 PyObject *w, *x;
2514
2515 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2516 w = PyInt_FromLong((long)ch);
2517 if (w == NULL)
2518 goto onError;
2519 x = PyObject_GetItem(mapping, w);
2520 Py_DECREF(w);
2521 if (x == NULL) {
2522 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002523 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002525 x = Py_None;
2526 Py_INCREF(x);
2527 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002528 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 }
2530
2531 /* Apply mapping */
2532 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002533 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 if (value < 0 || value > 255) {
2535 PyErr_SetString(PyExc_TypeError,
2536 "character mapping must be in range(256)");
2537 Py_DECREF(x);
2538 goto onError;
2539 }
2540 *s++ = (char)value;
2541 }
2542 else if (x == Py_None) {
2543 /* undefined mapping */
2544 if (charmap_encoding_error(&p, &s, errors,
2545 "character maps to <undefined>")) {
2546 Py_DECREF(x);
2547 goto onError;
2548 }
2549 }
2550 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002551 int targetsize = PyString_GET_SIZE(x);
2552
2553 if (targetsize == 1)
2554 /* 1-1 mapping */
2555 *s++ = *PyString_AS_STRING(x);
2556
2557 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002559 if (targetsize > extrachars) {
2560 /* resize first */
2561 int oldpos = (int)(s - PyString_AS_STRING(v));
2562 int needed = (targetsize - extrachars) + \
2563 (targetsize << 2);
2564 extrachars += needed;
2565 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002566 Py_DECREF(x);
2567 goto onError;
2568 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002569 s = PyString_AS_STRING(v) + oldpos;
2570 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002571 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002572 s += targetsize;
2573 extrachars -= targetsize;
2574 }
2575 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 }
2577 else {
2578 /* wrong return value */
2579 PyErr_SetString(PyExc_TypeError,
2580 "character mapping must return integer, None or unicode");
2581 Py_DECREF(x);
2582 goto onError;
2583 }
2584 Py_DECREF(x);
2585 }
2586 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2587 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2588 goto onError;
2589 return v;
2590
2591 onError:
2592 Py_DECREF(v);
2593 return NULL;
2594}
2595
2596PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2597 PyObject *mapping)
2598{
2599 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2600 PyErr_BadArgument();
2601 return NULL;
2602 }
2603 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2604 PyUnicode_GET_SIZE(unicode),
2605 mapping,
2606 NULL);
2607}
2608
2609static
2610int translate_error(const Py_UNICODE **source,
2611 Py_UNICODE **dest,
2612 const char *errors,
2613 const char *details)
2614{
2615 if ((errors == NULL) ||
2616 (strcmp(errors,"strict") == 0)) {
2617 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002618 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 details);
2620 return -1;
2621 }
2622 else if (strcmp(errors,"ignore") == 0) {
2623 return 0;
2624 }
2625 else if (strcmp(errors,"replace") == 0) {
2626 **dest = '?';
2627 (*dest)++;
2628 return 0;
2629 }
2630 else {
2631 PyErr_Format(PyExc_ValueError,
2632 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002633 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 errors);
2635 return -1;
2636 }
2637}
2638
2639PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2640 int size,
2641 PyObject *mapping,
2642 const char *errors)
2643{
2644 PyUnicodeObject *v;
2645 Py_UNICODE *p;
2646
2647 if (mapping == NULL) {
2648 PyErr_BadArgument();
2649 return NULL;
2650 }
2651
2652 /* Output will never be longer than input */
2653 v = _PyUnicode_New(size);
2654 if (v == NULL)
2655 goto onError;
2656 if (size == 0)
2657 goto done;
2658 p = PyUnicode_AS_UNICODE(v);
2659 while (size-- > 0) {
2660 Py_UNICODE ch = *s++;
2661 PyObject *w, *x;
2662
2663 /* Get mapping */
2664 w = PyInt_FromLong(ch);
2665 if (w == NULL)
2666 goto onError;
2667 x = PyObject_GetItem(mapping, w);
2668 Py_DECREF(w);
2669 if (x == NULL) {
2670 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2671 /* No mapping found: default to 1-1 mapping */
2672 PyErr_Clear();
2673 *p++ = ch;
2674 continue;
2675 }
2676 goto onError;
2677 }
2678
2679 /* Apply mapping */
2680 if (PyInt_Check(x))
2681 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2682 else if (x == Py_None) {
2683 /* undefined mapping */
2684 if (translate_error(&s, &p, errors,
2685 "character maps to <undefined>")) {
2686 Py_DECREF(x);
2687 goto onError;
2688 }
2689 }
2690 else if (PyUnicode_Check(x)) {
2691 if (PyUnicode_GET_SIZE(x) != 1) {
2692 /* 1-n mapping */
2693 PyErr_SetString(PyExc_NotImplementedError,
2694 "1-n mappings are currently not implemented");
2695 Py_DECREF(x);
2696 goto onError;
2697 }
2698 *p++ = *PyUnicode_AS_UNICODE(x);
2699 }
2700 else {
2701 /* wrong return value */
2702 PyErr_SetString(PyExc_TypeError,
2703 "translate mapping must return integer, None or unicode");
2704 Py_DECREF(x);
2705 goto onError;
2706 }
2707 Py_DECREF(x);
2708 }
2709 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002710 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002711 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712
2713 done:
2714 return (PyObject *)v;
2715
2716 onError:
2717 Py_XDECREF(v);
2718 return NULL;
2719}
2720
2721PyObject *PyUnicode_Translate(PyObject *str,
2722 PyObject *mapping,
2723 const char *errors)
2724{
2725 PyObject *result;
2726
2727 str = PyUnicode_FromObject(str);
2728 if (str == NULL)
2729 goto onError;
2730 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2731 PyUnicode_GET_SIZE(str),
2732 mapping,
2733 errors);
2734 Py_DECREF(str);
2735 return result;
2736
2737 onError:
2738 Py_XDECREF(str);
2739 return NULL;
2740}
2741
Guido van Rossum9e896b32000-04-05 20:11:21 +00002742/* --- Decimal Encoder ---------------------------------------------------- */
2743
2744int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2745 int length,
2746 char *output,
2747 const char *errors)
2748{
2749 Py_UNICODE *p, *end;
2750
2751 if (output == NULL) {
2752 PyErr_BadArgument();
2753 return -1;
2754 }
2755
2756 p = s;
2757 end = s + length;
2758 while (p < end) {
2759 register Py_UNICODE ch = *p++;
2760 int decimal;
2761
2762 if (Py_UNICODE_ISSPACE(ch)) {
2763 *output++ = ' ';
2764 continue;
2765 }
2766 decimal = Py_UNICODE_TODECIMAL(ch);
2767 if (decimal >= 0) {
2768 *output++ = '0' + decimal;
2769 continue;
2770 }
Guido van Rossumba477042000-04-06 18:18:10 +00002771 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002772 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002773 continue;
2774 }
2775 /* All other characters are considered invalid */
2776 if (errors == NULL || strcmp(errors, "strict") == 0) {
2777 PyErr_SetString(PyExc_ValueError,
2778 "invalid decimal Unicode string");
2779 goto onError;
2780 }
2781 else if (strcmp(errors, "ignore") == 0)
2782 continue;
2783 else if (strcmp(errors, "replace") == 0) {
2784 *output++ = '?';
2785 continue;
2786 }
2787 }
2788 /* 0-terminate the output string */
2789 *output++ = '\0';
2790 return 0;
2791
2792 onError:
2793 return -1;
2794}
2795
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796/* --- Helpers ------------------------------------------------------------ */
2797
2798static
2799int count(PyUnicodeObject *self,
2800 int start,
2801 int end,
2802 PyUnicodeObject *substring)
2803{
2804 int count = 0;
2805
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002806 if (start < 0)
2807 start += self->length;
2808 if (start < 0)
2809 start = 0;
2810 if (end > self->length)
2811 end = self->length;
2812 if (end < 0)
2813 end += self->length;
2814 if (end < 0)
2815 end = 0;
2816
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002817 if (substring->length == 0)
2818 return (end - start + 1);
2819
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 end -= substring->length;
2821
2822 while (start <= end)
2823 if (Py_UNICODE_MATCH(self, start, substring)) {
2824 count++;
2825 start += substring->length;
2826 } else
2827 start++;
2828
2829 return count;
2830}
2831
2832int PyUnicode_Count(PyObject *str,
2833 PyObject *substr,
2834 int start,
2835 int end)
2836{
2837 int result;
2838
2839 str = PyUnicode_FromObject(str);
2840 if (str == NULL)
2841 return -1;
2842 substr = PyUnicode_FromObject(substr);
2843 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002844 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 return -1;
2846 }
2847
2848 result = count((PyUnicodeObject *)str,
2849 start, end,
2850 (PyUnicodeObject *)substr);
2851
2852 Py_DECREF(str);
2853 Py_DECREF(substr);
2854 return result;
2855}
2856
2857static
2858int findstring(PyUnicodeObject *self,
2859 PyUnicodeObject *substring,
2860 int start,
2861 int end,
2862 int direction)
2863{
2864 if (start < 0)
2865 start += self->length;
2866 if (start < 0)
2867 start = 0;
2868
2869 if (substring->length == 0)
2870 return start;
2871
2872 if (end > self->length)
2873 end = self->length;
2874 if (end < 0)
2875 end += self->length;
2876 if (end < 0)
2877 end = 0;
2878
2879 end -= substring->length;
2880
2881 if (direction < 0) {
2882 for (; end >= start; end--)
2883 if (Py_UNICODE_MATCH(self, end, substring))
2884 return end;
2885 } else {
2886 for (; start <= end; start++)
2887 if (Py_UNICODE_MATCH(self, start, substring))
2888 return start;
2889 }
2890
2891 return -1;
2892}
2893
2894int PyUnicode_Find(PyObject *str,
2895 PyObject *substr,
2896 int start,
2897 int end,
2898 int direction)
2899{
2900 int result;
2901
2902 str = PyUnicode_FromObject(str);
2903 if (str == NULL)
2904 return -1;
2905 substr = PyUnicode_FromObject(substr);
2906 if (substr == NULL) {
2907 Py_DECREF(substr);
2908 return -1;
2909 }
2910
2911 result = findstring((PyUnicodeObject *)str,
2912 (PyUnicodeObject *)substr,
2913 start, end, direction);
2914 Py_DECREF(str);
2915 Py_DECREF(substr);
2916 return result;
2917}
2918
2919static
2920int tailmatch(PyUnicodeObject *self,
2921 PyUnicodeObject *substring,
2922 int start,
2923 int end,
2924 int direction)
2925{
2926 if (start < 0)
2927 start += self->length;
2928 if (start < 0)
2929 start = 0;
2930
2931 if (substring->length == 0)
2932 return 1;
2933
2934 if (end > self->length)
2935 end = self->length;
2936 if (end < 0)
2937 end += self->length;
2938 if (end < 0)
2939 end = 0;
2940
2941 end -= substring->length;
2942 if (end < start)
2943 return 0;
2944
2945 if (direction > 0) {
2946 if (Py_UNICODE_MATCH(self, end, substring))
2947 return 1;
2948 } else {
2949 if (Py_UNICODE_MATCH(self, start, substring))
2950 return 1;
2951 }
2952
2953 return 0;
2954}
2955
2956int PyUnicode_Tailmatch(PyObject *str,
2957 PyObject *substr,
2958 int start,
2959 int end,
2960 int direction)
2961{
2962 int result;
2963
2964 str = PyUnicode_FromObject(str);
2965 if (str == NULL)
2966 return -1;
2967 substr = PyUnicode_FromObject(substr);
2968 if (substr == NULL) {
2969 Py_DECREF(substr);
2970 return -1;
2971 }
2972
2973 result = tailmatch((PyUnicodeObject *)str,
2974 (PyUnicodeObject *)substr,
2975 start, end, direction);
2976 Py_DECREF(str);
2977 Py_DECREF(substr);
2978 return result;
2979}
2980
2981static
2982const Py_UNICODE *findchar(const Py_UNICODE *s,
2983 int size,
2984 Py_UNICODE ch)
2985{
2986 /* like wcschr, but doesn't stop at NULL characters */
2987
2988 while (size-- > 0) {
2989 if (*s == ch)
2990 return s;
2991 s++;
2992 }
2993
2994 return NULL;
2995}
2996
2997/* Apply fixfct filter to the Unicode object self and return a
2998 reference to the modified object */
2999
3000static
3001PyObject *fixup(PyUnicodeObject *self,
3002 int (*fixfct)(PyUnicodeObject *s))
3003{
3004
3005 PyUnicodeObject *u;
3006
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003007 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 if (u == NULL)
3009 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003010
3011 Py_UNICODE_COPY(u->str, self->str, self->length);
3012
Tim Peters7a29bd52001-09-12 03:03:31 +00003013 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 /* fixfct should return TRUE if it modified the buffer. If
3015 FALSE, return a reference to the original buffer instead
3016 (to save space, not time) */
3017 Py_INCREF(self);
3018 Py_DECREF(u);
3019 return (PyObject*) self;
3020 }
3021 return (PyObject*) u;
3022}
3023
3024static
3025int fixupper(PyUnicodeObject *self)
3026{
3027 int len = self->length;
3028 Py_UNICODE *s = self->str;
3029 int status = 0;
3030
3031 while (len-- > 0) {
3032 register Py_UNICODE ch;
3033
3034 ch = Py_UNICODE_TOUPPER(*s);
3035 if (ch != *s) {
3036 status = 1;
3037 *s = ch;
3038 }
3039 s++;
3040 }
3041
3042 return status;
3043}
3044
3045static
3046int fixlower(PyUnicodeObject *self)
3047{
3048 int len = self->length;
3049 Py_UNICODE *s = self->str;
3050 int status = 0;
3051
3052 while (len-- > 0) {
3053 register Py_UNICODE ch;
3054
3055 ch = Py_UNICODE_TOLOWER(*s);
3056 if (ch != *s) {
3057 status = 1;
3058 *s = ch;
3059 }
3060 s++;
3061 }
3062
3063 return status;
3064}
3065
3066static
3067int fixswapcase(PyUnicodeObject *self)
3068{
3069 int len = self->length;
3070 Py_UNICODE *s = self->str;
3071 int status = 0;
3072
3073 while (len-- > 0) {
3074 if (Py_UNICODE_ISUPPER(*s)) {
3075 *s = Py_UNICODE_TOLOWER(*s);
3076 status = 1;
3077 } else if (Py_UNICODE_ISLOWER(*s)) {
3078 *s = Py_UNICODE_TOUPPER(*s);
3079 status = 1;
3080 }
3081 s++;
3082 }
3083
3084 return status;
3085}
3086
3087static
3088int fixcapitalize(PyUnicodeObject *self)
3089{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003090 int len = self->length;
3091 Py_UNICODE *s = self->str;
3092 int status = 0;
3093
3094 if (len == 0)
3095 return 0;
3096 if (Py_UNICODE_ISLOWER(*s)) {
3097 *s = Py_UNICODE_TOUPPER(*s);
3098 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003100 s++;
3101 while (--len > 0) {
3102 if (Py_UNICODE_ISUPPER(*s)) {
3103 *s = Py_UNICODE_TOLOWER(*s);
3104 status = 1;
3105 }
3106 s++;
3107 }
3108 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109}
3110
3111static
3112int fixtitle(PyUnicodeObject *self)
3113{
3114 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3115 register Py_UNICODE *e;
3116 int previous_is_cased;
3117
3118 /* Shortcut for single character strings */
3119 if (PyUnicode_GET_SIZE(self) == 1) {
3120 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3121 if (*p != ch) {
3122 *p = ch;
3123 return 1;
3124 }
3125 else
3126 return 0;
3127 }
3128
3129 e = p + PyUnicode_GET_SIZE(self);
3130 previous_is_cased = 0;
3131 for (; p < e; p++) {
3132 register const Py_UNICODE ch = *p;
3133
3134 if (previous_is_cased)
3135 *p = Py_UNICODE_TOLOWER(ch);
3136 else
3137 *p = Py_UNICODE_TOTITLE(ch);
3138
3139 if (Py_UNICODE_ISLOWER(ch) ||
3140 Py_UNICODE_ISUPPER(ch) ||
3141 Py_UNICODE_ISTITLE(ch))
3142 previous_is_cased = 1;
3143 else
3144 previous_is_cased = 0;
3145 }
3146 return 1;
3147}
3148
3149PyObject *PyUnicode_Join(PyObject *separator,
3150 PyObject *seq)
3151{
3152 Py_UNICODE *sep;
3153 int seplen;
3154 PyUnicodeObject *res = NULL;
3155 int reslen = 0;
3156 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 int sz = 100;
3158 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003159 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160
Tim Peters2cfe3682001-05-05 05:36:48 +00003161 it = PyObject_GetIter(seq);
3162 if (it == NULL)
3163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164
3165 if (separator == NULL) {
3166 Py_UNICODE blank = ' ';
3167 sep = &blank;
3168 seplen = 1;
3169 }
3170 else {
3171 separator = PyUnicode_FromObject(separator);
3172 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003173 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 sep = PyUnicode_AS_UNICODE(separator);
3175 seplen = PyUnicode_GET_SIZE(separator);
3176 }
3177
3178 res = _PyUnicode_New(sz);
3179 if (res == NULL)
3180 goto onError;
3181 p = PyUnicode_AS_UNICODE(res);
3182 reslen = 0;
3183
Tim Peters2cfe3682001-05-05 05:36:48 +00003184 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003186 PyObject *item = PyIter_Next(it);
3187 if (item == NULL) {
3188 if (PyErr_Occurred())
3189 goto onError;
3190 break;
3191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 if (!PyUnicode_Check(item)) {
3193 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003194 if (!PyString_Check(item)) {
3195 PyErr_Format(PyExc_TypeError,
3196 "sequence item %i: expected string or Unicode,"
3197 " %.80s found",
3198 i, item->ob_type->tp_name);
3199 Py_DECREF(item);
3200 goto onError;
3201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 v = PyUnicode_FromObject(item);
3203 Py_DECREF(item);
3204 item = v;
3205 if (item == NULL)
3206 goto onError;
3207 }
3208 itemlen = PyUnicode_GET_SIZE(item);
3209 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003210 if (_PyUnicode_Resize(&res, sz*2)) {
3211 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 sz *= 2;
3215 p = PyUnicode_AS_UNICODE(res) + reslen;
3216 }
3217 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003218 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 p += seplen;
3220 reslen += seplen;
3221 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003222 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 p += itemlen;
3224 reslen += itemlen;
3225 Py_DECREF(item);
3226 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003227 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 goto onError;
3229
3230 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003231 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 return (PyObject *)res;
3233
3234 onError:
3235 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003236 Py_XDECREF(res);
3237 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 return NULL;
3239}
3240
3241static
3242PyUnicodeObject *pad(PyUnicodeObject *self,
3243 int left,
3244 int right,
3245 Py_UNICODE fill)
3246{
3247 PyUnicodeObject *u;
3248
3249 if (left < 0)
3250 left = 0;
3251 if (right < 0)
3252 right = 0;
3253
Tim Peters7a29bd52001-09-12 03:03:31 +00003254 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 Py_INCREF(self);
3256 return self;
3257 }
3258
3259 u = _PyUnicode_New(left + self->length + right);
3260 if (u) {
3261 if (left)
3262 Py_UNICODE_FILL(u->str, fill, left);
3263 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3264 if (right)
3265 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3266 }
3267
3268 return u;
3269}
3270
/* Append the slice data[left:right] as a new Unicode object to list.

   Relies on the expanding function to provide a PyObject *str
   scratch variable, a list variable and an onError label, to which
   the macro jumps if creating or appending the object fails.  The
   arguments are parenthesized and the body is wrapped in
   do { } while (0) so that the macro behaves like one statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3281
3282static
3283PyObject *split_whitespace(PyUnicodeObject *self,
3284 PyObject *list,
3285 int maxcount)
3286{
3287 register int i;
3288 register int j;
3289 int len = self->length;
3290 PyObject *str;
3291
3292 for (i = j = 0; i < len; ) {
3293 /* find a token */
3294 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3295 i++;
3296 j = i;
3297 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3298 i++;
3299 if (j < i) {
3300 if (maxcount-- <= 0)
3301 break;
3302 SPLIT_APPEND(self->str, j, i);
3303 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3304 i++;
3305 j = i;
3306 }
3307 }
3308 if (j < len) {
3309 SPLIT_APPEND(self->str, j, len);
3310 }
3311 return list;
3312
3313 onError:
3314 Py_DECREF(list);
3315 return NULL;
3316}
3317
3318PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003319 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320{
3321 register int i;
3322 register int j;
3323 int len;
3324 PyObject *list;
3325 PyObject *str;
3326 Py_UNICODE *data;
3327
3328 string = PyUnicode_FromObject(string);
3329 if (string == NULL)
3330 return NULL;
3331 data = PyUnicode_AS_UNICODE(string);
3332 len = PyUnicode_GET_SIZE(string);
3333
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 list = PyList_New(0);
3335 if (!list)
3336 goto onError;
3337
3338 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003339 int eol;
3340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 /* Find a line and append it */
3342 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3343 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344
3345 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003346 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 if (i < len) {
3348 if (data[i] == '\r' && i + 1 < len &&
3349 data[i+1] == '\n')
3350 i += 2;
3351 else
3352 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003353 if (keepends)
3354 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 }
Guido van Rossum86662912000-04-11 15:38:46 +00003356 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 j = i;
3358 }
3359 if (j < len) {
3360 SPLIT_APPEND(data, j, len);
3361 }
3362
3363 Py_DECREF(string);
3364 return list;
3365
3366 onError:
3367 Py_DECREF(list);
3368 Py_DECREF(string);
3369 return NULL;
3370}
3371
3372static
3373PyObject *split_char(PyUnicodeObject *self,
3374 PyObject *list,
3375 Py_UNICODE ch,
3376 int maxcount)
3377{
3378 register int i;
3379 register int j;
3380 int len = self->length;
3381 PyObject *str;
3382
3383 for (i = j = 0; i < len; ) {
3384 if (self->str[i] == ch) {
3385 if (maxcount-- <= 0)
3386 break;
3387 SPLIT_APPEND(self->str, j, i);
3388 i = j = i + 1;
3389 } else
3390 i++;
3391 }
3392 if (j <= len) {
3393 SPLIT_APPEND(self->str, j, len);
3394 }
3395 return list;
3396
3397 onError:
3398 Py_DECREF(list);
3399 return NULL;
3400}
3401
3402static
3403PyObject *split_substring(PyUnicodeObject *self,
3404 PyObject *list,
3405 PyUnicodeObject *substring,
3406 int maxcount)
3407{
3408 register int i;
3409 register int j;
3410 int len = self->length;
3411 int sublen = substring->length;
3412 PyObject *str;
3413
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003414 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 if (Py_UNICODE_MATCH(self, i, substring)) {
3416 if (maxcount-- <= 0)
3417 break;
3418 SPLIT_APPEND(self->str, j, i);
3419 i = j = i + sublen;
3420 } else
3421 i++;
3422 }
3423 if (j <= len) {
3424 SPLIT_APPEND(self->str, j, len);
3425 }
3426 return list;
3427
3428 onError:
3429 Py_DECREF(list);
3430 return NULL;
3431}
3432
3433#undef SPLIT_APPEND
3434
3435static
3436PyObject *split(PyUnicodeObject *self,
3437 PyUnicodeObject *substring,
3438 int maxcount)
3439{
3440 PyObject *list;
3441
3442 if (maxcount < 0)
3443 maxcount = INT_MAX;
3444
3445 list = PyList_New(0);
3446 if (!list)
3447 return NULL;
3448
3449 if (substring == NULL)
3450 return split_whitespace(self,list,maxcount);
3451
3452 else if (substring->length == 1)
3453 return split_char(self,list,substring->str[0],maxcount);
3454
3455 else if (substring->length == 0) {
3456 Py_DECREF(list);
3457 PyErr_SetString(PyExc_ValueError, "empty separator");
3458 return NULL;
3459 }
3460 else
3461 return split_substring(self,list,substring,maxcount);
3462}
3463
3464static
3465PyObject *strip(PyUnicodeObject *self,
3466 int left,
3467 int right)
3468{
3469 Py_UNICODE *p = self->str;
3470 int start = 0;
3471 int end = self->length;
3472
3473 if (left)
3474 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3475 start++;
3476
3477 if (right)
3478 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3479 end--;
3480
Tim Peters7a29bd52001-09-12 03:03:31 +00003481 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 /* couldn't strip anything off, return original string */
3483 Py_INCREF(self);
3484 return (PyObject*) self;
3485 }
3486
3487 return (PyObject*) PyUnicode_FromUnicode(
3488 self->str + start,
3489 end - start
3490 );
3491}
3492
3493static
3494PyObject *replace(PyUnicodeObject *self,
3495 PyUnicodeObject *str1,
3496 PyUnicodeObject *str2,
3497 int maxcount)
3498{
3499 PyUnicodeObject *u;
3500
3501 if (maxcount < 0)
3502 maxcount = INT_MAX;
3503
3504 if (str1->length == 1 && str2->length == 1) {
3505 int i;
3506
3507 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003508 if (!findchar(self->str, self->length, str1->str[0]) &&
3509 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 /* nothing to replace, return original string */
3511 Py_INCREF(self);
3512 u = self;
3513 } else {
3514 Py_UNICODE u1 = str1->str[0];
3515 Py_UNICODE u2 = str2->str[0];
3516
3517 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003518 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 self->length
3520 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003521 if (u != NULL) {
3522 Py_UNICODE_COPY(u->str, self->str,
3523 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 for (i = 0; i < u->length; i++)
3525 if (u->str[i] == u1) {
3526 if (--maxcount < 0)
3527 break;
3528 u->str[i] = u2;
3529 }
3530 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532
3533 } else {
3534 int n, i;
3535 Py_UNICODE *p;
3536
3537 /* replace strings */
3538 n = count(self, 0, self->length, str1);
3539 if (n > maxcount)
3540 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003541 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 /* nothing to replace, return original string */
3543 Py_INCREF(self);
3544 u = self;
3545 } else {
3546 u = _PyUnicode_New(
3547 self->length + n * (str2->length - str1->length));
3548 if (u) {
3549 i = 0;
3550 p = u->str;
3551 while (i <= self->length - str1->length)
3552 if (Py_UNICODE_MATCH(self, i, str1)) {
3553 /* replace string segment */
3554 Py_UNICODE_COPY(p, str2->str, str2->length);
3555 p += str2->length;
3556 i += str1->length;
3557 if (--n <= 0) {
3558 /* copy remaining part */
3559 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3560 break;
3561 }
3562 } else
3563 *p++ = self->str[i++];
3564 }
3565 }
3566 }
3567
3568 return (PyObject *) u;
3569}
3570
3571/* --- Unicode Object Methods --------------------------------------------- */
3572
3573static char title__doc__[] =
3574"S.title() -> unicode\n\
3575\n\
3576Return a titlecased version of S, i.e. words start with title case\n\
3577characters, all remaining cased characters have lower case.";
3578
3579static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003580unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 return fixup(self, fixtitle);
3583}
3584
3585static char capitalize__doc__[] =
3586"S.capitalize() -> unicode\n\
3587\n\
3588Return a capitalized version of S, i.e. make the first character\n\
3589have upper case.";
3590
3591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003592unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 return fixup(self, fixcapitalize);
3595}
3596
#if 0
/* Disabled code: S.capwords() is not compiled in. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Split self at whitespace, capitalize every word and join the words
   again with single blanks.  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        /* PyList_SET_ITEM does not release the old item */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3634
3635static char center__doc__[] =
3636"S.center(width) -> unicode\n\
3637\n\
3638Return S centered in a Unicode string of length width. Padding is done\n\
3639using spaces.";
3640
3641static PyObject *
3642unicode_center(PyUnicodeObject *self, PyObject *args)
3643{
3644 int marg, left;
3645 int width;
3646
3647 if (!PyArg_ParseTuple(args, "i:center", &width))
3648 return NULL;
3649
Tim Peters7a29bd52001-09-12 03:03:31 +00003650 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 Py_INCREF(self);
3652 return (PyObject*) self;
3653 }
3654
3655 marg = width - self->length;
3656 left = marg / 2 + (marg & width & 1);
3657
3658 return (PyObject*) pad(self, left, marg - left, ' ');
3659}
3660
Marc-André Lemburge5034372000-08-08 08:04:29 +00003661#if 0
3662
3663/* This code should go into some future Unicode collation support
3664 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003665 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003666
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003667/* speedy UTF-16 code point order comparison */
3668/* gleaned from: */
3669/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3670
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003671static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003672{
3673 0, 0, 0, 0, 0, 0, 0, 0,
3674 0, 0, 0, 0, 0, 0, 0, 0,
3675 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003676 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003677};
3678
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679static int
3680unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3681{
3682 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003683
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 Py_UNICODE *s1 = str1->str;
3685 Py_UNICODE *s2 = str2->str;
3686
3687 len1 = str1->length;
3688 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003691 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003692
3693 c1 = *s1++;
3694 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003695
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696 if (c1 > (1<<11) * 26)
3697 c1 += utf16Fixup[c1>>11];
3698 if (c2 > (1<<11) * 26)
3699 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003700 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003701
3702 if (c1 != c2)
3703 return (c1 < c2) ? -1 : 1;
3704
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003705 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 }
3707
3708 return (len1 < len2) ? -1 : (len1 != len2);
3709}
3710
Marc-André Lemburge5034372000-08-08 08:04:29 +00003711#else
3712
3713static int
3714unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3715{
3716 register int len1, len2;
3717
3718 Py_UNICODE *s1 = str1->str;
3719 Py_UNICODE *s2 = str2->str;
3720
3721 len1 = str1->length;
3722 len2 = str2->length;
3723
3724 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003725 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003726
Fredrik Lundh45714e92001-06-26 16:39:36 +00003727 c1 = *s1++;
3728 c2 = *s2++;
3729
3730 if (c1 != c2)
3731 return (c1 < c2) ? -1 : 1;
3732
Marc-André Lemburge5034372000-08-08 08:04:29 +00003733 len1--; len2--;
3734 }
3735
3736 return (len1 < len2) ? -1 : (len1 != len2);
3737}
3738
3739#endif
3740
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741int PyUnicode_Compare(PyObject *left,
3742 PyObject *right)
3743{
3744 PyUnicodeObject *u = NULL, *v = NULL;
3745 int result;
3746
3747 /* Coerce the two arguments */
3748 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3749 if (u == NULL)
3750 goto onError;
3751 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3752 if (v == NULL)
3753 goto onError;
3754
Thomas Wouters7e474022000-07-16 12:04:32 +00003755 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 if (v == u) {
3757 Py_DECREF(u);
3758 Py_DECREF(v);
3759 return 0;
3760 }
3761
3762 result = unicode_compare(u, v);
3763
3764 Py_DECREF(u);
3765 Py_DECREF(v);
3766 return result;
3767
3768onError:
3769 Py_XDECREF(u);
3770 Py_XDECREF(v);
3771 return -1;
3772}
3773
Guido van Rossum403d68b2000-03-13 15:55:09 +00003774int PyUnicode_Contains(PyObject *container,
3775 PyObject *element)
3776{
3777 PyUnicodeObject *u = NULL, *v = NULL;
3778 int result;
3779 register const Py_UNICODE *p, *e;
3780 register Py_UNICODE ch;
3781
3782 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003783 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003784 if (v == NULL) {
3785 PyErr_SetString(PyExc_TypeError,
3786 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003787 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003788 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003789 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3790 if (u == NULL) {
3791 Py_DECREF(v);
3792 goto onError;
3793 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003794
3795 /* Check v in u */
3796 if (PyUnicode_GET_SIZE(v) != 1) {
3797 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003798 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003799 goto onError;
3800 }
3801 ch = *PyUnicode_AS_UNICODE(v);
3802 p = PyUnicode_AS_UNICODE(u);
3803 e = p + PyUnicode_GET_SIZE(u);
3804 result = 0;
3805 while (p < e) {
3806 if (*p++ == ch) {
3807 result = 1;
3808 break;
3809 }
3810 }
3811
3812 Py_DECREF(u);
3813 Py_DECREF(v);
3814 return result;
3815
3816onError:
3817 Py_XDECREF(u);
3818 Py_XDECREF(v);
3819 return -1;
3820}
3821
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822/* Concat to string or Unicode object giving a new Unicode object. */
3823
3824PyObject *PyUnicode_Concat(PyObject *left,
3825 PyObject *right)
3826{
3827 PyUnicodeObject *u = NULL, *v = NULL, *w;
3828
3829 /* Coerce the two arguments */
3830 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3831 if (u == NULL)
3832 goto onError;
3833 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3834 if (v == NULL)
3835 goto onError;
3836
3837 /* Shortcuts */
3838 if (v == unicode_empty) {
3839 Py_DECREF(v);
3840 return (PyObject *)u;
3841 }
3842 if (u == unicode_empty) {
3843 Py_DECREF(u);
3844 return (PyObject *)v;
3845 }
3846
3847 /* Concat the two Unicode strings */
3848 w = _PyUnicode_New(u->length + v->length);
3849 if (w == NULL)
3850 goto onError;
3851 Py_UNICODE_COPY(w->str, u->str, u->length);
3852 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3853
3854 Py_DECREF(u);
3855 Py_DECREF(v);
3856 return (PyObject *)w;
3857
3858onError:
3859 Py_XDECREF(u);
3860 Py_XDECREF(v);
3861 return NULL;
3862}
3863
3864static char count__doc__[] =
3865"S.count(sub[, start[, end]]) -> int\n\
3866\n\
3867Return the number of occurrences of substring sub in Unicode string\n\
3868S[start:end]. Optional arguments start and end are\n\
3869interpreted as in slice notation.";
3870
3871static PyObject *
3872unicode_count(PyUnicodeObject *self, PyObject *args)
3873{
3874 PyUnicodeObject *substring;
3875 int start = 0;
3876 int end = INT_MAX;
3877 PyObject *result;
3878
Guido van Rossumb8872e62000-05-09 14:14:27 +00003879 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3880 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 return NULL;
3882
3883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3884 (PyObject *)substring);
3885 if (substring == NULL)
3886 return NULL;
3887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 if (start < 0)
3889 start += self->length;
3890 if (start < 0)
3891 start = 0;
3892 if (end > self->length)
3893 end = self->length;
3894 if (end < 0)
3895 end += self->length;
3896 if (end < 0)
3897 end = 0;
3898
3899 result = PyInt_FromLong((long) count(self, start, end, substring));
3900
3901 Py_DECREF(substring);
3902 return result;
3903}
3904
3905static char encode__doc__[] =
3906"S.encode([encoding[,errors]]) -> string\n\
3907\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003908Return an encoded string version of S. Default encoding is the current\n\
3909default string encoding. errors may be given to set a different error\n\
3910handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3911a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912
3913static PyObject *
3914unicode_encode(PyUnicodeObject *self, PyObject *args)
3915{
3916 char *encoding = NULL;
3917 char *errors = NULL;
3918 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3919 return NULL;
3920 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3921}
3922
3923static char expandtabs__doc__[] =
3924"S.expandtabs([tabsize]) -> unicode\n\
3925\n\
3926Return a copy of S where all tab characters are expanded using spaces.\n\
3927If tabsize is not given, a tab size of 8 characters is assumed.";
3928
3929static PyObject*
3930unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3931{
3932 Py_UNICODE *e;
3933 Py_UNICODE *p;
3934 Py_UNICODE *q;
3935 int i, j;
3936 PyUnicodeObject *u;
3937 int tabsize = 8;
3938
3939 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3940 return NULL;
3941
Thomas Wouters7e474022000-07-16 12:04:32 +00003942 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 i = j = 0;
3944 e = self->str + self->length;
3945 for (p = self->str; p < e; p++)
3946 if (*p == '\t') {
3947 if (tabsize > 0)
3948 j += tabsize - (j % tabsize);
3949 }
3950 else {
3951 j++;
3952 if (*p == '\n' || *p == '\r') {
3953 i += j;
3954 j = 0;
3955 }
3956 }
3957
3958 /* Second pass: create output string and fill it */
3959 u = _PyUnicode_New(i + j);
3960 if (!u)
3961 return NULL;
3962
3963 j = 0;
3964 q = u->str;
3965
3966 for (p = self->str; p < e; p++)
3967 if (*p == '\t') {
3968 if (tabsize > 0) {
3969 i = tabsize - (j % tabsize);
3970 j += i;
3971 while (i--)
3972 *q++ = ' ';
3973 }
3974 }
3975 else {
3976 j++;
3977 *q++ = *p;
3978 if (*p == '\n' || *p == '\r')
3979 j = 0;
3980 }
3981
3982 return (PyObject*) u;
3983}
3984
3985static char find__doc__[] =
3986"S.find(sub [,start [,end]]) -> int\n\
3987\n\
3988Return the lowest index in S where substring sub is found,\n\
3989such that sub is contained within s[start,end]. Optional\n\
3990arguments start and end are interpreted as in slice notation.\n\
3991\n\
3992Return -1 on failure.";
3993
3994static PyObject *
3995unicode_find(PyUnicodeObject *self, PyObject *args)
3996{
3997 PyUnicodeObject *substring;
3998 int start = 0;
3999 int end = INT_MAX;
4000 PyObject *result;
4001
Guido van Rossumb8872e62000-05-09 14:14:27 +00004002 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4003 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return NULL;
4005 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4006 (PyObject *)substring);
4007 if (substring == NULL)
4008 return NULL;
4009
4010 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4011
4012 Py_DECREF(substring);
4013 return result;
4014}
4015
4016static PyObject *
4017unicode_getitem(PyUnicodeObject *self, int index)
4018{
4019 if (index < 0 || index >= self->length) {
4020 PyErr_SetString(PyExc_IndexError, "string index out of range");
4021 return NULL;
4022 }
4023
4024 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4025}
4026
4027static long
4028unicode_hash(PyUnicodeObject *self)
4029{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004030 /* Since Unicode objects compare equal to their ASCII string
4031 counterparts, they should use the individual character values
4032 as basis for their hash value. This is needed to assure that
4033 strings and Unicode objects behave in the same way as
4034 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
Fredrik Lundhdde61642000-07-10 18:27:47 +00004036 register int len;
4037 register Py_UNICODE *p;
4038 register long x;
4039
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 if (self->hash != -1)
4041 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004042 len = PyUnicode_GET_SIZE(self);
4043 p = PyUnicode_AS_UNICODE(self);
4044 x = *p << 7;
4045 while (--len >= 0)
4046 x = (1000003*x) ^ *p++;
4047 x ^= PyUnicode_GET_SIZE(self);
4048 if (x == -1)
4049 x = -2;
4050 self->hash = x;
4051 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052}
4053
4054static char index__doc__[] =
4055"S.index(sub [,start [,end]]) -> int\n\
4056\n\
4057Like S.find() but raise ValueError when the substring is not found.";
4058
4059static PyObject *
4060unicode_index(PyUnicodeObject *self, PyObject *args)
4061{
4062 int result;
4063 PyUnicodeObject *substring;
4064 int start = 0;
4065 int end = INT_MAX;
4066
Guido van Rossumb8872e62000-05-09 14:14:27 +00004067 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4068 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 return NULL;
4070
4071 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4072 (PyObject *)substring);
4073 if (substring == NULL)
4074 return NULL;
4075
4076 result = findstring(self, substring, start, end, 1);
4077
4078 Py_DECREF(substring);
4079 if (result < 0) {
4080 PyErr_SetString(PyExc_ValueError, "substring not found");
4081 return NULL;
4082 }
4083 return PyInt_FromLong(result);
4084}
4085
4086static char islower__doc__[] =
4087"S.islower() -> int\n\
4088\n\
4089Return 1 if all cased characters in S are lowercase and there is\n\
4090at least one cased character in S, 0 otherwise.";
4091
4092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004093unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094{
4095 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4096 register const Py_UNICODE *e;
4097 int cased;
4098
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 /* Shortcut for single character strings */
4100 if (PyUnicode_GET_SIZE(self) == 1)
4101 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4102
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004103 /* Special case for empty strings */
4104 if (PyString_GET_SIZE(self) == 0)
4105 return PyInt_FromLong(0);
4106
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 e = p + PyUnicode_GET_SIZE(self);
4108 cased = 0;
4109 for (; p < e; p++) {
4110 register const Py_UNICODE ch = *p;
4111
4112 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4113 return PyInt_FromLong(0);
4114 else if (!cased && Py_UNICODE_ISLOWER(ch))
4115 cased = 1;
4116 }
4117 return PyInt_FromLong(cased);
4118}
4119
4120static char isupper__doc__[] =
4121"S.isupper() -> int\n\
4122\n\
4123Return 1 if all cased characters in S are uppercase and there is\n\
4124at least one cased character in S, 0 otherwise.";
4125
4126static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004127unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128{
4129 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4130 register const Py_UNICODE *e;
4131 int cased;
4132
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 /* Shortcut for single character strings */
4134 if (PyUnicode_GET_SIZE(self) == 1)
4135 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004137 /* Special case for empty strings */
4138 if (PyString_GET_SIZE(self) == 0)
4139 return PyInt_FromLong(0);
4140
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 e = p + PyUnicode_GET_SIZE(self);
4142 cased = 0;
4143 for (; p < e; p++) {
4144 register const Py_UNICODE ch = *p;
4145
4146 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4147 return PyInt_FromLong(0);
4148 else if (!cased && Py_UNICODE_ISUPPER(ch))
4149 cased = 1;
4150 }
4151 return PyInt_FromLong(cased);
4152}
4153
4154static char istitle__doc__[] =
4155"S.istitle() -> int\n\
4156\n\
4157Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4158may only follow uncased characters and lowercase characters only cased\n\
4159ones. Return 0 otherwise.";
4160
4161static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004162unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163{
4164 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4165 register const Py_UNICODE *e;
4166 int cased, previous_is_cased;
4167
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 /* Shortcut for single character strings */
4169 if (PyUnicode_GET_SIZE(self) == 1)
4170 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4171 (Py_UNICODE_ISUPPER(*p) != 0));
4172
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004173 /* Special case for empty strings */
4174 if (PyString_GET_SIZE(self) == 0)
4175 return PyInt_FromLong(0);
4176
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 e = p + PyUnicode_GET_SIZE(self);
4178 cased = 0;
4179 previous_is_cased = 0;
4180 for (; p < e; p++) {
4181 register const Py_UNICODE ch = *p;
4182
4183 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4184 if (previous_is_cased)
4185 return PyInt_FromLong(0);
4186 previous_is_cased = 1;
4187 cased = 1;
4188 }
4189 else if (Py_UNICODE_ISLOWER(ch)) {
4190 if (!previous_is_cased)
4191 return PyInt_FromLong(0);
4192 previous_is_cased = 1;
4193 cased = 1;
4194 }
4195 else
4196 previous_is_cased = 0;
4197 }
4198 return PyInt_FromLong(cased);
4199}
4200
4201static char isspace__doc__[] =
4202"S.isspace() -> int\n\
4203\n\
4204Return 1 if there are only whitespace characters in S,\n\
42050 otherwise.";
4206
4207static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004208unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209{
4210 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4211 register const Py_UNICODE *e;
4212
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 /* Shortcut for single character strings */
4214 if (PyUnicode_GET_SIZE(self) == 1 &&
4215 Py_UNICODE_ISSPACE(*p))
4216 return PyInt_FromLong(1);
4217
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004218 /* Special case for empty strings */
4219 if (PyString_GET_SIZE(self) == 0)
4220 return PyInt_FromLong(0);
4221
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 e = p + PyUnicode_GET_SIZE(self);
4223 for (; p < e; p++) {
4224 if (!Py_UNICODE_ISSPACE(*p))
4225 return PyInt_FromLong(0);
4226 }
4227 return PyInt_FromLong(1);
4228}
4229
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004230static char isalpha__doc__[] =
4231"S.isalpha() -> int\n\
4232\n\
4233Return 1 if all characters in S are alphabetic\n\
4234and there is at least one character in S, 0 otherwise.";
4235
4236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004237unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004238{
4239 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4240 register const Py_UNICODE *e;
4241
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004242 /* Shortcut for single character strings */
4243 if (PyUnicode_GET_SIZE(self) == 1 &&
4244 Py_UNICODE_ISALPHA(*p))
4245 return PyInt_FromLong(1);
4246
4247 /* Special case for empty strings */
4248 if (PyString_GET_SIZE(self) == 0)
4249 return PyInt_FromLong(0);
4250
4251 e = p + PyUnicode_GET_SIZE(self);
4252 for (; p < e; p++) {
4253 if (!Py_UNICODE_ISALPHA(*p))
4254 return PyInt_FromLong(0);
4255 }
4256 return PyInt_FromLong(1);
4257}
4258
4259static char isalnum__doc__[] =
4260"S.isalnum() -> int\n\
4261\n\
4262Return 1 if all characters in S are alphanumeric\n\
4263and there is at least one character in S, 0 otherwise.";
4264
4265static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004266unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004267{
4268 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4269 register const Py_UNICODE *e;
4270
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004271 /* Shortcut for single character strings */
4272 if (PyUnicode_GET_SIZE(self) == 1 &&
4273 Py_UNICODE_ISALNUM(*p))
4274 return PyInt_FromLong(1);
4275
4276 /* Special case for empty strings */
4277 if (PyString_GET_SIZE(self) == 0)
4278 return PyInt_FromLong(0);
4279
4280 e = p + PyUnicode_GET_SIZE(self);
4281 for (; p < e; p++) {
4282 if (!Py_UNICODE_ISALNUM(*p))
4283 return PyInt_FromLong(0);
4284 }
4285 return PyInt_FromLong(1);
4286}
4287
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288static char isdecimal__doc__[] =
4289"S.isdecimal() -> int\n\
4290\n\
4291Return 1 if there are only decimal characters in S,\n\
42920 otherwise.";
4293
4294static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004295unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296{
4297 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4298 register const Py_UNICODE *e;
4299
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 /* Shortcut for single character strings */
4301 if (PyUnicode_GET_SIZE(self) == 1 &&
4302 Py_UNICODE_ISDECIMAL(*p))
4303 return PyInt_FromLong(1);
4304
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004305 /* Special case for empty strings */
4306 if (PyString_GET_SIZE(self) == 0)
4307 return PyInt_FromLong(0);
4308
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 e = p + PyUnicode_GET_SIZE(self);
4310 for (; p < e; p++) {
4311 if (!Py_UNICODE_ISDECIMAL(*p))
4312 return PyInt_FromLong(0);
4313 }
4314 return PyInt_FromLong(1);
4315}
4316
4317static char isdigit__doc__[] =
4318"S.isdigit() -> int\n\
4319\n\
4320Return 1 if there are only digit characters in S,\n\
43210 otherwise.";
4322
4323static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004324unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325{
4326 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4327 register const Py_UNICODE *e;
4328
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 /* Shortcut for single character strings */
4330 if (PyUnicode_GET_SIZE(self) == 1 &&
4331 Py_UNICODE_ISDIGIT(*p))
4332 return PyInt_FromLong(1);
4333
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004334 /* Special case for empty strings */
4335 if (PyString_GET_SIZE(self) == 0)
4336 return PyInt_FromLong(0);
4337
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 e = p + PyUnicode_GET_SIZE(self);
4339 for (; p < e; p++) {
4340 if (!Py_UNICODE_ISDIGIT(*p))
4341 return PyInt_FromLong(0);
4342 }
4343 return PyInt_FromLong(1);
4344}
4345
4346static char isnumeric__doc__[] =
4347"S.isnumeric() -> int\n\
4348\n\
4349Return 1 if there are only numeric characters in S,\n\
43500 otherwise.";
4351
4352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004353unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354{
4355 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4356 register const Py_UNICODE *e;
4357
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 /* Shortcut for single character strings */
4359 if (PyUnicode_GET_SIZE(self) == 1 &&
4360 Py_UNICODE_ISNUMERIC(*p))
4361 return PyInt_FromLong(1);
4362
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004363 /* Special case for empty strings */
4364 if (PyString_GET_SIZE(self) == 0)
4365 return PyInt_FromLong(0);
4366
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367 e = p + PyUnicode_GET_SIZE(self);
4368 for (; p < e; p++) {
4369 if (!Py_UNICODE_ISNUMERIC(*p))
4370 return PyInt_FromLong(0);
4371 }
4372 return PyInt_FromLong(1);
4373}
4374
4375static char join__doc__[] =
4376"S.join(sequence) -> unicode\n\
4377\n\
4378Return a string which is the concatenation of the strings in the\n\
4379sequence. The separator between elements is S.";
4380
4381static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004382unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004384 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385}
4386
4387static int
4388unicode_length(PyUnicodeObject *self)
4389{
4390 return self->length;
4391}
4392
4393static char ljust__doc__[] =
4394"S.ljust(width) -> unicode\n\
4395\n\
4396Return S left justified in a Unicode string of length width. Padding is\n\
4397done using spaces.";
4398
4399static PyObject *
4400unicode_ljust(PyUnicodeObject *self, PyObject *args)
4401{
4402 int width;
4403 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4404 return NULL;
4405
Tim Peters7a29bd52001-09-12 03:03:31 +00004406 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407 Py_INCREF(self);
4408 return (PyObject*) self;
4409 }
4410
4411 return (PyObject*) pad(self, 0, width - self->length, ' ');
4412}
4413
4414static char lower__doc__[] =
4415"S.lower() -> unicode\n\
4416\n\
4417Return a copy of the string S converted to lowercase.";
4418
4419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004420unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 return fixup(self, fixlower);
4423}
4424
4425static char lstrip__doc__[] =
4426"S.lstrip() -> unicode\n\
4427\n\
4428Return a copy of the string S with leading whitespace removed.";
4429
4430static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004431unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 return strip(self, 1, 0);
4434}
4435
4436static PyObject*
4437unicode_repeat(PyUnicodeObject *str, int len)
4438{
4439 PyUnicodeObject *u;
4440 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004441 int nchars;
4442 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443
4444 if (len < 0)
4445 len = 0;
4446
Tim Peters7a29bd52001-09-12 03:03:31 +00004447 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 /* no repeat, return original string */
4449 Py_INCREF(str);
4450 return (PyObject*) str;
4451 }
Tim Peters8f422462000-09-09 06:13:41 +00004452
4453 /* ensure # of chars needed doesn't overflow int and # of bytes
4454 * needed doesn't overflow size_t
4455 */
4456 nchars = len * str->length;
4457 if (len && nchars / len != str->length) {
4458 PyErr_SetString(PyExc_OverflowError,
4459 "repeated string is too long");
4460 return NULL;
4461 }
4462 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4463 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4464 PyErr_SetString(PyExc_OverflowError,
4465 "repeated string is too long");
4466 return NULL;
4467 }
4468 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 if (!u)
4470 return NULL;
4471
4472 p = u->str;
4473
4474 while (len-- > 0) {
4475 Py_UNICODE_COPY(p, str->str, str->length);
4476 p += str->length;
4477 }
4478
4479 return (PyObject*) u;
4480}
4481
4482PyObject *PyUnicode_Replace(PyObject *obj,
4483 PyObject *subobj,
4484 PyObject *replobj,
4485 int maxcount)
4486{
4487 PyObject *self;
4488 PyObject *str1;
4489 PyObject *str2;
4490 PyObject *result;
4491
4492 self = PyUnicode_FromObject(obj);
4493 if (self == NULL)
4494 return NULL;
4495 str1 = PyUnicode_FromObject(subobj);
4496 if (str1 == NULL) {
4497 Py_DECREF(self);
4498 return NULL;
4499 }
4500 str2 = PyUnicode_FromObject(replobj);
4501 if (str2 == NULL) {
4502 Py_DECREF(self);
4503 Py_DECREF(str1);
4504 return NULL;
4505 }
4506 result = replace((PyUnicodeObject *)self,
4507 (PyUnicodeObject *)str1,
4508 (PyUnicodeObject *)str2,
4509 maxcount);
4510 Py_DECREF(self);
4511 Py_DECREF(str1);
4512 Py_DECREF(str2);
4513 return result;
4514}
4515
4516static char replace__doc__[] =
4517"S.replace (old, new[, maxsplit]) -> unicode\n\
4518\n\
4519Return a copy of S with all occurrences of substring\n\
4520old replaced by new. If the optional argument maxsplit is\n\
4521given, only the first maxsplit occurrences are replaced.";
4522
4523static PyObject*
4524unicode_replace(PyUnicodeObject *self, PyObject *args)
4525{
4526 PyUnicodeObject *str1;
4527 PyUnicodeObject *str2;
4528 int maxcount = -1;
4529 PyObject *result;
4530
4531 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4532 return NULL;
4533 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4534 if (str1 == NULL)
4535 return NULL;
4536 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4537 if (str2 == NULL)
4538 return NULL;
4539
4540 result = replace(self, str1, str2, maxcount);
4541
4542 Py_DECREF(str1);
4543 Py_DECREF(str2);
4544 return result;
4545}
4546
4547static
4548PyObject *unicode_repr(PyObject *unicode)
4549{
4550 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4551 PyUnicode_GET_SIZE(unicode),
4552 1);
4553}
4554
4555static char rfind__doc__[] =
4556"S.rfind(sub [,start [,end]]) -> int\n\
4557\n\
4558Return the highest index in S where substring sub is found,\n\
4559such that sub is contained within s[start,end]. Optional\n\
4560arguments start and end are interpreted as in slice notation.\n\
4561\n\
4562Return -1 on failure.";
4563
4564static PyObject *
4565unicode_rfind(PyUnicodeObject *self, PyObject *args)
4566{
4567 PyUnicodeObject *substring;
4568 int start = 0;
4569 int end = INT_MAX;
4570 PyObject *result;
4571
Guido van Rossumb8872e62000-05-09 14:14:27 +00004572 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 return NULL;
4575 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4576 (PyObject *)substring);
4577 if (substring == NULL)
4578 return NULL;
4579
4580 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4581
4582 Py_DECREF(substring);
4583 return result;
4584}
4585
4586static char rindex__doc__[] =
4587"S.rindex(sub [,start [,end]]) -> int\n\
4588\n\
4589Like S.rfind() but raise ValueError when the substring is not found.";
4590
4591static PyObject *
4592unicode_rindex(PyUnicodeObject *self, PyObject *args)
4593{
4594 int result;
4595 PyUnicodeObject *substring;
4596 int start = 0;
4597 int end = INT_MAX;
4598
Guido van Rossumb8872e62000-05-09 14:14:27 +00004599 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4600 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 return NULL;
4602 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4603 (PyObject *)substring);
4604 if (substring == NULL)
4605 return NULL;
4606
4607 result = findstring(self, substring, start, end, -1);
4608
4609 Py_DECREF(substring);
4610 if (result < 0) {
4611 PyErr_SetString(PyExc_ValueError, "substring not found");
4612 return NULL;
4613 }
4614 return PyInt_FromLong(result);
4615}
4616
4617static char rjust__doc__[] =
4618"S.rjust(width) -> unicode\n\
4619\n\
4620Return S right justified in a Unicode string of length width. Padding is\n\
4621done using spaces.";
4622
4623static PyObject *
4624unicode_rjust(PyUnicodeObject *self, PyObject *args)
4625{
4626 int width;
4627 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4628 return NULL;
4629
Tim Peters7a29bd52001-09-12 03:03:31 +00004630 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631 Py_INCREF(self);
4632 return (PyObject*) self;
4633 }
4634
4635 return (PyObject*) pad(self, width - self->length, 0, ' ');
4636}
4637
4638static char rstrip__doc__[] =
4639"S.rstrip() -> unicode\n\
4640\n\
4641Return a copy of the string S with trailing whitespace removed.";
4642
4643static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004644unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 return strip(self, 0, 1);
4647}
4648
4649static PyObject*
4650unicode_slice(PyUnicodeObject *self, int start, int end)
4651{
4652 /* standard clamping */
4653 if (start < 0)
4654 start = 0;
4655 if (end < 0)
4656 end = 0;
4657 if (end > self->length)
4658 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004659 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660 /* full slice, return original string */
4661 Py_INCREF(self);
4662 return (PyObject*) self;
4663 }
4664 if (start > end)
4665 start = end;
4666 /* copy slice */
4667 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4668 end - start);
4669}
4670
4671PyObject *PyUnicode_Split(PyObject *s,
4672 PyObject *sep,
4673 int maxsplit)
4674{
4675 PyObject *result;
4676
4677 s = PyUnicode_FromObject(s);
4678 if (s == NULL)
4679 return NULL;
4680 if (sep != NULL) {
4681 sep = PyUnicode_FromObject(sep);
4682 if (sep == NULL) {
4683 Py_DECREF(s);
4684 return NULL;
4685 }
4686 }
4687
4688 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4689
4690 Py_DECREF(s);
4691 Py_XDECREF(sep);
4692 return result;
4693}
4694
4695static char split__doc__[] =
4696"S.split([sep [,maxsplit]]) -> list of strings\n\
4697\n\
4698Return a list of the words in S, using sep as the\n\
4699delimiter string. If maxsplit is given, at most maxsplit\n\
4700splits are done. If sep is not specified, any whitespace string\n\
4701is a separator.";
4702
4703static PyObject*
4704unicode_split(PyUnicodeObject *self, PyObject *args)
4705{
4706 PyObject *substring = Py_None;
4707 int maxcount = -1;
4708
4709 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4710 return NULL;
4711
4712 if (substring == Py_None)
4713 return split(self, NULL, maxcount);
4714 else if (PyUnicode_Check(substring))
4715 return split(self, (PyUnicodeObject *)substring, maxcount);
4716 else
4717 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4718}
4719
4720static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004721"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722\n\
4723Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004724Line breaks are not included in the resulting list unless keepends\n\
4725is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
4727static PyObject*
4728unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4729{
Guido van Rossum86662912000-04-11 15:38:46 +00004730 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731
Guido van Rossum86662912000-04-11 15:38:46 +00004732 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 return NULL;
4734
Guido van Rossum86662912000-04-11 15:38:46 +00004735 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736}
4737
4738static
4739PyObject *unicode_str(PyUnicodeObject *self)
4740{
Fred Drakee4315f52000-05-09 19:53:39 +00004741 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742}
4743
4744static char strip__doc__[] =
4745"S.strip() -> unicode\n\
4746\n\
4747Return a copy of S with leading and trailing whitespace removed.";
4748
4749static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004750unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 return strip(self, 1, 1);
4753}
4754
4755static char swapcase__doc__[] =
4756"S.swapcase() -> unicode\n\
4757\n\
4758Return a copy of S with uppercase characters converted to lowercase\n\
4759and vice versa.";
4760
4761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004762unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 return fixup(self, fixswapcase);
4765}
4766
4767static char translate__doc__[] =
4768"S.translate(table) -> unicode\n\
4769\n\
4770Return a copy of the string S, where all characters have been mapped\n\
4771through the given translation table, which must be a mapping of\n\
4772Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4773are left untouched. Characters mapped to None are deleted.";
4774
4775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004776unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 return PyUnicode_TranslateCharmap(self->str,
4779 self->length,
4780 table,
4781 "ignore");
4782}
4783
4784static char upper__doc__[] =
4785"S.upper() -> unicode\n\
4786\n\
4787Return a copy of S converted to uppercase.";
4788
4789static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004790unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 return fixup(self, fixupper);
4793}
4794
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) -- currently compiled out.

   If S is already at least width long it is returned unchanged with a
   new reference.  Otherwise pad() is asked for width - length '0'
   characters on the left; if the original string began with '+' or '-',
   that sign is moved to the front of the padded result. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* Do not touch the buffer if pad() could not produce a result.
       NOTE(review): assumes pad() signals failure by returning NULL --
       confirm against its definition. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4830
#if 0
/* Debugging helper (compiled out): return the current value of
   unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long entries = unicode_freelist_size;

    return PyInt_FromLong(entries);
}
#endif
4838
4839static char startswith__doc__[] =
4840"S.startswith(prefix[, start[, end]]) -> int\n\
4841\n\
4842Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4843optional start, test S beginning at that position. With optional end, stop\n\
4844comparing S at that position.";
4845
4846static PyObject *
4847unicode_startswith(PyUnicodeObject *self,
4848 PyObject *args)
4849{
4850 PyUnicodeObject *substring;
4851 int start = 0;
4852 int end = INT_MAX;
4853 PyObject *result;
4854
Guido van Rossumb8872e62000-05-09 14:14:27 +00004855 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4856 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 return NULL;
4858 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4859 (PyObject *)substring);
4860 if (substring == NULL)
4861 return NULL;
4862
4863 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4864
4865 Py_DECREF(substring);
4866 return result;
4867}
4868
4869
4870static char endswith__doc__[] =
4871"S.endswith(suffix[, start[, end]]) -> int\n\
4872\n\
4873Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4874optional start, test S beginning at that position. With optional end, stop\n\
4875comparing S at that position.";
4876
4877static PyObject *
4878unicode_endswith(PyUnicodeObject *self,
4879 PyObject *args)
4880{
4881 PyUnicodeObject *substring;
4882 int start = 0;
4883 int end = INT_MAX;
4884 PyObject *result;
4885
Guido van Rossumb8872e62000-05-09 14:14:27 +00004886 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4887 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 return NULL;
4889 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4890 (PyObject *)substring);
4891 if (substring == NULL)
4892 return NULL;
4893
4894 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4895
4896 Py_DECREF(substring);
4897 return result;
4898}
4899
4900
4901static PyMethodDef unicode_methods[] = {
4902
4903 /* Order is according to common usage: often used methods should
4904 appear first, since lookup is done sequentially. */
4905
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004906 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4907 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4908 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4909 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4910 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4911 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4912 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4913 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4914 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4915 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4916 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4917 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4918 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4919 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4920/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4921 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4922 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4923 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4924 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4925 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4926 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4927 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4928 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4929 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4930 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4931 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4932 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4933 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4934 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4935 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4936 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4937 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4938 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4939 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4940 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004942 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4943 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944#endif
4945
4946#if 0
4947 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004948 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949#endif
4950
4951 {NULL, NULL}
4952};
4953
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954static PySequenceMethods unicode_as_sequence = {
4955 (inquiry) unicode_length, /* sq_length */
4956 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4957 (intargfunc) unicode_repeat, /* sq_repeat */
4958 (intargfunc) unicode_getitem, /* sq_item */
4959 (intintargfunc) unicode_slice, /* sq_slice */
4960 0, /* sq_ass_item */
4961 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004962 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963};
4964
4965static int
4966unicode_buffer_getreadbuf(PyUnicodeObject *self,
4967 int index,
4968 const void **ptr)
4969{
4970 if (index != 0) {
4971 PyErr_SetString(PyExc_SystemError,
4972 "accessing non-existent unicode segment");
4973 return -1;
4974 }
4975 *ptr = (void *) self->str;
4976 return PyUnicode_GET_DATA_SIZE(self);
4977}
4978
4979static int
4980unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4981 const void **ptr)
4982{
4983 PyErr_SetString(PyExc_TypeError,
4984 "cannot use unicode as modifyable buffer");
4985 return -1;
4986}
4987
4988static int
4989unicode_buffer_getsegcount(PyUnicodeObject *self,
4990 int *lenp)
4991{
4992 if (lenp)
4993 *lenp = PyUnicode_GET_DATA_SIZE(self);
4994 return 1;
4995}
4996
4997static int
4998unicode_buffer_getcharbuf(PyUnicodeObject *self,
4999 int index,
5000 const void **ptr)
5001{
5002 PyObject *str;
5003
5004 if (index != 0) {
5005 PyErr_SetString(PyExc_SystemError,
5006 "accessing non-existent unicode segment");
5007 return -1;
5008 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005009 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 if (str == NULL)
5011 return -1;
5012 *ptr = (void *) PyString_AS_STRING(str);
5013 return PyString_GET_SIZE(str);
5014}
5015
5016/* Helpers for PyUnicode_Format() */
5017
5018static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005019getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020{
5021 int argidx = *p_argidx;
5022 if (argidx < arglen) {
5023 (*p_argidx)++;
5024 if (arglen < 0)
5025 return args;
5026 else
5027 return PyTuple_GetItem(args, argidx);
5028 }
5029 PyErr_SetString(PyExc_TypeError,
5030 "not enough arguments for format string");
5031 return NULL;
5032}
5033
/* Flag bits collected from a format specifier (one bit per flag
   character, OR-ed together into `flags`). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5039
5040static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042{
5043 register int i;
5044 int len;
5045 va_list va;
5046 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048
5049 /* First, format the string as char array, then expand to Py_UNICODE
5050 array. */
5051 charbuffer = (char *)buffer;
5052 len = vsprintf(charbuffer, format, va);
5053 for (i = len - 1; i >= 0; i--)
5054 buffer[i] = (Py_UNICODE) charbuffer[i];
5055
5056 va_end(va);
5057 return len;
5058}
5059
5060static int
5061formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005062 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 int flags,
5064 int prec,
5065 int type,
5066 PyObject *v)
5067{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005068 /* fmt = '%#.' + `prec` + `type`
5069 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 char fmt[20];
5071 double x;
5072
5073 x = PyFloat_AsDouble(v);
5074 if (x == -1.0 && PyErr_Occurred())
5075 return -1;
5076 if (prec < 0)
5077 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5079 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005080 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5081 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005082 /* worst case length calc to ensure no buffer overrun:
5083 fmt = %#.<prec>g
5084 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5085 for any double rep.)
5086 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5087 If prec=0 the effective precision is 1 (the leading digit is
5088 always given), therefore increase by one to 10+prec. */
5089 if (buflen <= (size_t)10 + (size_t)prec) {
5090 PyErr_SetString(PyExc_OverflowError,
5091 "formatted float is too long (precision too long?)");
5092 return -1;
5093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 return usprintf(buf, fmt, x);
5095}
5096
Tim Peters38fd5b62000-09-21 05:43:11 +00005097static PyObject*
5098formatlong(PyObject *val, int flags, int prec, int type)
5099{
5100 char *buf;
5101 int i, len;
5102 PyObject *str; /* temporary string object. */
5103 PyUnicodeObject *result;
5104
5105 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5106 if (!str)
5107 return NULL;
5108 result = _PyUnicode_New(len);
5109 for (i = 0; i < len; i++)
5110 result->str[i] = buf[i];
5111 result->str[len] = 0;
5112 Py_DECREF(str);
5113 return (PyObject*)result;
5114}
5115
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116static int
5117formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005118 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 int flags,
5120 int prec,
5121 int type,
5122 PyObject *v)
5123{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005124 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005125 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5126 + 1 + 1 = 24*/
5127 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005129 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
5131 x = PyInt_AsLong(v);
5132 if (x == -1 && PyErr_Occurred())
5133 return -1;
5134 if (prec < 0)
5135 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005136 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5137 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5138 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5139 PyErr_SetString(PyExc_OverflowError,
5140 "formatted integer is too long (precision too long?)");
5141 return -1;
5142 }
Tim Petersfff53252001-04-12 18:38:48 +00005143 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5144 * but we want it (for consistency with other %#x conversions, and
5145 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005146 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5147 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5148 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005149 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005150 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5151 /* Only way to know what the platform does is to try it. */
Barry Warsawe5c492d2001-11-28 21:00:41 +00005152 PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005153 if (fmt[1] != (char)type) {
5154 /* Supply our own leading 0x/0X -- needed under std C */
5155 use_native_c_format = 0;
Barry Warsawe5c492d2001-11-28 21:00:41 +00005156 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005157 }
5158 }
5159 if (use_native_c_format)
Barry Warsawe5c492d2001-11-28 21:00:41 +00005160 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5161 (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 return usprintf(buf, fmt, x);
5163}
5164
5165static int
5166formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005167 size_t buflen,
5168 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005170 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005171 if (PyUnicode_Check(v)) {
5172 if (PyUnicode_GET_SIZE(v) != 1)
5173 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005177 else if (PyString_Check(v)) {
5178 if (PyString_GET_SIZE(v) != 1)
5179 goto onError;
5180 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182
5183 else {
5184 /* Integer input truncated to a character */
5185 long x;
5186 x = PyInt_AsLong(v);
5187 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 buf[0] = (char) x;
5190 }
5191 buf[1] = '\0';
5192 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005193
5194 onError:
5195 PyErr_SetString(PyExc_TypeError,
5196 "%c requires int or char");
5197 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198}
5199
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer in which
   floats, ints and chars are formatted.  XXX The value is a magic
   number.  Every formatting routine checks bounds so that the buffer
   cannot overflow; allocating a buffer of the right size for each
   conversion might be a better solution, but the fixed buffer is
   sufficient for now.
*/
#define FORMATBUFLEN ((size_t)120)
5209
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210PyObject *PyUnicode_Format(PyObject *format,
5211 PyObject *args)
5212{
5213 Py_UNICODE *fmt, *res;
5214 int fmtcnt, rescnt, reslen, arglen, argidx;
5215 int args_owned = 0;
5216 PyUnicodeObject *result = NULL;
5217 PyObject *dict = NULL;
5218 PyObject *uformat;
5219
5220 if (format == NULL || args == NULL) {
5221 PyErr_BadInternalCall();
5222 return NULL;
5223 }
5224 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005225 if (uformat == NULL)
5226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 fmt = PyUnicode_AS_UNICODE(uformat);
5228 fmtcnt = PyUnicode_GET_SIZE(uformat);
5229
5230 reslen = rescnt = fmtcnt + 100;
5231 result = _PyUnicode_New(reslen);
5232 if (result == NULL)
5233 goto onError;
5234 res = PyUnicode_AS_UNICODE(result);
5235
5236 if (PyTuple_Check(args)) {
5237 arglen = PyTuple_Size(args);
5238 argidx = 0;
5239 }
5240 else {
5241 arglen = -1;
5242 argidx = -2;
5243 }
5244 if (args->ob_type->tp_as_mapping)
5245 dict = args;
5246
5247 while (--fmtcnt >= 0) {
5248 if (*fmt != '%') {
5249 if (--rescnt < 0) {
5250 rescnt = fmtcnt + 100;
5251 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005252 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 return NULL;
5254 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5255 --rescnt;
5256 }
5257 *res++ = *fmt++;
5258 }
5259 else {
5260 /* Got a format specifier */
5261 int flags = 0;
5262 int width = -1;
5263 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 Py_UNICODE c = '\0';
5265 Py_UNICODE fill;
5266 PyObject *v = NULL;
5267 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005268 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 Py_UNICODE sign;
5270 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005271 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273 fmt++;
5274 if (*fmt == '(') {
5275 Py_UNICODE *keystart;
5276 int keylen;
5277 PyObject *key;
5278 int pcount = 1;
5279
5280 if (dict == NULL) {
5281 PyErr_SetString(PyExc_TypeError,
5282 "format requires a mapping");
5283 goto onError;
5284 }
5285 ++fmt;
5286 --fmtcnt;
5287 keystart = fmt;
5288 /* Skip over balanced parentheses */
5289 while (pcount > 0 && --fmtcnt >= 0) {
5290 if (*fmt == ')')
5291 --pcount;
5292 else if (*fmt == '(')
5293 ++pcount;
5294 fmt++;
5295 }
5296 keylen = fmt - keystart - 1;
5297 if (fmtcnt < 0 || pcount > 0) {
5298 PyErr_SetString(PyExc_ValueError,
5299 "incomplete format key");
5300 goto onError;
5301 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005302#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005303 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 then looked up since Python uses strings to hold
5305 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005306 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 key = PyUnicode_EncodeUTF8(keystart,
5308 keylen,
5309 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005310#else
5311 key = PyUnicode_FromUnicode(keystart, keylen);
5312#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 if (key == NULL)
5314 goto onError;
5315 if (args_owned) {
5316 Py_DECREF(args);
5317 args_owned = 0;
5318 }
5319 args = PyObject_GetItem(dict, key);
5320 Py_DECREF(key);
5321 if (args == NULL) {
5322 goto onError;
5323 }
5324 args_owned = 1;
5325 arglen = -1;
5326 argidx = -2;
5327 }
5328 while (--fmtcnt >= 0) {
5329 switch (c = *fmt++) {
5330 case '-': flags |= F_LJUST; continue;
5331 case '+': flags |= F_SIGN; continue;
5332 case ' ': flags |= F_BLANK; continue;
5333 case '#': flags |= F_ALT; continue;
5334 case '0': flags |= F_ZERO; continue;
5335 }
5336 break;
5337 }
5338 if (c == '*') {
5339 v = getnextarg(args, arglen, &argidx);
5340 if (v == NULL)
5341 goto onError;
5342 if (!PyInt_Check(v)) {
5343 PyErr_SetString(PyExc_TypeError,
5344 "* wants int");
5345 goto onError;
5346 }
5347 width = PyInt_AsLong(v);
5348 if (width < 0) {
5349 flags |= F_LJUST;
5350 width = -width;
5351 }
5352 if (--fmtcnt >= 0)
5353 c = *fmt++;
5354 }
5355 else if (c >= '0' && c <= '9') {
5356 width = c - '0';
5357 while (--fmtcnt >= 0) {
5358 c = *fmt++;
5359 if (c < '0' || c > '9')
5360 break;
5361 if ((width*10) / 10 != width) {
5362 PyErr_SetString(PyExc_ValueError,
5363 "width too big");
5364 goto onError;
5365 }
5366 width = width*10 + (c - '0');
5367 }
5368 }
5369 if (c == '.') {
5370 prec = 0;
5371 if (--fmtcnt >= 0)
5372 c = *fmt++;
5373 if (c == '*') {
5374 v = getnextarg(args, arglen, &argidx);
5375 if (v == NULL)
5376 goto onError;
5377 if (!PyInt_Check(v)) {
5378 PyErr_SetString(PyExc_TypeError,
5379 "* wants int");
5380 goto onError;
5381 }
5382 prec = PyInt_AsLong(v);
5383 if (prec < 0)
5384 prec = 0;
5385 if (--fmtcnt >= 0)
5386 c = *fmt++;
5387 }
5388 else if (c >= '0' && c <= '9') {
5389 prec = c - '0';
5390 while (--fmtcnt >= 0) {
5391 c = Py_CHARMASK(*fmt++);
5392 if (c < '0' || c > '9')
5393 break;
5394 if ((prec*10) / 10 != prec) {
5395 PyErr_SetString(PyExc_ValueError,
5396 "prec too big");
5397 goto onError;
5398 }
5399 prec = prec*10 + (c - '0');
5400 }
5401 }
5402 } /* prec */
5403 if (fmtcnt >= 0) {
5404 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 if (--fmtcnt >= 0)
5406 c = *fmt++;
5407 }
5408 }
5409 if (fmtcnt < 0) {
5410 PyErr_SetString(PyExc_ValueError,
5411 "incomplete format");
5412 goto onError;
5413 }
5414 if (c != '%') {
5415 v = getnextarg(args, arglen, &argidx);
5416 if (v == NULL)
5417 goto onError;
5418 }
5419 sign = 0;
5420 fill = ' ';
5421 switch (c) {
5422
5423 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005424 pbuf = formatbuf;
5425 /* presume that buffer length is at least 1 */
5426 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 len = 1;
5428 break;
5429
5430 case 's':
5431 case 'r':
5432 if (PyUnicode_Check(v) && c == 's') {
5433 temp = v;
5434 Py_INCREF(temp);
5435 }
5436 else {
5437 PyObject *unicode;
5438 if (c == 's')
5439 temp = PyObject_Str(v);
5440 else
5441 temp = PyObject_Repr(v);
5442 if (temp == NULL)
5443 goto onError;
5444 if (!PyString_Check(temp)) {
5445 /* XXX Note: this should never happen, since
5446 PyObject_Repr() and PyObject_Str() assure
5447 this */
5448 Py_DECREF(temp);
5449 PyErr_SetString(PyExc_TypeError,
5450 "%s argument has non-string str()");
5451 goto onError;
5452 }
Fred Drakee4315f52000-05-09 19:53:39 +00005453 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005455 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 "strict");
5457 Py_DECREF(temp);
5458 temp = unicode;
5459 if (temp == NULL)
5460 goto onError;
5461 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005462 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 len = PyUnicode_GET_SIZE(temp);
5464 if (prec >= 0 && len > prec)
5465 len = prec;
5466 break;
5467
5468 case 'i':
5469 case 'd':
5470 case 'u':
5471 case 'o':
5472 case 'x':
5473 case 'X':
5474 if (c == 'i')
5475 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005476 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005477 temp = formatlong(v, flags, prec, c);
5478 if (!temp)
5479 goto onError;
5480 pbuf = PyUnicode_AS_UNICODE(temp);
5481 len = PyUnicode_GET_SIZE(temp);
5482 /* unbounded ints can always produce
5483 a sign character! */
5484 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005486 else {
5487 pbuf = formatbuf;
5488 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5489 flags, prec, c, v);
5490 if (len < 0)
5491 goto onError;
5492 /* only d conversion is signed */
5493 sign = c == 'd';
5494 }
5495 if (flags & F_ZERO)
5496 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 break;
5498
5499 case 'e':
5500 case 'E':
5501 case 'f':
5502 case 'g':
5503 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005504 pbuf = formatbuf;
5505 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5506 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (len < 0)
5508 goto onError;
5509 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005510 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 fill = '0';
5512 break;
5513
5514 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005515 pbuf = formatbuf;
5516 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 if (len < 0)
5518 goto onError;
5519 break;
5520
5521 default:
5522 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005523 "unsupported format character '%c' (0x%x) "
5524 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005525 (31<=c && c<=126) ? c : '?',
5526 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 goto onError;
5528 }
5529 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005530 if (*pbuf == '-' || *pbuf == '+') {
5531 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 len--;
5533 }
5534 else if (flags & F_SIGN)
5535 sign = '+';
5536 else if (flags & F_BLANK)
5537 sign = ' ';
5538 else
5539 sign = 0;
5540 }
5541 if (width < len)
5542 width = len;
5543 if (rescnt < width + (sign != 0)) {
5544 reslen -= rescnt;
5545 rescnt = width + fmtcnt + 100;
5546 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005547 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 return NULL;
5549 res = PyUnicode_AS_UNICODE(result)
5550 + reslen - rescnt;
5551 }
5552 if (sign) {
5553 if (fill != ' ')
5554 *res++ = sign;
5555 rescnt--;
5556 if (width > len)
5557 width--;
5558 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005559 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5560 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005561 assert(pbuf[1] == c);
5562 if (fill != ' ') {
5563 *res++ = *pbuf++;
5564 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005565 }
Tim Petersfff53252001-04-12 18:38:48 +00005566 rescnt -= 2;
5567 width -= 2;
5568 if (width < 0)
5569 width = 0;
5570 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (width > len && !(flags & F_LJUST)) {
5573 do {
5574 --rescnt;
5575 *res++ = fill;
5576 } while (--width > len);
5577 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005578 if (fill == ' ') {
5579 if (sign)
5580 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005581 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005582 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005583 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005584 *res++ = *pbuf++;
5585 *res++ = *pbuf++;
5586 }
5587 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005588 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 res += len;
5590 rescnt -= len;
5591 while (--width >= len) {
5592 --rescnt;
5593 *res++ = ' ';
5594 }
5595 if (dict && (argidx < arglen) && c != '%') {
5596 PyErr_SetString(PyExc_TypeError,
5597 "not all arguments converted");
5598 goto onError;
5599 }
5600 Py_XDECREF(temp);
5601 } /* '%' */
5602 } /* until end */
5603 if (argidx < arglen && !dict) {
5604 PyErr_SetString(PyExc_TypeError,
5605 "not all arguments converted");
5606 goto onError;
5607 }
5608
5609 if (args_owned) {
5610 Py_DECREF(args);
5611 }
5612 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005613 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 return (PyObject *)result;
5616
5617 onError:
5618 Py_XDECREF(result);
5619 Py_DECREF(uformat);
5620 if (args_owned) {
5621 Py_DECREF(args);
5622 }
5623 return NULL;
5624}
5625
5626static PyBufferProcs unicode_as_buffer = {
5627 (getreadbufferproc) unicode_buffer_getreadbuf,
5628 (getwritebufferproc) unicode_buffer_getwritebuf,
5629 (getsegcountproc) unicode_buffer_getsegcount,
5630 (getcharbufferproc) unicode_buffer_getcharbuf,
5631};
5632
Guido van Rossume023fe02001-08-30 03:12:59 +00005633staticforward PyObject *
5634unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5635
Tim Peters6d6c1a32001-08-02 04:15:00 +00005636static PyObject *
5637unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5638{
5639 PyObject *x = NULL;
5640 static char *kwlist[] = {"string", "encoding", "errors", 0};
5641 char *encoding = NULL;
5642 char *errors = NULL;
5643
Guido van Rossume023fe02001-08-30 03:12:59 +00005644 if (type != &PyUnicode_Type)
5645 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005646 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5647 kwlist, &x, &encoding, &errors))
5648 return NULL;
5649 if (x == NULL)
5650 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005651 if (encoding == NULL && errors == NULL)
5652 return PyObject_Unicode(x);
5653 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005654 return PyUnicode_FromEncodedObject(x, encoding, errors);
5655}
5656
Guido van Rossume023fe02001-08-30 03:12:59 +00005657static PyObject *
5658unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5659{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005660 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005661 int n;
5662
5663 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5664 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5665 if (tmp == NULL)
5666 return NULL;
5667 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005668 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5669 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005670 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005671 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5672 if (pnew->str == NULL) {
5673 _Py_ForgetReference((PyObject *)pnew);
5674 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005675 return NULL;
5676 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005677 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5678 pnew->length = n;
5679 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005680 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005681 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005682}
5683
/* Docstring of the unicode type. */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n"
"\n"
"Create a new Unicode object from the given encoded string.\n"
"encoding defaults to the current default string encoding and \n"
"errors, defining the error handling, to 'strict'.";
5690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691PyTypeObject PyUnicode_Type = {
5692 PyObject_HEAD_INIT(&PyType_Type)
5693 0, /* ob_size */
5694 "unicode", /* tp_name */
5695 sizeof(PyUnicodeObject), /* tp_size */
5696 0, /* tp_itemsize */
5697 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005698 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005700 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 0, /* tp_setattr */
5702 (cmpfunc) unicode_compare, /* tp_compare */
5703 (reprfunc) unicode_repr, /* tp_repr */
5704 0, /* tp_as_number */
5705 &unicode_as_sequence, /* tp_as_sequence */
5706 0, /* tp_as_mapping */
5707 (hashfunc) unicode_hash, /* tp_hash*/
5708 0, /* tp_call*/
5709 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005710 PyObject_GenericGetAttr, /* tp_getattro */
5711 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005713 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005714 unicode_doc, /* tp_doc */
5715 0, /* tp_traverse */
5716 0, /* tp_clear */
5717 0, /* tp_richcompare */
5718 0, /* tp_weaklistoffset */
5719 0, /* tp_iter */
5720 0, /* tp_iternext */
5721 unicode_methods, /* tp_methods */
5722 0, /* tp_members */
5723 0, /* tp_getset */
5724 0, /* tp_base */
5725 0, /* tp_dict */
5726 0, /* tp_descr_get */
5727 0, /* tp_descr_set */
5728 0, /* tp_dictoffset */
5729 0, /* tp_init */
5730 0, /* tp_alloc */
5731 unicode_new, /* tp_new */
Guido van Rossum9475a232001-10-05 20:51:39 +00005732 _PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733};
5734
5735/* Initialize the Unicode implementation */
5736
Thomas Wouters78890102000-07-22 19:25:51 +00005737void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005739 int i;
5740
Fred Drakee4315f52000-05-09 19:53:39 +00005741 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005742 unicode_freelist = NULL;
5743 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005745 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005746 for (i = 0; i < 256; i++)
5747 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748}
5749
5750/* Finalize the Unicode implementation */
5751
5752void
Thomas Wouters78890102000-07-22 19:25:51 +00005753_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005755 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005756 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005758 Py_XDECREF(unicode_empty);
5759 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005760
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005761 for (i = 0; i < 256; i++) {
5762 if (unicode_latin1[i]) {
5763 Py_DECREF(unicode_latin1[i]);
5764 unicode_latin1[i] = NULL;
5765 }
5766 }
5767
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005768 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 PyUnicodeObject *v = u;
5770 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005771 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005772 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005773 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005774 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005776 unicode_freelist = NULL;
5777 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778}