blob: 596cb3830b481a6368bf71de253e4fac7979d126 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for UTF-7: indicates whether
   a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True if character c must be base64-encoded: anything above 127, any
   class 1 character, whitespace when encodeWS is set and Set O characters
   when encodeO is set.  c must be non-negative; it is evaluated more than
   once. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  The range test comes first because
   isalnum() is only defined for values representable as unsigned char
   (or EOF), while callers pass arbitrary Py_UNICODE values. */
#define B64CHAR(c) ((c) < 128 && (isalnum(c) || (c) == '+' || (c) == '/'))

/* Value of base64 digit c; only meaningful when B64CHAR(c) holds. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least 6 of the `bits`
   pending bits in ch remain; bits is reduced accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least 16 of the `bits`
   pending bits in ch remain.  Values in the range 0xDC00..0xDFFF are
   reported as errors: the macro sets `errmsg` and jumps to the label
   `utf7Error`, both of which must exist in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
/* Retire the helper macros used by the codec above so that they do not
   leak into the remainder of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Sequence length implied by every possible UTF-8 lead byte (see RFC 2279
   for details).  A zero entry marks a byte that is illegal as a prefix. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference only. */
#if 0
/* Apply the error policy named by errors to a UTF-8 encoding problem:
   raise for NULL/"strict" or an unknown policy (returning -1), do nothing
   for "ignore", and emit '?' into **dest for "replace" (returning 0). */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001175/* Allocation strategy: we default to Latin-1, then do one resize
1176 whenever we hit an order boundary. The assumption is that
1177 characters from higher orders usually occur often enough to warrant
1178 this.
1179*/
1180
Tim Peters7e3d9612002-04-21 03:26:37 +00001181PyObject *
1182PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1183 int size,
1184 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185{
1186 PyObject *v;
1187 char *p;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001188 int i = 0;
1189 int overalloc = 2;
1190 int len;
1191
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001192 /* Short-cut for emtpy strings */
1193 if (size == 0)
1194 return PyString_FromStringAndSize(NULL, 0);
1195
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001196 v = PyString_FromStringAndSize(NULL, overalloc * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 if (v == NULL)
1198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001200 p = PyString_AS_STRING(v);
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001201
1202 while (i < size) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001203 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001204
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001205 if (ch < 0x80)
1206 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001208
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 else if (ch < 0x0800) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001210 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001211 *p++ = (char)(0xc0 | (ch >> 6));
1212 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001213 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001214
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001215 else {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001216 /* Encode UCS2 Unicode ordinals */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001217 if (ch < 0x10000) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001218
1219 /* Special case: check for high surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001220 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1221 Py_UCS4 ch2 = s[i];
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001222 /* Check for low surrogate and combine the two to
1223 form a UCS4 value */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001224 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001225 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1226 i++;
1227 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001228 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001229 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001230 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001231
1232 if (overalloc < 3) {
1233 len = (int)(p - PyString_AS_STRING(v));
1234 overalloc = 3;
1235 if (_PyString_Resize(&v, overalloc * size))
1236 goto onError;
1237 p = PyString_AS_STRING(v) + len;
1238 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001239 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001240 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1241 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001242 continue;
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001243 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001244
1245 /* Encode UCS4 Unicode ordinals */
1246 encodeUCS4:
1247 if (overalloc < 4) {
1248 len = (int)(p - PyString_AS_STRING(v));
1249 overalloc = 4;
1250 if (_PyString_Resize(&v, overalloc * size))
1251 goto onError;
1252 p = PyString_AS_STRING(v) + len;
1253 }
1254 *p++ = (char)(0xf0 | (ch >> 18));
1255 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1256 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1257 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001260 *p = '\0';
1261 assert((p - PyString_AS_STRING(v)) <= overalloc*size);
1262 if (_PyString_Resize(&v, (int)(p - PyString_AS_STRING(v))))
1263 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001265
1266 onError:
1267 Py_DECREF(v);
1268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269}
1270
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1272{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (!PyUnicode_Check(unicode)) {
1274 PyErr_BadArgument();
1275 return NULL;
1276 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001277 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1278 PyUnicode_GET_SIZE(unicode),
1279 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280}
1281
1282/* --- UTF-16 Codec ------------------------------------------------------- */
1283
1284static
Tim Peters772747b2001-08-09 22:21:55 +00001285int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 const char *errors,
1287 const char *details)
1288{
1289 if ((errors == NULL) ||
1290 (strcmp(errors,"strict") == 0)) {
1291 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001292 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 details);
1294 return -1;
1295 }
1296 else if (strcmp(errors,"ignore") == 0) {
1297 return 0;
1298 }
1299 else if (strcmp(errors,"replace") == 0) {
1300 if (dest) {
1301 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1302 (*dest)++;
1303 }
1304 return 0;
1305 }
1306 else {
1307 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001308 "UTF-16 decoding error; "
1309 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 errors);
1311 return -1;
1312 }
1313}
1314
Tim Peters772747b2001-08-09 22:21:55 +00001315PyObject *
1316PyUnicode_DecodeUTF16(const char *s,
1317 int size,
1318 const char *errors,
1319 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320{
1321 PyUnicodeObject *unicode;
1322 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001323 const unsigned char *q, *e;
1324 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001326 /* Offsets from q for retrieving byte pairs in the right order. */
1327#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1328 int ihi = 1, ilo = 0;
1329#else
1330 int ihi = 0, ilo = 1;
1331#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332
1333 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001334 if (size & 1) {
1335 if (utf16_decoding_error(NULL, errors, "truncated data"))
1336 return NULL;
1337 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338 }
1339
1340 /* Note: size will always be longer than the resulting Unicode
1341 character count */
1342 unicode = _PyUnicode_New(size);
1343 if (!unicode)
1344 return NULL;
1345 if (size == 0)
1346 return (PyObject *)unicode;
1347
1348 /* Unpack UTF-16 encoded data */
1349 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001350 q = (unsigned char *)s;
1351 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352
1353 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001354 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001356 /* Check for BOM marks (U+FEFF) in the input and adjust current
1357 byte order setting accordingly. In native mode, the leading BOM
1358 mark is skipped, in all other modes, it is copied to the output
1359 stream as-is (giving a ZWNBSP character). */
1360 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001361 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001362#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001363 if (bom == 0xFEFF) {
1364 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001365 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001366 }
1367 else if (bom == 0xFFFE) {
1368 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001369 bo = 1;
1370 }
1371#else
Tim Peters772747b2001-08-09 22:21:55 +00001372 if (bom == 0xFEFF) {
1373 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001374 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001375 }
1376 else if (bom == 0xFFFE) {
1377 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001378 bo = -1;
1379 }
1380#endif
1381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382
Tim Peters772747b2001-08-09 22:21:55 +00001383 if (bo == -1) {
1384 /* force LE */
1385 ihi = 1;
1386 ilo = 0;
1387 }
1388 else if (bo == 1) {
1389 /* force BE */
1390 ihi = 0;
1391 ilo = 1;
1392 }
1393
1394 while (q < e) {
1395 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1396 q += 2;
1397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 if (ch < 0xD800 || ch > 0xDFFF) {
1399 *p++ = ch;
1400 continue;
1401 }
1402
1403 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001404 if (q >= e) {
1405 errmsg = "unexpected end of data";
1406 goto utf16Error;
1407 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001408 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001409 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1410 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001411 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001412#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001413 *p++ = ch;
1414 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001415#else
1416 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001417#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001418 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001419 }
1420 else {
1421 errmsg = "illegal UTF-16 surrogate";
1422 goto utf16Error;
1423 }
1424
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001426 errmsg = "illegal encoding";
1427 /* Fall through to report the error */
1428
1429 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001430 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 }
1433
1434 if (byteorder)
1435 *byteorder = bo;
1436
1437 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001438 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 goto onError;
1440
1441 return (PyObject *)unicode;
1442
1443onError:
1444 Py_DECREF(unicode);
1445 return NULL;
1446}
1447
Tim Peters772747b2001-08-09 22:21:55 +00001448PyObject *
1449PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1450 int size,
1451 const char *errors,
1452 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453{
1454 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001455 unsigned char *p;
1456 int i, pairs;
1457 /* Offsets from p for storing byte pairs in the right order. */
1458#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1459 int ihi = 1, ilo = 0;
1460#else
1461 int ihi = 0, ilo = 1;
1462#endif
1463
1464#define STORECHAR(CH) \
1465 do { \
1466 p[ihi] = ((CH) >> 8) & 0xff; \
1467 p[ilo] = (CH) & 0xff; \
1468 p += 2; \
1469 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001471 for (i = pairs = 0; i < size; i++)
1472 if (s[i] >= 0x10000)
1473 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001475 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476 if (v == NULL)
1477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478
Tim Peters772747b2001-08-09 22:21:55 +00001479 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001481 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001482 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001483 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001484
1485 if (byteorder == -1) {
1486 /* force LE */
1487 ihi = 1;
1488 ilo = 0;
1489 }
1490 else if (byteorder == 1) {
1491 /* force BE */
1492 ihi = 0;
1493 ilo = 1;
1494 }
1495
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001496 while (size-- > 0) {
1497 Py_UNICODE ch = *s++;
1498 Py_UNICODE ch2 = 0;
1499 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001500 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1501 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 }
Tim Peters772747b2001-08-09 22:21:55 +00001503 STORECHAR(ch);
1504 if (ch2)
1505 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001508#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509}
1510
1511PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1512{
1513 if (!PyUnicode_Check(unicode)) {
1514 PyErr_BadArgument();
1515 return NULL;
1516 }
1517 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1518 PyUnicode_GET_SIZE(unicode),
1519 NULL,
1520 0);
1521}
1522
1523/* --- Unicode Escape Codec ----------------------------------------------- */
1524
1525static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001526int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527 const char *errors,
1528 const char *details)
1529{
1530 if ((errors == NULL) ||
1531 (strcmp(errors,"strict") == 0)) {
1532 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001533 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 details);
1535 return -1;
1536 }
1537 else if (strcmp(errors,"ignore") == 0) {
1538 return 0;
1539 }
1540 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001541 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1542 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return 0;
1544 }
1545 else {
1546 PyErr_Format(PyExc_ValueError,
1547 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001548 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 errors);
1550 return -1;
1551 }
1552}
1553
Fredrik Lundh06d12682001-01-24 07:59:11 +00001554static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001555
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1557 int size,
1558 const char *errors)
1559{
1560 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001561 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001563 char* message;
1564 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 /* Escaped strings will always be longer than the resulting
1567 Unicode string, so we start with size here and then reduce the
1568 length after conversion to the true value. */
1569 v = _PyUnicode_New(size);
1570 if (v == NULL)
1571 goto onError;
1572 if (size == 0)
1573 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001574
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575 p = buf = PyUnicode_AS_UNICODE(v);
1576 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001577
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578 while (s < end) {
1579 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001580 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001581 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582
1583 /* Non-escape characters are interpreted as Unicode ordinals */
1584 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001585 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 continue;
1587 }
1588
1589 /* \ - Escapes */
1590 s++;
1591 switch (*s++) {
1592
1593 /* \x escapes */
1594 case '\n': break;
1595 case '\\': *p++ = '\\'; break;
1596 case '\'': *p++ = '\''; break;
1597 case '\"': *p++ = '\"'; break;
1598 case 'b': *p++ = '\b'; break;
1599 case 'f': *p++ = '\014'; break; /* FF */
1600 case 't': *p++ = '\t'; break;
1601 case 'n': *p++ = '\n'; break;
1602 case 'r': *p++ = '\r'; break;
1603 case 'v': *p++ = '\013'; break; /* VT */
1604 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1605
1606 /* \OOO (octal) escapes */
1607 case '0': case '1': case '2': case '3':
1608 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001609 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001611 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001613 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001615 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 break;
1617
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 /* hex escapes */
1619 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001621 digits = 2;
1622 message = "truncated \\xXX escape";
1623 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001627 digits = 4;
1628 message = "truncated \\uXXXX escape";
1629 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001632 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 digits = 8;
1634 message = "truncated \\UXXXXXXXX escape";
1635 hexescape:
1636 chr = 0;
1637 for (i = 0; i < digits; i++) {
1638 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001639 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001640 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001641 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001642 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001643 i++;
1644 break;
1645 }
1646 chr = (chr<<4) & ~0xF;
1647 if (c >= '0' && c <= '9')
1648 chr += c - '0';
1649 else if (c >= 'a' && c <= 'f')
1650 chr += 10 + c - 'a';
1651 else
1652 chr += 10 + c - 'A';
1653 }
1654 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001655 if (chr == 0xffffffff)
1656 /* _decoding_error will have already written into the
1657 target buffer. */
1658 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001659 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001660 /* when we get here, chr is a 32-bit unicode character */
1661 if (chr <= 0xffff)
1662 /* UCS-2 character */
1663 *p++ = (Py_UNICODE) chr;
1664 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001665 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001666 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001667#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001668 *p++ = chr;
1669#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001670 chr -= 0x10000L;
1671 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001672 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001673#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001674 } else {
1675 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001676 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001677 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001678 )
1679 goto onError;
1680 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001681 break;
1682
1683 /* \N{name} */
1684 case 'N':
1685 message = "malformed \\N character escape";
1686 if (ucnhash_CAPI == NULL) {
1687 /* load the unicode data module */
1688 PyObject *m, *v;
1689 m = PyImport_ImportModule("unicodedata");
1690 if (m == NULL)
1691 goto ucnhashError;
1692 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1693 Py_DECREF(m);
1694 if (v == NULL)
1695 goto ucnhashError;
1696 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1697 Py_DECREF(v);
1698 if (ucnhash_CAPI == NULL)
1699 goto ucnhashError;
1700 }
1701 if (*s == '{') {
1702 const char *start = s+1;
1703 /* look for the closing brace */
1704 while (*s != '}' && s < end)
1705 s++;
1706 if (s > start && s < end && *s == '}') {
1707 /* found a name. look it up in the unicode database */
1708 message = "unknown Unicode character name";
1709 s++;
1710 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1711 goto store;
1712 }
1713 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001714 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001716 break;
1717
1718 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001719 if (s > end) {
1720 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1721 goto onError;
1722 }
1723 else {
1724 *p++ = '\\';
1725 *p++ = (unsigned char)s[-1];
1726 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001727 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 }
1729 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001730 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001731 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001733
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001735 PyErr_SetString(
1736 PyExc_UnicodeError,
1737 "\\N escapes not supported (can't load unicodedata module)"
1738 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001739 return NULL;
1740
Fredrik Lundhccc74732001-02-18 22:13:49 +00001741onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 Py_XDECREF(v);
1743 return NULL;
1744}
1745
1746/* Return a Unicode-Escape string version of the Unicode object.
1747
1748 If quotes is true, the string is enclosed in u"" or u'' quotes as
1749 appropriate.
1750
1751*/
1752
Barry Warsaw51ac5802000-03-20 16:36:48 +00001753static const Py_UNICODE *findchar(const Py_UNICODE *s,
1754 int size,
1755 Py_UNICODE ch);
1756
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757static
1758PyObject *unicodeescape_string(const Py_UNICODE *s,
1759 int size,
1760 int quotes)
1761{
1762 PyObject *repr;
1763 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001765 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766
1767 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1768 if (repr == NULL)
1769 return NULL;
1770
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001771 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772
1773 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 *p++ = 'u';
1775 *p++ = (findchar(s, size, '\'') &&
1776 !findchar(s, size, '"')) ? '"' : '\'';
1777 }
1778 while (size-- > 0) {
1779 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001780
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001782 if (quotes &&
1783 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 *p++ = '\\';
1785 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001786 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001788
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001789#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001790 /* Map 21-bit characters to '\U00xxxxxx' */
1791 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001792 int offset = p - PyString_AS_STRING(repr);
1793
1794 /* Resize the string if necessary */
1795 if (offset + 12 > PyString_GET_SIZE(repr)) {
1796 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1797 goto onError;
1798 p = PyString_AS_STRING(repr) + offset;
1799 }
1800
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001801 *p++ = '\\';
1802 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001803 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1804 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1805 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1806 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1807 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1808 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1809 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001810 *p++ = hexdigit[ch & 0x0000000F];
1811 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001812 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001813#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001814 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1815 else if (ch >= 0xD800 && ch < 0xDC00) {
1816 Py_UNICODE ch2;
1817 Py_UCS4 ucs;
1818
1819 ch2 = *s++;
1820 size--;
1821 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1822 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1823 *p++ = '\\';
1824 *p++ = 'U';
1825 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1826 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1827 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1828 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1829 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1830 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1831 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1832 *p++ = hexdigit[ucs & 0x0000000F];
1833 continue;
1834 }
1835 /* Fall through: isolated surrogates are copied as-is */
1836 s--;
1837 size++;
1838 }
1839
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001841 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 *p++ = '\\';
1843 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001844 *p++ = hexdigit[(ch >> 12) & 0x000F];
1845 *p++ = hexdigit[(ch >> 8) & 0x000F];
1846 *p++ = hexdigit[(ch >> 4) & 0x000F];
1847 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001849
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001850 /* Map special whitespace to '\t', \n', '\r' */
1851 else if (ch == '\t') {
1852 *p++ = '\\';
1853 *p++ = 't';
1854 }
1855 else if (ch == '\n') {
1856 *p++ = '\\';
1857 *p++ = 'n';
1858 }
1859 else if (ch == '\r') {
1860 *p++ = '\\';
1861 *p++ = 'r';
1862 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001863
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001864 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001865 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001867 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001868 *p++ = hexdigit[(ch >> 4) & 0x000F];
1869 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001871
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 /* Copy everything else as-is */
1873 else
1874 *p++ = (char) ch;
1875 }
1876 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001877 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878
1879 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001880 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001881 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882
1883 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001884
1885 onError:
1886 Py_DECREF(repr);
1887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888}
1889
1890PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1891 int size)
1892{
1893 return unicodeescape_string(s, size, 0);
1894}
1895
1896PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1897{
1898 if (!PyUnicode_Check(unicode)) {
1899 PyErr_BadArgument();
1900 return NULL;
1901 }
1902 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1903 PyUnicode_GET_SIZE(unicode));
1904}
1905
1906/* --- Raw Unicode Escape Codec ------------------------------------------- */
1907
1908PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1909 int size,
1910 const char *errors)
1911{
1912 PyUnicodeObject *v;
1913 Py_UNICODE *p, *buf;
1914 const char *end;
1915 const char *bs;
1916
1917 /* Escaped strings will always be longer than the resulting
1918 Unicode string, so we start with size here and then reduce the
1919 length after conversion to the true value. */
1920 v = _PyUnicode_New(size);
1921 if (v == NULL)
1922 goto onError;
1923 if (size == 0)
1924 return (PyObject *)v;
1925 p = buf = PyUnicode_AS_UNICODE(v);
1926 end = s + size;
1927 while (s < end) {
1928 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001929 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 int i;
1931
1932 /* Non-escape characters are interpreted as Unicode ordinals */
1933 if (*s != '\\') {
1934 *p++ = (unsigned char)*s++;
1935 continue;
1936 }
1937
1938 /* \u-escapes are only interpreted iff the number of leading
1939 backslashes if odd */
1940 bs = s;
1941 for (;s < end;) {
1942 if (*s != '\\')
1943 break;
1944 *p++ = (unsigned char)*s++;
1945 }
1946 if (((s - bs) & 1) == 0 ||
1947 s >= end ||
1948 *s != 'u') {
1949 continue;
1950 }
1951 p--;
1952 s++;
1953
1954 /* \uXXXX with 4 hex digits */
1955 for (x = 0, i = 0; i < 4; i++) {
1956 c = (unsigned char)s[i];
1957 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001958 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 "truncated \\uXXXX"))
1960 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001961 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 i++;
1963 break;
1964 }
1965 x = (x<<4) & ~0xF;
1966 if (c >= '0' && c <= '9')
1967 x += c - '0';
1968 else if (c >= 'a' && c <= 'f')
1969 x += 10 + c - 'a';
1970 else
1971 x += 10 + c - 'A';
1972 }
1973 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001974 if (x != 0xffffffff)
1975 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001977 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return (PyObject *)v;
1980
1981 onError:
1982 Py_XDECREF(v);
1983 return NULL;
1984}
1985
1986PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1987 int size)
1988{
1989 PyObject *repr;
1990 char *p;
1991 char *q;
1992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001993 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 repr = PyString_FromStringAndSize(NULL, 6 * size);
1996 if (repr == NULL)
1997 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001998 if (size == 0)
1999 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
2001 p = q = PyString_AS_STRING(repr);
2002 while (size-- > 0) {
2003 Py_UNICODE ch = *s++;
2004 /* Map 16-bit characters to '\uxxxx' */
2005 if (ch >= 256) {
2006 *p++ = '\\';
2007 *p++ = 'u';
2008 *p++ = hexdigit[(ch >> 12) & 0xf];
2009 *p++ = hexdigit[(ch >> 8) & 0xf];
2010 *p++ = hexdigit[(ch >> 4) & 0xf];
2011 *p++ = hexdigit[ch & 15];
2012 }
2013 /* Copy everything else as-is */
2014 else
2015 *p++ = (char) ch;
2016 }
2017 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002018 if (_PyString_Resize(&repr, p - q))
2019 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020
2021 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002022
2023 onError:
2024 Py_DECREF(repr);
2025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026}
2027
2028PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2029{
2030 if (!PyUnicode_Check(unicode)) {
2031 PyErr_BadArgument();
2032 return NULL;
2033 }
2034 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2035 PyUnicode_GET_SIZE(unicode));
2036}
2037
2038/* --- Latin-1 Codec ------------------------------------------------------ */
2039
2040PyObject *PyUnicode_DecodeLatin1(const char *s,
2041 int size,
2042 const char *errors)
2043{
2044 PyUnicodeObject *v;
2045 Py_UNICODE *p;
2046
2047 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 if (size == 1 && *(unsigned char*)s < 256) {
2049 Py_UNICODE r = *(unsigned char*)s;
2050 return PyUnicode_FromUnicode(&r, 1);
2051 }
2052
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 v = _PyUnicode_New(size);
2054 if (v == NULL)
2055 goto onError;
2056 if (size == 0)
2057 return (PyObject *)v;
2058 p = PyUnicode_AS_UNICODE(v);
2059 while (size-- > 0)
2060 *p++ = (unsigned char)*s++;
2061 return (PyObject *)v;
2062
2063 onError:
2064 Py_XDECREF(v);
2065 return NULL;
2066}
2067
2068static
2069int latin1_encoding_error(const Py_UNICODE **source,
2070 char **dest,
2071 const char *errors,
2072 const char *details)
2073{
2074 if ((errors == NULL) ||
2075 (strcmp(errors,"strict") == 0)) {
2076 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002077 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 details);
2079 return -1;
2080 }
2081 else if (strcmp(errors,"ignore") == 0) {
2082 return 0;
2083 }
2084 else if (strcmp(errors,"replace") == 0) {
2085 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002086 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 return 0;
2088 }
2089 else {
2090 PyErr_Format(PyExc_ValueError,
2091 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002092 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 errors);
2094 return -1;
2095 }
2096}
2097
2098PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2099 int size,
2100 const char *errors)
2101{
2102 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002103 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002104
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 repr = PyString_FromStringAndSize(NULL, size);
2106 if (repr == NULL)
2107 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002108 if (size == 0)
2109 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110
2111 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002112 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 while (size-- > 0) {
2114 Py_UNICODE ch = *p++;
2115 if (ch >= 256) {
2116 if (latin1_encoding_error(&p, &s, errors,
2117 "ordinal not in range(256)"))
2118 goto onError;
2119 }
2120 else
2121 *s++ = (char)ch;
2122 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002123 /* Resize if error handling skipped some characters */
2124 if (s - start < PyString_GET_SIZE(repr))
2125 if (_PyString_Resize(&repr, s - start))
2126 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return repr;
2128
2129 onError:
2130 Py_DECREF(repr);
2131 return NULL;
2132}
2133
2134PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2135{
2136 if (!PyUnicode_Check(unicode)) {
2137 PyErr_BadArgument();
2138 return NULL;
2139 }
2140 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2141 PyUnicode_GET_SIZE(unicode),
2142 NULL);
2143}
2144
2145/* --- 7-bit ASCII Codec -------------------------------------------------- */
2146
2147static
2148int ascii_decoding_error(const char **source,
2149 Py_UNICODE **dest,
2150 const char *errors,
2151 const char *details)
2152{
2153 if ((errors == NULL) ||
2154 (strcmp(errors,"strict") == 0)) {
2155 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002156 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 details);
2158 return -1;
2159 }
2160 else if (strcmp(errors,"ignore") == 0) {
2161 return 0;
2162 }
2163 else if (strcmp(errors,"replace") == 0) {
2164 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2165 (*dest)++;
2166 return 0;
2167 }
2168 else {
2169 PyErr_Format(PyExc_ValueError,
2170 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002171 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 errors);
2173 return -1;
2174 }
2175}
2176
2177PyObject *PyUnicode_DecodeASCII(const char *s,
2178 int size,
2179 const char *errors)
2180{
2181 PyUnicodeObject *v;
2182 Py_UNICODE *p;
2183
2184 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002185 if (size == 1 && *(unsigned char*)s < 128) {
2186 Py_UNICODE r = *(unsigned char*)s;
2187 return PyUnicode_FromUnicode(&r, 1);
2188 }
2189
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 v = _PyUnicode_New(size);
2191 if (v == NULL)
2192 goto onError;
2193 if (size == 0)
2194 return (PyObject *)v;
2195 p = PyUnicode_AS_UNICODE(v);
2196 while (size-- > 0) {
2197 register unsigned char c;
2198
2199 c = (unsigned char)*s++;
2200 if (c < 128)
2201 *p++ = c;
2202 else if (ascii_decoding_error(&s, &p, errors,
2203 "ordinal not in range(128)"))
2204 goto onError;
2205 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002207 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002208 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)v;
2210
2211 onError:
2212 Py_XDECREF(v);
2213 return NULL;
2214}
2215
2216static
2217int ascii_encoding_error(const Py_UNICODE **source,
2218 char **dest,
2219 const char *errors,
2220 const char *details)
2221{
2222 if ((errors == NULL) ||
2223 (strcmp(errors,"strict") == 0)) {
2224 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002225 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 details);
2227 return -1;
2228 }
2229 else if (strcmp(errors,"ignore") == 0) {
2230 return 0;
2231 }
2232 else if (strcmp(errors,"replace") == 0) {
2233 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002234 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235 return 0;
2236 }
2237 else {
2238 PyErr_Format(PyExc_ValueError,
2239 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002240 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 errors);
2242 return -1;
2243 }
2244}
2245
2246PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2247 int size,
2248 const char *errors)
2249{
2250 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002251 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002252
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253 repr = PyString_FromStringAndSize(NULL, size);
2254 if (repr == NULL)
2255 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002256 if (size == 0)
2257 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258
2259 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002260 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 while (size-- > 0) {
2262 Py_UNICODE ch = *p++;
2263 if (ch >= 128) {
2264 if (ascii_encoding_error(&p, &s, errors,
2265 "ordinal not in range(128)"))
2266 goto onError;
2267 }
2268 else
2269 *s++ = (char)ch;
2270 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002271 /* Resize if error handling skipped some characters */
2272 if (s - start < PyString_GET_SIZE(repr))
2273 if (_PyString_Resize(&repr, s - start))
2274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275 return repr;
2276
2277 onError:
2278 Py_DECREF(repr);
2279 return NULL;
2280}
2281
2282PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2283{
2284 if (!PyUnicode_Check(unicode)) {
2285 PyErr_BadArgument();
2286 return NULL;
2287 }
2288 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2289 PyUnicode_GET_SIZE(unicode),
2290 NULL);
2291}
2292
Fredrik Lundh30831632001-06-26 15:11:00 +00002293#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002294
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002295/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002296
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002297PyObject *PyUnicode_DecodeMBCS(const char *s,
2298 int size,
2299 const char *errors)
2300{
2301 PyUnicodeObject *v;
2302 Py_UNICODE *p;
2303
2304 /* First get the size of the result */
2305 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002306 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002307 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2308
2309 v = _PyUnicode_New(usize);
2310 if (v == NULL)
2311 return NULL;
2312 if (usize == 0)
2313 return (PyObject *)v;
2314 p = PyUnicode_AS_UNICODE(v);
2315 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2316 Py_DECREF(v);
2317 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2318 }
2319
2320 return (PyObject *)v;
2321}
2322
2323PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2324 int size,
2325 const char *errors)
2326{
2327 PyObject *repr;
2328 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002329 DWORD mbcssize;
2330
2331 /* If there are no characters, bail now! */
2332 if (size==0)
2333 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002334
2335 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002336 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002337 if (mbcssize==0)
2338 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2339
2340 repr = PyString_FromStringAndSize(NULL, mbcssize);
2341 if (repr == NULL)
2342 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002343 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002344 return repr;
2345
2346 /* Do the conversion */
2347 s = PyString_AS_STRING(repr);
2348 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2349 Py_DECREF(repr);
2350 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2351 }
2352 return repr;
2353}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002354
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002355#endif /* MS_WIN32 */
2356
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357/* --- Character Mapping Codec -------------------------------------------- */
2358
2359static
2360int charmap_decoding_error(const char **source,
2361 Py_UNICODE **dest,
2362 const char *errors,
2363 const char *details)
2364{
2365 if ((errors == NULL) ||
2366 (strcmp(errors,"strict") == 0)) {
2367 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002368 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 details);
2370 return -1;
2371 }
2372 else if (strcmp(errors,"ignore") == 0) {
2373 return 0;
2374 }
2375 else if (strcmp(errors,"replace") == 0) {
2376 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2377 (*dest)++;
2378 return 0;
2379 }
2380 else {
2381 PyErr_Format(PyExc_ValueError,
2382 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002383 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 errors);
2385 return -1;
2386 }
2387}
2388
2389PyObject *PyUnicode_DecodeCharmap(const char *s,
2390 int size,
2391 PyObject *mapping,
2392 const char *errors)
2393{
2394 PyUnicodeObject *v;
2395 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002396 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397
2398 /* Default to Latin-1 */
2399 if (mapping == NULL)
2400 return PyUnicode_DecodeLatin1(s, size, errors);
2401
2402 v = _PyUnicode_New(size);
2403 if (v == NULL)
2404 goto onError;
2405 if (size == 0)
2406 return (PyObject *)v;
2407 p = PyUnicode_AS_UNICODE(v);
2408 while (size-- > 0) {
2409 unsigned char ch = *s++;
2410 PyObject *w, *x;
2411
2412 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2413 w = PyInt_FromLong((long)ch);
2414 if (w == NULL)
2415 goto onError;
2416 x = PyObject_GetItem(mapping, w);
2417 Py_DECREF(w);
2418 if (x == NULL) {
2419 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002420 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002422 x = Py_None;
2423 Py_INCREF(x);
2424 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 }
2427
2428 /* Apply mapping */
2429 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002430 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 if (value < 0 || value > 65535) {
2432 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002433 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 Py_DECREF(x);
2435 goto onError;
2436 }
2437 *p++ = (Py_UNICODE)value;
2438 }
2439 else if (x == Py_None) {
2440 /* undefined mapping */
2441 if (charmap_decoding_error(&s, &p, errors,
2442 "character maps to <undefined>")) {
2443 Py_DECREF(x);
2444 goto onError;
2445 }
2446 }
2447 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002448 int targetsize = PyUnicode_GET_SIZE(x);
2449
2450 if (targetsize == 1)
2451 /* 1-1 mapping */
2452 *p++ = *PyUnicode_AS_UNICODE(x);
2453
2454 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002456 if (targetsize > extrachars) {
2457 /* resize first */
2458 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2459 int needed = (targetsize - extrachars) + \
2460 (targetsize << 2);
2461 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002462 if (_PyUnicode_Resize(&v,
2463 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002464 Py_DECREF(x);
2465 goto onError;
2466 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002467 p = PyUnicode_AS_UNICODE(v) + oldpos;
2468 }
2469 Py_UNICODE_COPY(p,
2470 PyUnicode_AS_UNICODE(x),
2471 targetsize);
2472 p += targetsize;
2473 extrachars -= targetsize;
2474 }
2475 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
2477 else {
2478 /* wrong return value */
2479 PyErr_SetString(PyExc_TypeError,
2480 "character mapping must return integer, None or unicode");
2481 Py_DECREF(x);
2482 goto onError;
2483 }
2484 Py_DECREF(x);
2485 }
2486 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002487 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 goto onError;
2489 return (PyObject *)v;
2490
2491 onError:
2492 Py_XDECREF(v);
2493 return NULL;
2494}
2495
2496static
2497int charmap_encoding_error(const Py_UNICODE **source,
2498 char **dest,
2499 const char *errors,
2500 const char *details)
2501{
2502 if ((errors == NULL) ||
2503 (strcmp(errors,"strict") == 0)) {
2504 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002505 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 details);
2507 return -1;
2508 }
2509 else if (strcmp(errors,"ignore") == 0) {
2510 return 0;
2511 }
2512 else if (strcmp(errors,"replace") == 0) {
2513 **dest = '?';
2514 (*dest)++;
2515 return 0;
2516 }
2517 else {
2518 PyErr_Format(PyExc_ValueError,
2519 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002520 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 errors);
2522 return -1;
2523 }
2524}
2525
2526PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2527 int size,
2528 PyObject *mapping,
2529 const char *errors)
2530{
2531 PyObject *v;
2532 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002533 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534
2535 /* Default to Latin-1 */
2536 if (mapping == NULL)
2537 return PyUnicode_EncodeLatin1(p, size, errors);
2538
2539 v = PyString_FromStringAndSize(NULL, size);
2540 if (v == NULL)
2541 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002542 if (size == 0)
2543 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 s = PyString_AS_STRING(v);
2545 while (size-- > 0) {
2546 Py_UNICODE ch = *p++;
2547 PyObject *w, *x;
2548
2549 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2550 w = PyInt_FromLong((long)ch);
2551 if (w == NULL)
2552 goto onError;
2553 x = PyObject_GetItem(mapping, w);
2554 Py_DECREF(w);
2555 if (x == NULL) {
2556 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002557 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002559 x = Py_None;
2560 Py_INCREF(x);
2561 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
2564
2565 /* Apply mapping */
2566 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002567 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 if (value < 0 || value > 255) {
2569 PyErr_SetString(PyExc_TypeError,
2570 "character mapping must be in range(256)");
2571 Py_DECREF(x);
2572 goto onError;
2573 }
2574 *s++ = (char)value;
2575 }
2576 else if (x == Py_None) {
2577 /* undefined mapping */
2578 if (charmap_encoding_error(&p, &s, errors,
2579 "character maps to <undefined>")) {
2580 Py_DECREF(x);
2581 goto onError;
2582 }
2583 }
2584 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002585 int targetsize = PyString_GET_SIZE(x);
2586
2587 if (targetsize == 1)
2588 /* 1-1 mapping */
2589 *s++ = *PyString_AS_STRING(x);
2590
2591 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002593 if (targetsize > extrachars) {
2594 /* resize first */
2595 int oldpos = (int)(s - PyString_AS_STRING(v));
2596 int needed = (targetsize - extrachars) + \
2597 (targetsize << 2);
2598 extrachars += needed;
2599 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002600 Py_DECREF(x);
2601 goto onError;
2602 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002603 s = PyString_AS_STRING(v) + oldpos;
2604 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002605 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002606 s += targetsize;
2607 extrachars -= targetsize;
2608 }
2609 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 }
2611 else {
2612 /* wrong return value */
2613 PyErr_SetString(PyExc_TypeError,
2614 "character mapping must return integer, None or unicode");
2615 Py_DECREF(x);
2616 goto onError;
2617 }
2618 Py_DECREF(x);
2619 }
2620 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2621 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2622 goto onError;
2623 return v;
2624
2625 onError:
2626 Py_DECREF(v);
2627 return NULL;
2628}
2629
2630PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2631 PyObject *mapping)
2632{
2633 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2634 PyErr_BadArgument();
2635 return NULL;
2636 }
2637 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2638 PyUnicode_GET_SIZE(unicode),
2639 mapping,
2640 NULL);
2641}
2642
2643static
2644int translate_error(const Py_UNICODE **source,
2645 Py_UNICODE **dest,
2646 const char *errors,
2647 const char *details)
2648{
2649 if ((errors == NULL) ||
2650 (strcmp(errors,"strict") == 0)) {
2651 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002652 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 details);
2654 return -1;
2655 }
2656 else if (strcmp(errors,"ignore") == 0) {
2657 return 0;
2658 }
2659 else if (strcmp(errors,"replace") == 0) {
2660 **dest = '?';
2661 (*dest)++;
2662 return 0;
2663 }
2664 else {
2665 PyErr_Format(PyExc_ValueError,
2666 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002667 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 errors);
2669 return -1;
2670 }
2671}
2672
2673PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2674 int size,
2675 PyObject *mapping,
2676 const char *errors)
2677{
2678 PyUnicodeObject *v;
2679 Py_UNICODE *p;
2680
2681 if (mapping == NULL) {
2682 PyErr_BadArgument();
2683 return NULL;
2684 }
2685
2686 /* Output will never be longer than input */
2687 v = _PyUnicode_New(size);
2688 if (v == NULL)
2689 goto onError;
2690 if (size == 0)
2691 goto done;
2692 p = PyUnicode_AS_UNICODE(v);
2693 while (size-- > 0) {
2694 Py_UNICODE ch = *s++;
2695 PyObject *w, *x;
2696
2697 /* Get mapping */
2698 w = PyInt_FromLong(ch);
2699 if (w == NULL)
2700 goto onError;
2701 x = PyObject_GetItem(mapping, w);
2702 Py_DECREF(w);
2703 if (x == NULL) {
2704 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2705 /* No mapping found: default to 1-1 mapping */
2706 PyErr_Clear();
2707 *p++ = ch;
2708 continue;
2709 }
2710 goto onError;
2711 }
2712
2713 /* Apply mapping */
2714 if (PyInt_Check(x))
2715 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2716 else if (x == Py_None) {
2717 /* undefined mapping */
2718 if (translate_error(&s, &p, errors,
2719 "character maps to <undefined>")) {
2720 Py_DECREF(x);
2721 goto onError;
2722 }
2723 }
2724 else if (PyUnicode_Check(x)) {
2725 if (PyUnicode_GET_SIZE(x) != 1) {
2726 /* 1-n mapping */
2727 PyErr_SetString(PyExc_NotImplementedError,
2728 "1-n mappings are currently not implemented");
2729 Py_DECREF(x);
2730 goto onError;
2731 }
2732 *p++ = *PyUnicode_AS_UNICODE(x);
2733 }
2734 else {
2735 /* wrong return value */
2736 PyErr_SetString(PyExc_TypeError,
2737 "translate mapping must return integer, None or unicode");
2738 Py_DECREF(x);
2739 goto onError;
2740 }
2741 Py_DECREF(x);
2742 }
2743 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002744 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002745 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746
2747 done:
2748 return (PyObject *)v;
2749
2750 onError:
2751 Py_XDECREF(v);
2752 return NULL;
2753}
2754
2755PyObject *PyUnicode_Translate(PyObject *str,
2756 PyObject *mapping,
2757 const char *errors)
2758{
2759 PyObject *result;
2760
2761 str = PyUnicode_FromObject(str);
2762 if (str == NULL)
2763 goto onError;
2764 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2765 PyUnicode_GET_SIZE(str),
2766 mapping,
2767 errors);
2768 Py_DECREF(str);
2769 return result;
2770
2771 onError:
2772 Py_XDECREF(str);
2773 return NULL;
2774}
2775
Guido van Rossum9e896b32000-04-05 20:11:21 +00002776/* --- Decimal Encoder ---------------------------------------------------- */
2777
2778int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2779 int length,
2780 char *output,
2781 const char *errors)
2782{
2783 Py_UNICODE *p, *end;
2784
2785 if (output == NULL) {
2786 PyErr_BadArgument();
2787 return -1;
2788 }
2789
2790 p = s;
2791 end = s + length;
2792 while (p < end) {
2793 register Py_UNICODE ch = *p++;
2794 int decimal;
2795
2796 if (Py_UNICODE_ISSPACE(ch)) {
2797 *output++ = ' ';
2798 continue;
2799 }
2800 decimal = Py_UNICODE_TODECIMAL(ch);
2801 if (decimal >= 0) {
2802 *output++ = '0' + decimal;
2803 continue;
2804 }
Guido van Rossumba477042000-04-06 18:18:10 +00002805 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002806 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002807 continue;
2808 }
2809 /* All other characters are considered invalid */
2810 if (errors == NULL || strcmp(errors, "strict") == 0) {
2811 PyErr_SetString(PyExc_ValueError,
2812 "invalid decimal Unicode string");
2813 goto onError;
2814 }
2815 else if (strcmp(errors, "ignore") == 0)
2816 continue;
2817 else if (strcmp(errors, "replace") == 0) {
2818 *output++ = '?';
2819 continue;
2820 }
2821 }
2822 /* 0-terminate the output string */
2823 *output++ = '\0';
2824 return 0;
2825
2826 onError:
2827 return -1;
2828}
2829
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830/* --- Helpers ------------------------------------------------------------ */
2831
2832static
2833int count(PyUnicodeObject *self,
2834 int start,
2835 int end,
2836 PyUnicodeObject *substring)
2837{
2838 int count = 0;
2839
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002840 if (start < 0)
2841 start += self->length;
2842 if (start < 0)
2843 start = 0;
2844 if (end > self->length)
2845 end = self->length;
2846 if (end < 0)
2847 end += self->length;
2848 if (end < 0)
2849 end = 0;
2850
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002851 if (substring->length == 0)
2852 return (end - start + 1);
2853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 end -= substring->length;
2855
2856 while (start <= end)
2857 if (Py_UNICODE_MATCH(self, start, substring)) {
2858 count++;
2859 start += substring->length;
2860 } else
2861 start++;
2862
2863 return count;
2864}
2865
2866int PyUnicode_Count(PyObject *str,
2867 PyObject *substr,
2868 int start,
2869 int end)
2870{
2871 int result;
2872
2873 str = PyUnicode_FromObject(str);
2874 if (str == NULL)
2875 return -1;
2876 substr = PyUnicode_FromObject(substr);
2877 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002878 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 return -1;
2880 }
2881
2882 result = count((PyUnicodeObject *)str,
2883 start, end,
2884 (PyUnicodeObject *)substr);
2885
2886 Py_DECREF(str);
2887 Py_DECREF(substr);
2888 return result;
2889}
2890
2891static
2892int findstring(PyUnicodeObject *self,
2893 PyUnicodeObject *substring,
2894 int start,
2895 int end,
2896 int direction)
2897{
2898 if (start < 0)
2899 start += self->length;
2900 if (start < 0)
2901 start = 0;
2902
2903 if (substring->length == 0)
2904 return start;
2905
2906 if (end > self->length)
2907 end = self->length;
2908 if (end < 0)
2909 end += self->length;
2910 if (end < 0)
2911 end = 0;
2912
2913 end -= substring->length;
2914
2915 if (direction < 0) {
2916 for (; end >= start; end--)
2917 if (Py_UNICODE_MATCH(self, end, substring))
2918 return end;
2919 } else {
2920 for (; start <= end; start++)
2921 if (Py_UNICODE_MATCH(self, start, substring))
2922 return start;
2923 }
2924
2925 return -1;
2926}
2927
2928int PyUnicode_Find(PyObject *str,
2929 PyObject *substr,
2930 int start,
2931 int end,
2932 int direction)
2933{
2934 int result;
2935
2936 str = PyUnicode_FromObject(str);
2937 if (str == NULL)
2938 return -1;
2939 substr = PyUnicode_FromObject(substr);
2940 if (substr == NULL) {
2941 Py_DECREF(substr);
2942 return -1;
2943 }
2944
2945 result = findstring((PyUnicodeObject *)str,
2946 (PyUnicodeObject *)substr,
2947 start, end, direction);
2948 Py_DECREF(str);
2949 Py_DECREF(substr);
2950 return result;
2951}
2952
2953static
2954int tailmatch(PyUnicodeObject *self,
2955 PyUnicodeObject *substring,
2956 int start,
2957 int end,
2958 int direction)
2959{
2960 if (start < 0)
2961 start += self->length;
2962 if (start < 0)
2963 start = 0;
2964
2965 if (substring->length == 0)
2966 return 1;
2967
2968 if (end > self->length)
2969 end = self->length;
2970 if (end < 0)
2971 end += self->length;
2972 if (end < 0)
2973 end = 0;
2974
2975 end -= substring->length;
2976 if (end < start)
2977 return 0;
2978
2979 if (direction > 0) {
2980 if (Py_UNICODE_MATCH(self, end, substring))
2981 return 1;
2982 } else {
2983 if (Py_UNICODE_MATCH(self, start, substring))
2984 return 1;
2985 }
2986
2987 return 0;
2988}
2989
2990int PyUnicode_Tailmatch(PyObject *str,
2991 PyObject *substr,
2992 int start,
2993 int end,
2994 int direction)
2995{
2996 int result;
2997
2998 str = PyUnicode_FromObject(str);
2999 if (str == NULL)
3000 return -1;
3001 substr = PyUnicode_FromObject(substr);
3002 if (substr == NULL) {
3003 Py_DECREF(substr);
3004 return -1;
3005 }
3006
3007 result = tailmatch((PyUnicodeObject *)str,
3008 (PyUnicodeObject *)substr,
3009 start, end, direction);
3010 Py_DECREF(str);
3011 Py_DECREF(substr);
3012 return result;
3013}
3014
3015static
3016const Py_UNICODE *findchar(const Py_UNICODE *s,
3017 int size,
3018 Py_UNICODE ch)
3019{
3020 /* like wcschr, but doesn't stop at NULL characters */
3021
3022 while (size-- > 0) {
3023 if (*s == ch)
3024 return s;
3025 s++;
3026 }
3027
3028 return NULL;
3029}
3030
3031/* Apply fixfct filter to the Unicode object self and return a
3032 reference to the modified object */
3033
3034static
3035PyObject *fixup(PyUnicodeObject *self,
3036 int (*fixfct)(PyUnicodeObject *s))
3037{
3038
3039 PyUnicodeObject *u;
3040
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003041 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 if (u == NULL)
3043 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003044
3045 Py_UNICODE_COPY(u->str, self->str, self->length);
3046
Tim Peters7a29bd52001-09-12 03:03:31 +00003047 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 /* fixfct should return TRUE if it modified the buffer. If
3049 FALSE, return a reference to the original buffer instead
3050 (to save space, not time) */
3051 Py_INCREF(self);
3052 Py_DECREF(u);
3053 return (PyObject*) self;
3054 }
3055 return (PyObject*) u;
3056}
3057
3058static
3059int fixupper(PyUnicodeObject *self)
3060{
3061 int len = self->length;
3062 Py_UNICODE *s = self->str;
3063 int status = 0;
3064
3065 while (len-- > 0) {
3066 register Py_UNICODE ch;
3067
3068 ch = Py_UNICODE_TOUPPER(*s);
3069 if (ch != *s) {
3070 status = 1;
3071 *s = ch;
3072 }
3073 s++;
3074 }
3075
3076 return status;
3077}
3078
3079static
3080int fixlower(PyUnicodeObject *self)
3081{
3082 int len = self->length;
3083 Py_UNICODE *s = self->str;
3084 int status = 0;
3085
3086 while (len-- > 0) {
3087 register Py_UNICODE ch;
3088
3089 ch = Py_UNICODE_TOLOWER(*s);
3090 if (ch != *s) {
3091 status = 1;
3092 *s = ch;
3093 }
3094 s++;
3095 }
3096
3097 return status;
3098}
3099
3100static
3101int fixswapcase(PyUnicodeObject *self)
3102{
3103 int len = self->length;
3104 Py_UNICODE *s = self->str;
3105 int status = 0;
3106
3107 while (len-- > 0) {
3108 if (Py_UNICODE_ISUPPER(*s)) {
3109 *s = Py_UNICODE_TOLOWER(*s);
3110 status = 1;
3111 } else if (Py_UNICODE_ISLOWER(*s)) {
3112 *s = Py_UNICODE_TOUPPER(*s);
3113 status = 1;
3114 }
3115 s++;
3116 }
3117
3118 return status;
3119}
3120
3121static
3122int fixcapitalize(PyUnicodeObject *self)
3123{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003124 int len = self->length;
3125 Py_UNICODE *s = self->str;
3126 int status = 0;
3127
3128 if (len == 0)
3129 return 0;
3130 if (Py_UNICODE_ISLOWER(*s)) {
3131 *s = Py_UNICODE_TOUPPER(*s);
3132 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003134 s++;
3135 while (--len > 0) {
3136 if (Py_UNICODE_ISUPPER(*s)) {
3137 *s = Py_UNICODE_TOLOWER(*s);
3138 status = 1;
3139 }
3140 s++;
3141 }
3142 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143}
3144
3145static
3146int fixtitle(PyUnicodeObject *self)
3147{
3148 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3149 register Py_UNICODE *e;
3150 int previous_is_cased;
3151
3152 /* Shortcut for single character strings */
3153 if (PyUnicode_GET_SIZE(self) == 1) {
3154 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3155 if (*p != ch) {
3156 *p = ch;
3157 return 1;
3158 }
3159 else
3160 return 0;
3161 }
3162
3163 e = p + PyUnicode_GET_SIZE(self);
3164 previous_is_cased = 0;
3165 for (; p < e; p++) {
3166 register const Py_UNICODE ch = *p;
3167
3168 if (previous_is_cased)
3169 *p = Py_UNICODE_TOLOWER(ch);
3170 else
3171 *p = Py_UNICODE_TOTITLE(ch);
3172
3173 if (Py_UNICODE_ISLOWER(ch) ||
3174 Py_UNICODE_ISUPPER(ch) ||
3175 Py_UNICODE_ISTITLE(ch))
3176 previous_is_cased = 1;
3177 else
3178 previous_is_cased = 0;
3179 }
3180 return 1;
3181}
3182
3183PyObject *PyUnicode_Join(PyObject *separator,
3184 PyObject *seq)
3185{
3186 Py_UNICODE *sep;
3187 int seplen;
3188 PyUnicodeObject *res = NULL;
3189 int reslen = 0;
3190 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 int sz = 100;
3192 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003193 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194
Tim Peters2cfe3682001-05-05 05:36:48 +00003195 it = PyObject_GetIter(seq);
3196 if (it == NULL)
3197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198
3199 if (separator == NULL) {
3200 Py_UNICODE blank = ' ';
3201 sep = &blank;
3202 seplen = 1;
3203 }
3204 else {
3205 separator = PyUnicode_FromObject(separator);
3206 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003207 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 sep = PyUnicode_AS_UNICODE(separator);
3209 seplen = PyUnicode_GET_SIZE(separator);
3210 }
3211
3212 res = _PyUnicode_New(sz);
3213 if (res == NULL)
3214 goto onError;
3215 p = PyUnicode_AS_UNICODE(res);
3216 reslen = 0;
3217
Tim Peters2cfe3682001-05-05 05:36:48 +00003218 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003220 PyObject *item = PyIter_Next(it);
3221 if (item == NULL) {
3222 if (PyErr_Occurred())
3223 goto onError;
3224 break;
3225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 if (!PyUnicode_Check(item)) {
3227 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003228 if (!PyString_Check(item)) {
3229 PyErr_Format(PyExc_TypeError,
3230 "sequence item %i: expected string or Unicode,"
3231 " %.80s found",
3232 i, item->ob_type->tp_name);
3233 Py_DECREF(item);
3234 goto onError;
3235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 v = PyUnicode_FromObject(item);
3237 Py_DECREF(item);
3238 item = v;
3239 if (item == NULL)
3240 goto onError;
3241 }
3242 itemlen = PyUnicode_GET_SIZE(item);
3243 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003244 if (_PyUnicode_Resize(&res, sz*2)) {
3245 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 sz *= 2;
3249 p = PyUnicode_AS_UNICODE(res) + reslen;
3250 }
3251 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003252 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 p += seplen;
3254 reslen += seplen;
3255 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003256 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 p += itemlen;
3258 reslen += itemlen;
3259 Py_DECREF(item);
3260 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003261 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 goto onError;
3263
3264 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003265 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 return (PyObject *)res;
3267
3268 onError:
3269 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003270 Py_XDECREF(res);
3271 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 return NULL;
3273}
3274
3275static
3276PyUnicodeObject *pad(PyUnicodeObject *self,
3277 int left,
3278 int right,
3279 Py_UNICODE fill)
3280{
3281 PyUnicodeObject *u;
3282
3283 if (left < 0)
3284 left = 0;
3285 if (right < 0)
3286 right = 0;
3287
Tim Peters7a29bd52001-09-12 03:03:31 +00003288 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 Py_INCREF(self);
3290 return self;
3291 }
3292
3293 u = _PyUnicode_New(left + self->length + right);
3294 if (u) {
3295 if (left)
3296 Py_UNICODE_FILL(u->str, fill, left);
3297 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3298 if (right)
3299 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3300 }
3301
3302 return u;
3303}
3304
/* Append data[left:right] as a new Unicode object to `list`.

   Relies on names in the expanding function: a `PyObject *str`
   scratch variable, a `PyObject *list` and an `onError` label, which
   is jumped to if the object cannot be created or appended.  The
   temporary reference in `str` is always released.  Wrapped in
   do { } while (0) so that an invocation followed by `;` is a single
   statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3315
3316static
3317PyObject *split_whitespace(PyUnicodeObject *self,
3318 PyObject *list,
3319 int maxcount)
3320{
3321 register int i;
3322 register int j;
3323 int len = self->length;
3324 PyObject *str;
3325
3326 for (i = j = 0; i < len; ) {
3327 /* find a token */
3328 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3329 i++;
3330 j = i;
3331 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3332 i++;
3333 if (j < i) {
3334 if (maxcount-- <= 0)
3335 break;
3336 SPLIT_APPEND(self->str, j, i);
3337 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3338 i++;
3339 j = i;
3340 }
3341 }
3342 if (j < len) {
3343 SPLIT_APPEND(self->str, j, len);
3344 }
3345 return list;
3346
3347 onError:
3348 Py_DECREF(list);
3349 return NULL;
3350}
3351
3352PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003353 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354{
3355 register int i;
3356 register int j;
3357 int len;
3358 PyObject *list;
3359 PyObject *str;
3360 Py_UNICODE *data;
3361
3362 string = PyUnicode_FromObject(string);
3363 if (string == NULL)
3364 return NULL;
3365 data = PyUnicode_AS_UNICODE(string);
3366 len = PyUnicode_GET_SIZE(string);
3367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 list = PyList_New(0);
3369 if (!list)
3370 goto onError;
3371
3372 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003373 int eol;
3374
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 /* Find a line and append it */
3376 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3377 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378
3379 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003380 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 if (i < len) {
3382 if (data[i] == '\r' && i + 1 < len &&
3383 data[i+1] == '\n')
3384 i += 2;
3385 else
3386 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003387 if (keepends)
3388 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 }
Guido van Rossum86662912000-04-11 15:38:46 +00003390 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 j = i;
3392 }
3393 if (j < len) {
3394 SPLIT_APPEND(data, j, len);
3395 }
3396
3397 Py_DECREF(string);
3398 return list;
3399
3400 onError:
3401 Py_DECREF(list);
3402 Py_DECREF(string);
3403 return NULL;
3404}
3405
3406static
3407PyObject *split_char(PyUnicodeObject *self,
3408 PyObject *list,
3409 Py_UNICODE ch,
3410 int maxcount)
3411{
3412 register int i;
3413 register int j;
3414 int len = self->length;
3415 PyObject *str;
3416
3417 for (i = j = 0; i < len; ) {
3418 if (self->str[i] == ch) {
3419 if (maxcount-- <= 0)
3420 break;
3421 SPLIT_APPEND(self->str, j, i);
3422 i = j = i + 1;
3423 } else
3424 i++;
3425 }
3426 if (j <= len) {
3427 SPLIT_APPEND(self->str, j, len);
3428 }
3429 return list;
3430
3431 onError:
3432 Py_DECREF(list);
3433 return NULL;
3434}
3435
3436static
3437PyObject *split_substring(PyUnicodeObject *self,
3438 PyObject *list,
3439 PyUnicodeObject *substring,
3440 int maxcount)
3441{
3442 register int i;
3443 register int j;
3444 int len = self->length;
3445 int sublen = substring->length;
3446 PyObject *str;
3447
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003448 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 if (Py_UNICODE_MATCH(self, i, substring)) {
3450 if (maxcount-- <= 0)
3451 break;
3452 SPLIT_APPEND(self->str, j, i);
3453 i = j = i + sublen;
3454 } else
3455 i++;
3456 }
3457 if (j <= len) {
3458 SPLIT_APPEND(self->str, j, len);
3459 }
3460 return list;
3461
3462 onError:
3463 Py_DECREF(list);
3464 return NULL;
3465}
3466
3467#undef SPLIT_APPEND
3468
3469static
3470PyObject *split(PyUnicodeObject *self,
3471 PyUnicodeObject *substring,
3472 int maxcount)
3473{
3474 PyObject *list;
3475
3476 if (maxcount < 0)
3477 maxcount = INT_MAX;
3478
3479 list = PyList_New(0);
3480 if (!list)
3481 return NULL;
3482
3483 if (substring == NULL)
3484 return split_whitespace(self,list,maxcount);
3485
3486 else if (substring->length == 1)
3487 return split_char(self,list,substring->str[0],maxcount);
3488
3489 else if (substring->length == 0) {
3490 Py_DECREF(list);
3491 PyErr_SetString(PyExc_ValueError, "empty separator");
3492 return NULL;
3493 }
3494 else
3495 return split_substring(self,list,substring,maxcount);
3496}
3497
3498static
3499PyObject *strip(PyUnicodeObject *self,
3500 int left,
3501 int right)
3502{
3503 Py_UNICODE *p = self->str;
3504 int start = 0;
3505 int end = self->length;
3506
3507 if (left)
3508 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3509 start++;
3510
3511 if (right)
3512 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3513 end--;
3514
Tim Peters7a29bd52001-09-12 03:03:31 +00003515 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 /* couldn't strip anything off, return original string */
3517 Py_INCREF(self);
3518 return (PyObject*) self;
3519 }
3520
3521 return (PyObject*) PyUnicode_FromUnicode(
3522 self->str + start,
3523 end - start
3524 );
3525}
3526
3527static
3528PyObject *replace(PyUnicodeObject *self,
3529 PyUnicodeObject *str1,
3530 PyUnicodeObject *str2,
3531 int maxcount)
3532{
3533 PyUnicodeObject *u;
3534
3535 if (maxcount < 0)
3536 maxcount = INT_MAX;
3537
3538 if (str1->length == 1 && str2->length == 1) {
3539 int i;
3540
3541 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003542 if (!findchar(self->str, self->length, str1->str[0]) &&
3543 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 /* nothing to replace, return original string */
3545 Py_INCREF(self);
3546 u = self;
3547 } else {
3548 Py_UNICODE u1 = str1->str[0];
3549 Py_UNICODE u2 = str2->str[0];
3550
3551 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003552 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 self->length
3554 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003555 if (u != NULL) {
3556 Py_UNICODE_COPY(u->str, self->str,
3557 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 for (i = 0; i < u->length; i++)
3559 if (u->str[i] == u1) {
3560 if (--maxcount < 0)
3561 break;
3562 u->str[i] = u2;
3563 }
3564 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566
3567 } else {
3568 int n, i;
3569 Py_UNICODE *p;
3570
3571 /* replace strings */
3572 n = count(self, 0, self->length, str1);
3573 if (n > maxcount)
3574 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003575 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 /* nothing to replace, return original string */
3577 Py_INCREF(self);
3578 u = self;
3579 } else {
3580 u = _PyUnicode_New(
3581 self->length + n * (str2->length - str1->length));
3582 if (u) {
3583 i = 0;
3584 p = u->str;
3585 while (i <= self->length - str1->length)
3586 if (Py_UNICODE_MATCH(self, i, str1)) {
3587 /* replace string segment */
3588 Py_UNICODE_COPY(p, str2->str, str2->length);
3589 p += str2->length;
3590 i += str1->length;
3591 if (--n <= 0) {
3592 /* copy remaining part */
3593 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3594 break;
3595 }
3596 } else
3597 *p++ = self->str[i++];
3598 }
3599 }
3600 }
3601
3602 return (PyObject *) u;
3603}
3604
3605/* --- Unicode Object Methods --------------------------------------------- */
3606
3607static char title__doc__[] =
3608"S.title() -> unicode\n\
3609\n\
3610Return a titlecased version of S, i.e. words start with title case\n\
3611characters, all remaining cased characters have lower case.";
3612
3613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003614unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 return fixup(self, fixtitle);
3617}
3618
3619static char capitalize__doc__[] =
3620"S.capitalize() -> unicode\n\
3621\n\
3622Return a capitalized version of S, i.e. make the first character\n\
3623have upper case.";
3624
3625static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003626unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 return fixup(self, fixcapitalize);
3629}
3630
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled code: split `self` on whitespace, capitalize every word via
   fixup()/fixcapitalize and join the words again with the default
   separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3668
3669static char center__doc__[] =
3670"S.center(width) -> unicode\n\
3671\n\
3672Return S centered in a Unicode string of length width. Padding is done\n\
3673using spaces.";
3674
3675static PyObject *
3676unicode_center(PyUnicodeObject *self, PyObject *args)
3677{
3678 int marg, left;
3679 int width;
3680
3681 if (!PyArg_ParseTuple(args, "i:center", &width))
3682 return NULL;
3683
Tim Peters7a29bd52001-09-12 03:03:31 +00003684 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 Py_INCREF(self);
3686 return (PyObject*) self;
3687 }
3688
3689 marg = width - self->length;
3690 left = marg / 2 + (marg & width & 1);
3691
3692 return (PyObject*) pad(self, left, marg - left, ' ');
3693}
3694
Marc-André Lemburge5034372000-08-08 08:04:29 +00003695#if 0
3696
3697/* This code should go into some future Unicode collation support
3698 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003699 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003700
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003701/* speedy UTF-16 code point order comparison */
3702/* gleaned from: */
3703/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3704
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003705static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003706{
3707 0, 0, 0, 0, 0, 0, 0, 0,
3708 0, 0, 0, 0, 0, 0, 0, 0,
3709 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003710 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003711};
3712
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713static int
3714unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3715{
3716 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003717
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 Py_UNICODE *s1 = str1->str;
3719 Py_UNICODE *s2 = str2->str;
3720
3721 len1 = str1->length;
3722 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003725 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003726
3727 c1 = *s1++;
3728 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003729
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003730 if (c1 > (1<<11) * 26)
3731 c1 += utf16Fixup[c1>>11];
3732 if (c2 > (1<<11) * 26)
3733 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003734 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003735
3736 if (c1 != c2)
3737 return (c1 < c2) ? -1 : 1;
3738
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003739 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 }
3741
3742 return (len1 < len2) ? -1 : (len1 != len2);
3743}
3744
Marc-André Lemburge5034372000-08-08 08:04:29 +00003745#else
3746
3747static int
3748unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3749{
3750 register int len1, len2;
3751
3752 Py_UNICODE *s1 = str1->str;
3753 Py_UNICODE *s2 = str2->str;
3754
3755 len1 = str1->length;
3756 len2 = str2->length;
3757
3758 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003759 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003760
Fredrik Lundh45714e92001-06-26 16:39:36 +00003761 c1 = *s1++;
3762 c2 = *s2++;
3763
3764 if (c1 != c2)
3765 return (c1 < c2) ? -1 : 1;
3766
Marc-André Lemburge5034372000-08-08 08:04:29 +00003767 len1--; len2--;
3768 }
3769
3770 return (len1 < len2) ? -1 : (len1 != len2);
3771}
3772
3773#endif
3774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775int PyUnicode_Compare(PyObject *left,
3776 PyObject *right)
3777{
3778 PyUnicodeObject *u = NULL, *v = NULL;
3779 int result;
3780
3781 /* Coerce the two arguments */
3782 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3783 if (u == NULL)
3784 goto onError;
3785 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3786 if (v == NULL)
3787 goto onError;
3788
Thomas Wouters7e474022000-07-16 12:04:32 +00003789 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 if (v == u) {
3791 Py_DECREF(u);
3792 Py_DECREF(v);
3793 return 0;
3794 }
3795
3796 result = unicode_compare(u, v);
3797
3798 Py_DECREF(u);
3799 Py_DECREF(v);
3800 return result;
3801
3802onError:
3803 Py_XDECREF(u);
3804 Py_XDECREF(v);
3805 return -1;
3806}
3807
Guido van Rossum403d68b2000-03-13 15:55:09 +00003808int PyUnicode_Contains(PyObject *container,
3809 PyObject *element)
3810{
3811 PyUnicodeObject *u = NULL, *v = NULL;
3812 int result;
3813 register const Py_UNICODE *p, *e;
3814 register Py_UNICODE ch;
3815
3816 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003817 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003818 if (v == NULL) {
3819 PyErr_SetString(PyExc_TypeError,
3820 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003821 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003822 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003823 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3824 if (u == NULL) {
3825 Py_DECREF(v);
3826 goto onError;
3827 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003828
3829 /* Check v in u */
3830 if (PyUnicode_GET_SIZE(v) != 1) {
3831 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003832 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003833 goto onError;
3834 }
3835 ch = *PyUnicode_AS_UNICODE(v);
3836 p = PyUnicode_AS_UNICODE(u);
3837 e = p + PyUnicode_GET_SIZE(u);
3838 result = 0;
3839 while (p < e) {
3840 if (*p++ == ch) {
3841 result = 1;
3842 break;
3843 }
3844 }
3845
3846 Py_DECREF(u);
3847 Py_DECREF(v);
3848 return result;
3849
3850onError:
3851 Py_XDECREF(u);
3852 Py_XDECREF(v);
3853 return -1;
3854}
3855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856/* Concat to string or Unicode object giving a new Unicode object. */
3857
3858PyObject *PyUnicode_Concat(PyObject *left,
3859 PyObject *right)
3860{
3861 PyUnicodeObject *u = NULL, *v = NULL, *w;
3862
3863 /* Coerce the two arguments */
3864 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3865 if (u == NULL)
3866 goto onError;
3867 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3868 if (v == NULL)
3869 goto onError;
3870
3871 /* Shortcuts */
3872 if (v == unicode_empty) {
3873 Py_DECREF(v);
3874 return (PyObject *)u;
3875 }
3876 if (u == unicode_empty) {
3877 Py_DECREF(u);
3878 return (PyObject *)v;
3879 }
3880
3881 /* Concat the two Unicode strings */
3882 w = _PyUnicode_New(u->length + v->length);
3883 if (w == NULL)
3884 goto onError;
3885 Py_UNICODE_COPY(w->str, u->str, u->length);
3886 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3887
3888 Py_DECREF(u);
3889 Py_DECREF(v);
3890 return (PyObject *)w;
3891
3892onError:
3893 Py_XDECREF(u);
3894 Py_XDECREF(v);
3895 return NULL;
3896}
3897
3898static char count__doc__[] =
3899"S.count(sub[, start[, end]]) -> int\n\
3900\n\
3901Return the number of occurrences of substring sub in Unicode string\n\
3902S[start:end]. Optional arguments start and end are\n\
3903interpreted as in slice notation.";
3904
3905static PyObject *
3906unicode_count(PyUnicodeObject *self, PyObject *args)
3907{
3908 PyUnicodeObject *substring;
3909 int start = 0;
3910 int end = INT_MAX;
3911 PyObject *result;
3912
Guido van Rossumb8872e62000-05-09 14:14:27 +00003913 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3914 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 return NULL;
3916
3917 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3918 (PyObject *)substring);
3919 if (substring == NULL)
3920 return NULL;
3921
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 if (start < 0)
3923 start += self->length;
3924 if (start < 0)
3925 start = 0;
3926 if (end > self->length)
3927 end = self->length;
3928 if (end < 0)
3929 end += self->length;
3930 if (end < 0)
3931 end = 0;
3932
3933 result = PyInt_FromLong((long) count(self, start, end, substring));
3934
3935 Py_DECREF(substring);
3936 return result;
3937}
3938
3939static char encode__doc__[] =
3940"S.encode([encoding[,errors]]) -> string\n\
3941\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003942Return an encoded string version of S. Default encoding is the current\n\
3943default string encoding. errors may be given to set a different error\n\
3944handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3945a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946
3947static PyObject *
3948unicode_encode(PyUnicodeObject *self, PyObject *args)
3949{
3950 char *encoding = NULL;
3951 char *errors = NULL;
3952 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3953 return NULL;
3954 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3955}
3956
3957static char expandtabs__doc__[] =
3958"S.expandtabs([tabsize]) -> unicode\n\
3959\n\
3960Return a copy of S where all tab characters are expanded using spaces.\n\
3961If tabsize is not given, a tab size of 8 characters is assumed.";
3962
3963static PyObject*
3964unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3965{
3966 Py_UNICODE *e;
3967 Py_UNICODE *p;
3968 Py_UNICODE *q;
3969 int i, j;
3970 PyUnicodeObject *u;
3971 int tabsize = 8;
3972
3973 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3974 return NULL;
3975
Thomas Wouters7e474022000-07-16 12:04:32 +00003976 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 i = j = 0;
3978 e = self->str + self->length;
3979 for (p = self->str; p < e; p++)
3980 if (*p == '\t') {
3981 if (tabsize > 0)
3982 j += tabsize - (j % tabsize);
3983 }
3984 else {
3985 j++;
3986 if (*p == '\n' || *p == '\r') {
3987 i += j;
3988 j = 0;
3989 }
3990 }
3991
3992 /* Second pass: create output string and fill it */
3993 u = _PyUnicode_New(i + j);
3994 if (!u)
3995 return NULL;
3996
3997 j = 0;
3998 q = u->str;
3999
4000 for (p = self->str; p < e; p++)
4001 if (*p == '\t') {
4002 if (tabsize > 0) {
4003 i = tabsize - (j % tabsize);
4004 j += i;
4005 while (i--)
4006 *q++ = ' ';
4007 }
4008 }
4009 else {
4010 j++;
4011 *q++ = *p;
4012 if (*p == '\n' || *p == '\r')
4013 j = 0;
4014 }
4015
4016 return (PyObject*) u;
4017}
4018
4019static char find__doc__[] =
4020"S.find(sub [,start [,end]]) -> int\n\
4021\n\
4022Return the lowest index in S where substring sub is found,\n\
4023such that sub is contained within s[start,end]. Optional\n\
4024arguments start and end are interpreted as in slice notation.\n\
4025\n\
4026Return -1 on failure.";
4027
4028static PyObject *
4029unicode_find(PyUnicodeObject *self, PyObject *args)
4030{
4031 PyUnicodeObject *substring;
4032 int start = 0;
4033 int end = INT_MAX;
4034 PyObject *result;
4035
Guido van Rossumb8872e62000-05-09 14:14:27 +00004036 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4037 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return NULL;
4039 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4040 (PyObject *)substring);
4041 if (substring == NULL)
4042 return NULL;
4043
4044 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4045
4046 Py_DECREF(substring);
4047 return result;
4048}
4049
4050static PyObject *
4051unicode_getitem(PyUnicodeObject *self, int index)
4052{
4053 if (index < 0 || index >= self->length) {
4054 PyErr_SetString(PyExc_IndexError, "string index out of range");
4055 return NULL;
4056 }
4057
4058 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4059}
4060
4061static long
4062unicode_hash(PyUnicodeObject *self)
4063{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004064 /* Since Unicode objects compare equal to their ASCII string
4065 counterparts, they should use the individual character values
4066 as basis for their hash value. This is needed to assure that
4067 strings and Unicode objects behave in the same way as
4068 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069
Fredrik Lundhdde61642000-07-10 18:27:47 +00004070 register int len;
4071 register Py_UNICODE *p;
4072 register long x;
4073
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 if (self->hash != -1)
4075 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004076 len = PyUnicode_GET_SIZE(self);
4077 p = PyUnicode_AS_UNICODE(self);
4078 x = *p << 7;
4079 while (--len >= 0)
4080 x = (1000003*x) ^ *p++;
4081 x ^= PyUnicode_GET_SIZE(self);
4082 if (x == -1)
4083 x = -2;
4084 self->hash = x;
4085 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086}
4087
4088static char index__doc__[] =
4089"S.index(sub [,start [,end]]) -> int\n\
4090\n\
4091Like S.find() but raise ValueError when the substring is not found.";
4092
4093static PyObject *
4094unicode_index(PyUnicodeObject *self, PyObject *args)
4095{
4096 int result;
4097 PyUnicodeObject *substring;
4098 int start = 0;
4099 int end = INT_MAX;
4100
Guido van Rossumb8872e62000-05-09 14:14:27 +00004101 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4102 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 return NULL;
4104
4105 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4106 (PyObject *)substring);
4107 if (substring == NULL)
4108 return NULL;
4109
4110 result = findstring(self, substring, start, end, 1);
4111
4112 Py_DECREF(substring);
4113 if (result < 0) {
4114 PyErr_SetString(PyExc_ValueError, "substring not found");
4115 return NULL;
4116 }
4117 return PyInt_FromLong(result);
4118}
4119
4120static char islower__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004121"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004123Return True if all cased characters in S are lowercase and there is\n\
4124at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
4126static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004127unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128{
4129 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4130 register const Py_UNICODE *e;
4131 int cased;
4132
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 /* Shortcut for single character strings */
4134 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004135 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004137 /* Special case for empty strings */
4138 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004140
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 e = p + PyUnicode_GET_SIZE(self);
4142 cased = 0;
4143 for (; p < e; p++) {
4144 register const Py_UNICODE ch = *p;
4145
4146 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 else if (!cased && Py_UNICODE_ISLOWER(ch))
4149 cased = 1;
4150 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004151 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152}
4153
4154static char isupper__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004155"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004157Return True if all cased characters in S are uppercase and there is\n\
4158at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159
4160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004161unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162{
4163 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4164 register const Py_UNICODE *e;
4165 int cased;
4166
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 /* Shortcut for single character strings */
4168 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004169 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004171 /* Special case for empty strings */
4172 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004173 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004174
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 e = p + PyUnicode_GET_SIZE(self);
4176 cased = 0;
4177 for (; p < e; p++) {
4178 register const Py_UNICODE ch = *p;
4179
4180 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004181 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 else if (!cased && Py_UNICODE_ISUPPER(ch))
4183 cased = 1;
4184 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004185 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186}
4187
4188static char istitle__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004189"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004191Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4192characters may only follow uncased characters and lowercase characters\n\
4193only cased ones. Return False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194
4195static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004196unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197{
4198 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4199 register const Py_UNICODE *e;
4200 int cased, previous_is_cased;
4201
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 /* Shortcut for single character strings */
4203 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004204 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4205 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004207 /* Special case for empty strings */
4208 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004209 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004210
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 e = p + PyUnicode_GET_SIZE(self);
4212 cased = 0;
4213 previous_is_cased = 0;
4214 for (; p < e; p++) {
4215 register const Py_UNICODE ch = *p;
4216
4217 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4218 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004219 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220 previous_is_cased = 1;
4221 cased = 1;
4222 }
4223 else if (Py_UNICODE_ISLOWER(ch)) {
4224 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004225 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 previous_is_cased = 1;
4227 cased = 1;
4228 }
4229 else
4230 previous_is_cased = 0;
4231 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004232 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233}
4234
4235static char isspace__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004236"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004238Return True if there are only whitespace characters in S,\n\
4239False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240
4241static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004242unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243{
4244 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4245 register const Py_UNICODE *e;
4246
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 /* Shortcut for single character strings */
4248 if (PyUnicode_GET_SIZE(self) == 1 &&
4249 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004250 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004252 /* Special case for empty strings */
4253 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004254 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004255
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 e = p + PyUnicode_GET_SIZE(self);
4257 for (; p < e; p++) {
4258 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004259 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004261 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262}
4263
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004264static char isalpha__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004265"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004266\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004267Return True if all characters in S are alphabetic\n\
4268and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004269
4270static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004271unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004272{
4273 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4274 register const Py_UNICODE *e;
4275
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004276 /* Shortcut for single character strings */
4277 if (PyUnicode_GET_SIZE(self) == 1 &&
4278 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004279 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004280
4281 /* Special case for empty strings */
4282 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004283 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004284
4285 e = p + PyUnicode_GET_SIZE(self);
4286 for (; p < e; p++) {
4287 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004288 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004289 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004290 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004291}
4292
4293static char isalnum__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004294"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004295\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004296Return True if all characters in S are alphanumeric\n\
4297and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004298
4299static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004300unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004301{
4302 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4303 register const Py_UNICODE *e;
4304
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004305 /* Shortcut for single character strings */
4306 if (PyUnicode_GET_SIZE(self) == 1 &&
4307 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004308 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004309
4310 /* Special case for empty strings */
4311 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004312 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004313
4314 e = p + PyUnicode_GET_SIZE(self);
4315 for (; p < e; p++) {
4316 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004317 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004318 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004319 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004320}
4321
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322static char isdecimal__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004323"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004325Return True if there are only decimal characters in S,\n\
4326False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327
4328static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004329unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330{
4331 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4332 register const Py_UNICODE *e;
4333
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 /* Shortcut for single character strings */
4335 if (PyUnicode_GET_SIZE(self) == 1 &&
4336 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004337 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004339 /* Special case for empty strings */
4340 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004341 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004342
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 e = p + PyUnicode_GET_SIZE(self);
4344 for (; p < e; p++) {
4345 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004346 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004348 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349}
4350
4351static char isdigit__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004352"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004354Return True if there are only digit characters in S,\n\
4355False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356
4357static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004358unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359{
4360 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4361 register const Py_UNICODE *e;
4362
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 /* Shortcut for single character strings */
4364 if (PyUnicode_GET_SIZE(self) == 1 &&
4365 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004366 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004368 /* Special case for empty strings */
4369 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004370 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004371
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 e = p + PyUnicode_GET_SIZE(self);
4373 for (; p < e; p++) {
4374 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004375 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004377 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378}
4379
4380static char isnumeric__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004381"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004383Return True if there are only numeric characters in S,\n\
4384False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385
4386static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004387unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388{
4389 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4390 register const Py_UNICODE *e;
4391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 /* Shortcut for single character strings */
4393 if (PyUnicode_GET_SIZE(self) == 1 &&
4394 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004395 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004397 /* Special case for empty strings */
4398 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004399 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004400
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401 e = p + PyUnicode_GET_SIZE(self);
4402 for (; p < e; p++) {
4403 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004404 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004406 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407}
4408
4409static char join__doc__[] =
4410"S.join(sequence) -> unicode\n\
4411\n\
4412Return a string which is the concatenation of the strings in the\n\
4413sequence. The separator between elements is S.";
4414
4415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004416unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004418 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419}
4420
4421static int
4422unicode_length(PyUnicodeObject *self)
4423{
4424 return self->length;
4425}
4426
4427static char ljust__doc__[] =
4428"S.ljust(width) -> unicode\n\
4429\n\
4430Return S left justified in a Unicode string of length width. Padding is\n\
4431done using spaces.";
4432
4433static PyObject *
4434unicode_ljust(PyUnicodeObject *self, PyObject *args)
4435{
4436 int width;
4437 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4438 return NULL;
4439
Tim Peters7a29bd52001-09-12 03:03:31 +00004440 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 Py_INCREF(self);
4442 return (PyObject*) self;
4443 }
4444
4445 return (PyObject*) pad(self, 0, width - self->length, ' ');
4446}
4447
4448static char lower__doc__[] =
4449"S.lower() -> unicode\n\
4450\n\
4451Return a copy of the string S converted to lowercase.";
4452
4453static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004454unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 return fixup(self, fixlower);
4457}
4458
4459static char lstrip__doc__[] =
4460"S.lstrip() -> unicode\n\
4461\n\
4462Return a copy of the string S with leading whitespace removed.";
4463
4464static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004465unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 return strip(self, 1, 0);
4468}
4469
4470static PyObject*
4471unicode_repeat(PyUnicodeObject *str, int len)
4472{
4473 PyUnicodeObject *u;
4474 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004475 int nchars;
4476 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477
4478 if (len < 0)
4479 len = 0;
4480
Tim Peters7a29bd52001-09-12 03:03:31 +00004481 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 /* no repeat, return original string */
4483 Py_INCREF(str);
4484 return (PyObject*) str;
4485 }
Tim Peters8f422462000-09-09 06:13:41 +00004486
4487 /* ensure # of chars needed doesn't overflow int and # of bytes
4488 * needed doesn't overflow size_t
4489 */
4490 nchars = len * str->length;
4491 if (len && nchars / len != str->length) {
4492 PyErr_SetString(PyExc_OverflowError,
4493 "repeated string is too long");
4494 return NULL;
4495 }
4496 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4497 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4498 PyErr_SetString(PyExc_OverflowError,
4499 "repeated string is too long");
4500 return NULL;
4501 }
4502 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 if (!u)
4504 return NULL;
4505
4506 p = u->str;
4507
4508 while (len-- > 0) {
4509 Py_UNICODE_COPY(p, str->str, str->length);
4510 p += str->length;
4511 }
4512
4513 return (PyObject*) u;
4514}
4515
4516PyObject *PyUnicode_Replace(PyObject *obj,
4517 PyObject *subobj,
4518 PyObject *replobj,
4519 int maxcount)
4520{
4521 PyObject *self;
4522 PyObject *str1;
4523 PyObject *str2;
4524 PyObject *result;
4525
4526 self = PyUnicode_FromObject(obj);
4527 if (self == NULL)
4528 return NULL;
4529 str1 = PyUnicode_FromObject(subobj);
4530 if (str1 == NULL) {
4531 Py_DECREF(self);
4532 return NULL;
4533 }
4534 str2 = PyUnicode_FromObject(replobj);
4535 if (str2 == NULL) {
4536 Py_DECREF(self);
4537 Py_DECREF(str1);
4538 return NULL;
4539 }
4540 result = replace((PyUnicodeObject *)self,
4541 (PyUnicodeObject *)str1,
4542 (PyUnicodeObject *)str2,
4543 maxcount);
4544 Py_DECREF(self);
4545 Py_DECREF(str1);
4546 Py_DECREF(str2);
4547 return result;
4548}
4549
4550static char replace__doc__[] =
4551"S.replace (old, new[, maxsplit]) -> unicode\n\
4552\n\
4553Return a copy of S with all occurrences of substring\n\
4554old replaced by new. If the optional argument maxsplit is\n\
4555given, only the first maxsplit occurrences are replaced.";
4556
4557static PyObject*
4558unicode_replace(PyUnicodeObject *self, PyObject *args)
4559{
4560 PyUnicodeObject *str1;
4561 PyUnicodeObject *str2;
4562 int maxcount = -1;
4563 PyObject *result;
4564
4565 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4566 return NULL;
4567 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4568 if (str1 == NULL)
4569 return NULL;
4570 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4571 if (str2 == NULL)
4572 return NULL;
4573
4574 result = replace(self, str1, str2, maxcount);
4575
4576 Py_DECREF(str1);
4577 Py_DECREF(str2);
4578 return result;
4579}
4580
4581static
4582PyObject *unicode_repr(PyObject *unicode)
4583{
4584 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4585 PyUnicode_GET_SIZE(unicode),
4586 1);
4587}
4588
4589static char rfind__doc__[] =
4590"S.rfind(sub [,start [,end]]) -> int\n\
4591\n\
4592Return the highest index in S where substring sub is found,\n\
4593such that sub is contained within s[start,end]. Optional\n\
4594arguments start and end are interpreted as in slice notation.\n\
4595\n\
4596Return -1 on failure.";
4597
4598static PyObject *
4599unicode_rfind(PyUnicodeObject *self, PyObject *args)
4600{
4601 PyUnicodeObject *substring;
4602 int start = 0;
4603 int end = INT_MAX;
4604 PyObject *result;
4605
Guido van Rossumb8872e62000-05-09 14:14:27 +00004606 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4607 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608 return NULL;
4609 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4610 (PyObject *)substring);
4611 if (substring == NULL)
4612 return NULL;
4613
4614 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4615
4616 Py_DECREF(substring);
4617 return result;
4618}
4619
4620static char rindex__doc__[] =
4621"S.rindex(sub [,start [,end]]) -> int\n\
4622\n\
4623Like S.rfind() but raise ValueError when the substring is not found.";
4624
4625static PyObject *
4626unicode_rindex(PyUnicodeObject *self, PyObject *args)
4627{
4628 int result;
4629 PyUnicodeObject *substring;
4630 int start = 0;
4631 int end = INT_MAX;
4632
Guido van Rossumb8872e62000-05-09 14:14:27 +00004633 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4634 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 return NULL;
4636 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4637 (PyObject *)substring);
4638 if (substring == NULL)
4639 return NULL;
4640
4641 result = findstring(self, substring, start, end, -1);
4642
4643 Py_DECREF(substring);
4644 if (result < 0) {
4645 PyErr_SetString(PyExc_ValueError, "substring not found");
4646 return NULL;
4647 }
4648 return PyInt_FromLong(result);
4649}
4650
4651static char rjust__doc__[] =
4652"S.rjust(width) -> unicode\n\
4653\n\
4654Return S right justified in a Unicode string of length width. Padding is\n\
4655done using spaces.";
4656
4657static PyObject *
4658unicode_rjust(PyUnicodeObject *self, PyObject *args)
4659{
4660 int width;
4661 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4662 return NULL;
4663
Tim Peters7a29bd52001-09-12 03:03:31 +00004664 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 Py_INCREF(self);
4666 return (PyObject*) self;
4667 }
4668
4669 return (PyObject*) pad(self, width - self->length, 0, ' ');
4670}
4671
4672static char rstrip__doc__[] =
4673"S.rstrip() -> unicode\n\
4674\n\
4675Return a copy of the string S with trailing whitespace removed.";
4676
4677static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004678unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 return strip(self, 0, 1);
4681}
4682
4683static PyObject*
4684unicode_slice(PyUnicodeObject *self, int start, int end)
4685{
4686 /* standard clamping */
4687 if (start < 0)
4688 start = 0;
4689 if (end < 0)
4690 end = 0;
4691 if (end > self->length)
4692 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004693 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 /* full slice, return original string */
4695 Py_INCREF(self);
4696 return (PyObject*) self;
4697 }
4698 if (start > end)
4699 start = end;
4700 /* copy slice */
4701 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4702 end - start);
4703}
4704
4705PyObject *PyUnicode_Split(PyObject *s,
4706 PyObject *sep,
4707 int maxsplit)
4708{
4709 PyObject *result;
4710
4711 s = PyUnicode_FromObject(s);
4712 if (s == NULL)
4713 return NULL;
4714 if (sep != NULL) {
4715 sep = PyUnicode_FromObject(sep);
4716 if (sep == NULL) {
4717 Py_DECREF(s);
4718 return NULL;
4719 }
4720 }
4721
4722 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4723
4724 Py_DECREF(s);
4725 Py_XDECREF(sep);
4726 return result;
4727}
4728
4729static char split__doc__[] =
4730"S.split([sep [,maxsplit]]) -> list of strings\n\
4731\n\
4732Return a list of the words in S, using sep as the\n\
4733delimiter string. If maxsplit is given, at most maxsplit\n\
4734splits are done. If sep is not specified, any whitespace string\n\
4735is a separator.";
4736
4737static PyObject*
4738unicode_split(PyUnicodeObject *self, PyObject *args)
4739{
4740 PyObject *substring = Py_None;
4741 int maxcount = -1;
4742
4743 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4744 return NULL;
4745
4746 if (substring == Py_None)
4747 return split(self, NULL, maxcount);
4748 else if (PyUnicode_Check(substring))
4749 return split(self, (PyUnicodeObject *)substring, maxcount);
4750 else
4751 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4752}
4753
4754static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004755"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756\n\
4757Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004758Line breaks are not included in the resulting list unless keepends\n\
4759is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
4761static PyObject*
4762unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4763{
Guido van Rossum86662912000-04-11 15:38:46 +00004764 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765
Guido van Rossum86662912000-04-11 15:38:46 +00004766 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 return NULL;
4768
Guido van Rossum86662912000-04-11 15:38:46 +00004769 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770}
4771
4772static
4773PyObject *unicode_str(PyUnicodeObject *self)
4774{
Fred Drakee4315f52000-05-09 19:53:39 +00004775 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776}
4777
4778static char strip__doc__[] =
4779"S.strip() -> unicode\n\
4780\n\
4781Return a copy of S with leading and trailing whitespace removed.";
4782
4783static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004784unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 return strip(self, 1, 1);
4787}
4788
4789static char swapcase__doc__[] =
4790"S.swapcase() -> unicode\n\
4791\n\
4792Return a copy of S with uppercase characters converted to lowercase\n\
4793and vice versa.";
4794
4795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004796unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 return fixup(self, fixswapcase);
4799}
4800
4801static char translate__doc__[] =
4802"S.translate(table) -> unicode\n\
4803\n\
4804Return a copy of the string S, where all characters have been mapped\n\
4805through the given translation table, which must be a mapping of\n\
4806Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4807are left untouched. Characters mapped to None are deleted.";
4808
4809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004810unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 return PyUnicode_TranslateCharmap(self->str,
4813 self->length,
4814 table,
4815 "ignore");
4816}
4817
4818static char upper__doc__[] =
4819"S.upper() -> unicode\n\
4820\n\
4821Return a copy of S converted to uppercase.";
4822
4823static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004824unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 return fixup(self, fixupper);
4827}
4828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829static char zfill__doc__[] =
4830"S.zfill(width) -> unicode\n\
4831\n\
4832Pad a numeric string x with zeros on the left, to fill a field\n\
4833of the specified width. The string x is never truncated.";
4834
4835static PyObject *
4836unicode_zfill(PyUnicodeObject *self, PyObject *args)
4837{
4838 int fill;
4839 PyUnicodeObject *u;
4840
4841 int width;
4842 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4843 return NULL;
4844
4845 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004846 if (PyUnicode_CheckExact(self)) {
4847 Py_INCREF(self);
4848 return (PyObject*) self;
4849 }
4850 else
4851 return PyUnicode_FromUnicode(
4852 PyUnicode_AS_UNICODE(self),
4853 PyUnicode_GET_SIZE(self)
4854 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 }
4856
4857 fill = width - self->length;
4858
4859 u = pad(self, fill, 0, '0');
4860
Walter Dörwald068325e2002-04-15 13:36:47 +00004861 if (u == NULL)
4862 return NULL;
4863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 if (u->str[fill] == '+' || u->str[fill] == '-') {
4865 /* move sign to beginning of string */
4866 u->str[0] = u->str[fill];
4867 u->str[fill] = '0';
4868 }
4869
4870 return (PyObject*) u;
4871}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
   int; its method-table entry is likewise under #if 0 and is marked
   there as a debugging aid. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4880
4881static char startswith__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004882"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004884Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885optional start, test S beginning at that position. With optional end, stop\n\
4886comparing S at that position.";
4887
4888static PyObject *
4889unicode_startswith(PyUnicodeObject *self,
4890 PyObject *args)
4891{
4892 PyUnicodeObject *substring;
4893 int start = 0;
4894 int end = INT_MAX;
4895 PyObject *result;
4896
Guido van Rossumb8872e62000-05-09 14:14:27 +00004897 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4898 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 return NULL;
4900 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4901 (PyObject *)substring);
4902 if (substring == NULL)
4903 return NULL;
4904
Guido van Rossum77f6a652002-04-03 22:41:51 +00004905 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906
4907 Py_DECREF(substring);
4908 return result;
4909}
4910
4911
4912static char endswith__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004913"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004915Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916optional start, test S beginning at that position. With optional end, stop\n\
4917comparing S at that position.";
4918
4919static PyObject *
4920unicode_endswith(PyUnicodeObject *self,
4921 PyObject *args)
4922{
4923 PyUnicodeObject *substring;
4924 int start = 0;
4925 int end = INT_MAX;
4926 PyObject *result;
4927
Guido van Rossumb8872e62000-05-09 14:14:27 +00004928 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4929 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 return NULL;
4931 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4932 (PyObject *)substring);
4933 if (substring == NULL)
4934 return NULL;
4935
Guido van Rossum77f6a652002-04-03 22:41:51 +00004936 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937
4938 Py_DECREF(substring);
4939 return result;
4940}
4941
4942
4943static PyMethodDef unicode_methods[] = {
4944
4945 /* Order is according to common usage: often used methods should
4946 appear first, since lookup is done sequentially. */
4947
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004948 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4949 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4950 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4951 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4952 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4953 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4954 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4955 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4956 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4957 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4958 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4959 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4960 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4961 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4962/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4963 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4964 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4965 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4966 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4967 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4968 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4969 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4970 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4971 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4972 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4973 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4974 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4975 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4976 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4977 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4978 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4979 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4980 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4981 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4982 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004983 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00004984#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004985 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986#endif
4987
4988#if 0
4989 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004990 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991#endif
4992
4993 {NULL, NULL}
4994};
4995
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996static PySequenceMethods unicode_as_sequence = {
4997 (inquiry) unicode_length, /* sq_length */
4998 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4999 (intargfunc) unicode_repeat, /* sq_repeat */
5000 (intargfunc) unicode_getitem, /* sq_item */
5001 (intintargfunc) unicode_slice, /* sq_slice */
5002 0, /* sq_ass_item */
5003 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005004 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005};
5006
5007static int
5008unicode_buffer_getreadbuf(PyUnicodeObject *self,
5009 int index,
5010 const void **ptr)
5011{
5012 if (index != 0) {
5013 PyErr_SetString(PyExc_SystemError,
5014 "accessing non-existent unicode segment");
5015 return -1;
5016 }
5017 *ptr = (void *) self->str;
5018 return PyUnicode_GET_DATA_SIZE(self);
5019}
5020
5021static int
5022unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5023 const void **ptr)
5024{
5025 PyErr_SetString(PyExc_TypeError,
5026 "cannot use unicode as modifyable buffer");
5027 return -1;
5028}
5029
5030static int
5031unicode_buffer_getsegcount(PyUnicodeObject *self,
5032 int *lenp)
5033{
5034 if (lenp)
5035 *lenp = PyUnicode_GET_DATA_SIZE(self);
5036 return 1;
5037}
5038
5039static int
5040unicode_buffer_getcharbuf(PyUnicodeObject *self,
5041 int index,
5042 const void **ptr)
5043{
5044 PyObject *str;
5045
5046 if (index != 0) {
5047 PyErr_SetString(PyExc_SystemError,
5048 "accessing non-existent unicode segment");
5049 return -1;
5050 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005051 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 if (str == NULL)
5053 return -1;
5054 *ptr = (void *) PyString_AS_STRING(str);
5055 return PyString_GET_SIZE(str);
5056}
5057
5058/* Helpers for PyUnicode_Format() */
5059
5060static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005061getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062{
5063 int argidx = *p_argidx;
5064 if (argidx < arglen) {
5065 (*p_argidx)++;
5066 if (arglen < 0)
5067 return args;
5068 else
5069 return PyTuple_GetItem(args, argidx);
5070 }
5071 PyErr_SetString(PyExc_TypeError,
5072 "not enough arguments for format string");
5073 return NULL;
5074}
5075
/* Conversion flag bits collected by PyUnicode_Format() while scanning a
   format specifier; each is set by the flag character shown. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
5081
5082static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084{
5085 register int i;
5086 int len;
5087 va_list va;
5088 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090
5091 /* First, format the string as char array, then expand to Py_UNICODE
5092 array. */
5093 charbuffer = (char *)buffer;
5094 len = vsprintf(charbuffer, format, va);
5095 for (i = len - 1; i >= 0; i--)
5096 buffer[i] = (Py_UNICODE) charbuffer[i];
5097
5098 va_end(va);
5099 return len;
5100}
5101
5102static int
5103formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005104 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 int flags,
5106 int prec,
5107 int type,
5108 PyObject *v)
5109{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005110 /* fmt = '%#.' + `prec` + `type`
5111 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 char fmt[20];
5113 double x;
5114
5115 x = PyFloat_AsDouble(v);
5116 if (x == -1.0 && PyErr_Occurred())
5117 return -1;
5118 if (prec < 0)
5119 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5121 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005122 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5123 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005124 /* worst case length calc to ensure no buffer overrun:
5125 fmt = %#.<prec>g
5126 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5127 for any double rep.)
5128 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5129 If prec=0 the effective precision is 1 (the leading digit is
5130 always given), therefore increase by one to 10+prec. */
5131 if (buflen <= (size_t)10 + (size_t)prec) {
5132 PyErr_SetString(PyExc_OverflowError,
5133 "formatted float is too long (precision too long?)");
5134 return -1;
5135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 return usprintf(buf, fmt, x);
5137}
5138
Tim Peters38fd5b62000-09-21 05:43:11 +00005139static PyObject*
5140formatlong(PyObject *val, int flags, int prec, int type)
5141{
5142 char *buf;
5143 int i, len;
5144 PyObject *str; /* temporary string object. */
5145 PyUnicodeObject *result;
5146
5147 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5148 if (!str)
5149 return NULL;
5150 result = _PyUnicode_New(len);
5151 for (i = 0; i < len; i++)
5152 result->str[i] = buf[i];
5153 result->str[len] = 0;
5154 Py_DECREF(str);
5155 return (PyObject*)result;
5156}
5157
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158static int
5159formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005160 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 int flags,
5162 int prec,
5163 int type,
5164 PyObject *v)
5165{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005166 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005167 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5168 * + 1 + 1
5169 * = 24
5170 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005171 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 long x;
5173
5174 x = PyInt_AsLong(v);
5175 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005176 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005178 prec = 1;
5179
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005180 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005181 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5182 */
5183 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005184 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005185 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005186 return -1;
5187 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005188
5189 if ((flags & F_ALT) &&
5190 (type == 'x' || type == 'X')) {
5191 /* When converting under %#x or %#X, there are a number
5192 * of issues that cause pain:
5193 * - when 0 is being converted, the C standard leaves off
5194 * the '0x' or '0X', which is inconsistent with other
5195 * %#x/%#X conversions and inconsistent with Python's
5196 * hex() function
5197 * - there are platforms that violate the standard and
5198 * convert 0 with the '0x' or '0X'
5199 * (Metrowerks, Compaq Tru64)
5200 * - there are platforms that give '0x' when converting
5201 * under %#X, but convert 0 in accordance with the
5202 * standard (OS/2 EMX)
5203 *
5204 * We can achieve the desired consistency by inserting our
5205 * own '0x' or '0X' prefix, and substituting %x/%X in place
5206 * of %#x/%#X.
5207 *
5208 * Note that this is the same approach as used in
5209 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005210 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005211 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5212 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005213 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005214 else {
5215 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5216 (flags&F_ALT) ? "#" : "",
5217 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return usprintf(buf, fmt, x);
5220}
5221
5222static int
5223formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005224 size_t buflen,
5225 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005227 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005228 if (PyUnicode_Check(v)) {
5229 if (PyUnicode_GET_SIZE(v) != 1)
5230 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005234 else if (PyString_Check(v)) {
5235 if (PyString_GET_SIZE(v) != 1)
5236 goto onError;
5237 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239
5240 else {
5241 /* Integer input truncated to a character */
5242 long x;
5243 x = PyInt_AsLong(v);
5244 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005245 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 buf[0] = (char) x;
5247 }
5248 buf[1] = '\0';
5249 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005250
5251 onError:
5252 PyErr_SetString(PyExc_TypeError,
5253 "%c requires int or char");
5254 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255}
5256
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind to
   neighbouring tokens at the point of use (e.g. under sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
5266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267PyObject *PyUnicode_Format(PyObject *format,
5268 PyObject *args)
5269{
5270 Py_UNICODE *fmt, *res;
5271 int fmtcnt, rescnt, reslen, arglen, argidx;
5272 int args_owned = 0;
5273 PyUnicodeObject *result = NULL;
5274 PyObject *dict = NULL;
5275 PyObject *uformat;
5276
5277 if (format == NULL || args == NULL) {
5278 PyErr_BadInternalCall();
5279 return NULL;
5280 }
5281 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005282 if (uformat == NULL)
5283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 fmt = PyUnicode_AS_UNICODE(uformat);
5285 fmtcnt = PyUnicode_GET_SIZE(uformat);
5286
5287 reslen = rescnt = fmtcnt + 100;
5288 result = _PyUnicode_New(reslen);
5289 if (result == NULL)
5290 goto onError;
5291 res = PyUnicode_AS_UNICODE(result);
5292
5293 if (PyTuple_Check(args)) {
5294 arglen = PyTuple_Size(args);
5295 argidx = 0;
5296 }
5297 else {
5298 arglen = -1;
5299 argidx = -2;
5300 }
5301 if (args->ob_type->tp_as_mapping)
5302 dict = args;
5303
5304 while (--fmtcnt >= 0) {
5305 if (*fmt != '%') {
5306 if (--rescnt < 0) {
5307 rescnt = fmtcnt + 100;
5308 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005309 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 return NULL;
5311 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5312 --rescnt;
5313 }
5314 *res++ = *fmt++;
5315 }
5316 else {
5317 /* Got a format specifier */
5318 int flags = 0;
5319 int width = -1;
5320 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 Py_UNICODE c = '\0';
5322 Py_UNICODE fill;
5323 PyObject *v = NULL;
5324 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005325 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 Py_UNICODE sign;
5327 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005328 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329
5330 fmt++;
5331 if (*fmt == '(') {
5332 Py_UNICODE *keystart;
5333 int keylen;
5334 PyObject *key;
5335 int pcount = 1;
5336
5337 if (dict == NULL) {
5338 PyErr_SetString(PyExc_TypeError,
5339 "format requires a mapping");
5340 goto onError;
5341 }
5342 ++fmt;
5343 --fmtcnt;
5344 keystart = fmt;
5345 /* Skip over balanced parentheses */
5346 while (pcount > 0 && --fmtcnt >= 0) {
5347 if (*fmt == ')')
5348 --pcount;
5349 else if (*fmt == '(')
5350 ++pcount;
5351 fmt++;
5352 }
5353 keylen = fmt - keystart - 1;
5354 if (fmtcnt < 0 || pcount > 0) {
5355 PyErr_SetString(PyExc_ValueError,
5356 "incomplete format key");
5357 goto onError;
5358 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005359#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005360 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 then looked up since Python uses strings to hold
5362 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005363 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 key = PyUnicode_EncodeUTF8(keystart,
5365 keylen,
5366 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005367#else
5368 key = PyUnicode_FromUnicode(keystart, keylen);
5369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 if (key == NULL)
5371 goto onError;
5372 if (args_owned) {
5373 Py_DECREF(args);
5374 args_owned = 0;
5375 }
5376 args = PyObject_GetItem(dict, key);
5377 Py_DECREF(key);
5378 if (args == NULL) {
5379 goto onError;
5380 }
5381 args_owned = 1;
5382 arglen = -1;
5383 argidx = -2;
5384 }
5385 while (--fmtcnt >= 0) {
5386 switch (c = *fmt++) {
5387 case '-': flags |= F_LJUST; continue;
5388 case '+': flags |= F_SIGN; continue;
5389 case ' ': flags |= F_BLANK; continue;
5390 case '#': flags |= F_ALT; continue;
5391 case '0': flags |= F_ZERO; continue;
5392 }
5393 break;
5394 }
5395 if (c == '*') {
5396 v = getnextarg(args, arglen, &argidx);
5397 if (v == NULL)
5398 goto onError;
5399 if (!PyInt_Check(v)) {
5400 PyErr_SetString(PyExc_TypeError,
5401 "* wants int");
5402 goto onError;
5403 }
5404 width = PyInt_AsLong(v);
5405 if (width < 0) {
5406 flags |= F_LJUST;
5407 width = -width;
5408 }
5409 if (--fmtcnt >= 0)
5410 c = *fmt++;
5411 }
5412 else if (c >= '0' && c <= '9') {
5413 width = c - '0';
5414 while (--fmtcnt >= 0) {
5415 c = *fmt++;
5416 if (c < '0' || c > '9')
5417 break;
5418 if ((width*10) / 10 != width) {
5419 PyErr_SetString(PyExc_ValueError,
5420 "width too big");
5421 goto onError;
5422 }
5423 width = width*10 + (c - '0');
5424 }
5425 }
5426 if (c == '.') {
5427 prec = 0;
5428 if (--fmtcnt >= 0)
5429 c = *fmt++;
5430 if (c == '*') {
5431 v = getnextarg(args, arglen, &argidx);
5432 if (v == NULL)
5433 goto onError;
5434 if (!PyInt_Check(v)) {
5435 PyErr_SetString(PyExc_TypeError,
5436 "* wants int");
5437 goto onError;
5438 }
5439 prec = PyInt_AsLong(v);
5440 if (prec < 0)
5441 prec = 0;
5442 if (--fmtcnt >= 0)
5443 c = *fmt++;
5444 }
5445 else if (c >= '0' && c <= '9') {
5446 prec = c - '0';
5447 while (--fmtcnt >= 0) {
5448 c = Py_CHARMASK(*fmt++);
5449 if (c < '0' || c > '9')
5450 break;
5451 if ((prec*10) / 10 != prec) {
5452 PyErr_SetString(PyExc_ValueError,
5453 "prec too big");
5454 goto onError;
5455 }
5456 prec = prec*10 + (c - '0');
5457 }
5458 }
5459 } /* prec */
5460 if (fmtcnt >= 0) {
5461 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 if (--fmtcnt >= 0)
5463 c = *fmt++;
5464 }
5465 }
5466 if (fmtcnt < 0) {
5467 PyErr_SetString(PyExc_ValueError,
5468 "incomplete format");
5469 goto onError;
5470 }
5471 if (c != '%') {
5472 v = getnextarg(args, arglen, &argidx);
5473 if (v == NULL)
5474 goto onError;
5475 }
5476 sign = 0;
5477 fill = ' ';
5478 switch (c) {
5479
5480 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005481 pbuf = formatbuf;
5482 /* presume that buffer length is at least 1 */
5483 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 len = 1;
5485 break;
5486
5487 case 's':
5488 case 'r':
5489 if (PyUnicode_Check(v) && c == 's') {
5490 temp = v;
5491 Py_INCREF(temp);
5492 }
5493 else {
5494 PyObject *unicode;
5495 if (c == 's')
5496 temp = PyObject_Str(v);
5497 else
5498 temp = PyObject_Repr(v);
5499 if (temp == NULL)
5500 goto onError;
5501 if (!PyString_Check(temp)) {
5502 /* XXX Note: this should never happen, since
5503 PyObject_Repr() and PyObject_Str() assure
5504 this */
5505 Py_DECREF(temp);
5506 PyErr_SetString(PyExc_TypeError,
5507 "%s argument has non-string str()");
5508 goto onError;
5509 }
Fred Drakee4315f52000-05-09 19:53:39 +00005510 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005512 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 "strict");
5514 Py_DECREF(temp);
5515 temp = unicode;
5516 if (temp == NULL)
5517 goto onError;
5518 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005519 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 len = PyUnicode_GET_SIZE(temp);
5521 if (prec >= 0 && len > prec)
5522 len = prec;
5523 break;
5524
5525 case 'i':
5526 case 'd':
5527 case 'u':
5528 case 'o':
5529 case 'x':
5530 case 'X':
5531 if (c == 'i')
5532 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005533 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005534 temp = formatlong(v, flags, prec, c);
5535 if (!temp)
5536 goto onError;
5537 pbuf = PyUnicode_AS_UNICODE(temp);
5538 len = PyUnicode_GET_SIZE(temp);
5539 /* unbounded ints can always produce
5540 a sign character! */
5541 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005543 else {
5544 pbuf = formatbuf;
5545 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5546 flags, prec, c, v);
5547 if (len < 0)
5548 goto onError;
5549 /* only d conversion is signed */
5550 sign = c == 'd';
5551 }
5552 if (flags & F_ZERO)
5553 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 break;
5555
5556 case 'e':
5557 case 'E':
5558 case 'f':
5559 case 'g':
5560 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005561 pbuf = formatbuf;
5562 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5563 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 if (len < 0)
5565 goto onError;
5566 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005567 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 fill = '0';
5569 break;
5570
5571 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005572 pbuf = formatbuf;
5573 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 if (len < 0)
5575 goto onError;
5576 break;
5577
5578 default:
5579 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005580 "unsupported format character '%c' (0x%x) "
5581 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005582 (31<=c && c<=126) ? c : '?',
5583 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 goto onError;
5585 }
5586 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005587 if (*pbuf == '-' || *pbuf == '+') {
5588 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 len--;
5590 }
5591 else if (flags & F_SIGN)
5592 sign = '+';
5593 else if (flags & F_BLANK)
5594 sign = ' ';
5595 else
5596 sign = 0;
5597 }
5598 if (width < len)
5599 width = len;
5600 if (rescnt < width + (sign != 0)) {
5601 reslen -= rescnt;
5602 rescnt = width + fmtcnt + 100;
5603 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005604 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return NULL;
5606 res = PyUnicode_AS_UNICODE(result)
5607 + reslen - rescnt;
5608 }
5609 if (sign) {
5610 if (fill != ' ')
5611 *res++ = sign;
5612 rescnt--;
5613 if (width > len)
5614 width--;
5615 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005616 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5617 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005618 assert(pbuf[1] == c);
5619 if (fill != ' ') {
5620 *res++ = *pbuf++;
5621 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005622 }
Tim Petersfff53252001-04-12 18:38:48 +00005623 rescnt -= 2;
5624 width -= 2;
5625 if (width < 0)
5626 width = 0;
5627 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 if (width > len && !(flags & F_LJUST)) {
5630 do {
5631 --rescnt;
5632 *res++ = fill;
5633 } while (--width > len);
5634 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005635 if (fill == ' ') {
5636 if (sign)
5637 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005638 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005639 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005640 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005641 *res++ = *pbuf++;
5642 *res++ = *pbuf++;
5643 }
5644 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005645 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 res += len;
5647 rescnt -= len;
5648 while (--width >= len) {
5649 --rescnt;
5650 *res++ = ' ';
5651 }
5652 if (dict && (argidx < arglen) && c != '%') {
5653 PyErr_SetString(PyExc_TypeError,
5654 "not all arguments converted");
5655 goto onError;
5656 }
5657 Py_XDECREF(temp);
5658 } /* '%' */
5659 } /* until end */
5660 if (argidx < arglen && !dict) {
5661 PyErr_SetString(PyExc_TypeError,
5662 "not all arguments converted");
5663 goto onError;
5664 }
5665
5666 if (args_owned) {
5667 Py_DECREF(args);
5668 }
5669 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005670 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005671 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 return (PyObject *)result;
5673
5674 onError:
5675 Py_XDECREF(result);
5676 Py_DECREF(uformat);
5677 if (args_owned) {
5678 Py_DECREF(args);
5679 }
5680 return NULL;
5681}
5682
5683static PyBufferProcs unicode_as_buffer = {
5684 (getreadbufferproc) unicode_buffer_getreadbuf,
5685 (getwritebufferproc) unicode_buffer_getwritebuf,
5686 (getsegcountproc) unicode_buffer_getsegcount,
5687 (getcharbufferproc) unicode_buffer_getcharbuf,
5688};
5689
Guido van Rossume023fe02001-08-30 03:12:59 +00005690staticforward PyObject *
5691unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5692
Tim Peters6d6c1a32001-08-02 04:15:00 +00005693static PyObject *
5694unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5695{
5696 PyObject *x = NULL;
5697 static char *kwlist[] = {"string", "encoding", "errors", 0};
5698 char *encoding = NULL;
5699 char *errors = NULL;
5700
Guido van Rossume023fe02001-08-30 03:12:59 +00005701 if (type != &PyUnicode_Type)
5702 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005703 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5704 kwlist, &x, &encoding, &errors))
5705 return NULL;
5706 if (x == NULL)
5707 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005708 if (encoding == NULL && errors == NULL)
5709 return PyObject_Unicode(x);
5710 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005711 return PyUnicode_FromEncodedObject(x, encoding, errors);
5712}
5713
Guido van Rossume023fe02001-08-30 03:12:59 +00005714static PyObject *
5715unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5716{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005717 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005718 int n;
5719
5720 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5721 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5722 if (tmp == NULL)
5723 return NULL;
5724 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005725 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5726 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005727 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005728 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5729 if (pnew->str == NULL) {
5730 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005731 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005732 return NULL;
5733 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005734 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5735 pnew->length = n;
5736 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005737 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005738 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005739}
5740
/* Docstring installed as tp_doc of the unicode type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748PyTypeObject PyUnicode_Type = {
5749 PyObject_HEAD_INIT(&PyType_Type)
5750 0, /* ob_size */
5751 "unicode", /* tp_name */
5752 sizeof(PyUnicodeObject), /* tp_size */
5753 0, /* tp_itemsize */
5754 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005755 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005757 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 0, /* tp_setattr */
5759 (cmpfunc) unicode_compare, /* tp_compare */
5760 (reprfunc) unicode_repr, /* tp_repr */
5761 0, /* tp_as_number */
5762 &unicode_as_sequence, /* tp_as_sequence */
5763 0, /* tp_as_mapping */
5764 (hashfunc) unicode_hash, /* tp_hash*/
5765 0, /* tp_call*/
5766 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005767 PyObject_GenericGetAttr, /* tp_getattro */
5768 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005770 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005771 unicode_doc, /* tp_doc */
5772 0, /* tp_traverse */
5773 0, /* tp_clear */
5774 0, /* tp_richcompare */
5775 0, /* tp_weaklistoffset */
5776 0, /* tp_iter */
5777 0, /* tp_iternext */
5778 unicode_methods, /* tp_methods */
5779 0, /* tp_members */
5780 0, /* tp_getset */
5781 0, /* tp_base */
5782 0, /* tp_dict */
5783 0, /* tp_descr_get */
5784 0, /* tp_descr_set */
5785 0, /* tp_dictoffset */
5786 0, /* tp_init */
5787 0, /* tp_alloc */
5788 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005789 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790};
5791
5792/* Initialize the Unicode implementation */
5793
Thomas Wouters78890102000-07-22 19:25:51 +00005794void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005796 int i;
5797
Fred Drakee4315f52000-05-09 19:53:39 +00005798 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005799 unicode_freelist = NULL;
5800 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005802 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005803 for (i = 0; i < 256; i++)
5804 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805}
5806
5807/* Finalize the Unicode implementation */
5808
5809void
Thomas Wouters78890102000-07-22 19:25:51 +00005810_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005812 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005813 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005815 Py_XDECREF(unicode_empty);
5816 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005817
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005818 for (i = 0; i < 256; i++) {
5819 if (unicode_latin1[i]) {
5820 Py_DECREF(unicode_latin1[i]);
5821 unicode_latin1[i] = NULL;
5822 }
5823 }
5824
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005825 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 PyUnicodeObject *v = u;
5827 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005828 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005829 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005830 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005831 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005833 unicode_freelist = NULL;
5834 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835}