blob: 6dea94f4798f93c4019b61e2c22a22dfd7dac664 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 int owned = 0;
456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Greg Steinaf36a3a2000-07-17 09:04:43 +0000513 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000515 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000519 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523}
524
525PyObject *PyUnicode_Decode(const char *s,
526 int size,
527 const char *encoding,
528 const char *errors)
529{
530 PyObject *buffer = NULL, *unicode;
531
Fred Drakee4315f52000-05-09 19:53:39 +0000532 if (encoding == NULL)
533 encoding = PyUnicode_GetDefaultEncoding();
534
535 /* Shortcuts for common default encodings */
536 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "latin-1") == 0)
539 return PyUnicode_DecodeLatin1(s, size, errors);
540 else if (strcmp(encoding, "ascii") == 0)
541 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000542
543 /* Decode via the codec registry */
544 buffer = PyBuffer_FromMemory((void *)s, size);
545 if (buffer == NULL)
546 goto onError;
547 unicode = PyCodec_Decode(buffer, encoding, errors);
548 if (unicode == NULL)
549 goto onError;
550 if (!PyUnicode_Check(unicode)) {
551 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000552 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 unicode->ob_type->tp_name);
554 Py_DECREF(unicode);
555 goto onError;
556 }
557 Py_DECREF(buffer);
558 return unicode;
559
560 onError:
561 Py_XDECREF(buffer);
562 return NULL;
563}
564
565PyObject *PyUnicode_Encode(const Py_UNICODE *s,
566 int size,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v, *unicode;
571
572 unicode = PyUnicode_FromUnicode(s, size);
573 if (unicode == NULL)
574 return NULL;
575 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
576 Py_DECREF(unicode);
577 return v;
578}
579
580PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
581 const char *encoding,
582 const char *errors)
583{
584 PyObject *v;
585
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
Fred Drakee4315f52000-05-09 19:53:39 +0000590
591 if (encoding == NULL)
592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (errors == NULL) {
596 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000597 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "latin-1") == 0)
599 return PyUnicode_AsLatin1String(unicode);
600 else if (strcmp(encoding, "ascii") == 0)
601 return PyUnicode_AsASCIIString(unicode);
602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603
604 /* Encode via the codec registry */
605 v = PyCodec_Encode(unicode, encoding, errors);
606 if (v == NULL)
607 goto onError;
608 /* XXX Should we really enforce this ? */
609 if (!PyString_Check(v)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 v->ob_type->tp_name);
613 Py_DECREF(v);
614 goto onError;
615 }
616 return v;
617
618 onError:
619 return NULL;
620}
621
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
623 const char *errors)
624{
625 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
626
627 if (v)
628 return v;
629 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
630 if (v && errors == NULL)
631 ((PyUnicodeObject *)unicode)->defenc = v;
632 return v;
633}
634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
636{
637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
641 return PyUnicode_AS_UNICODE(unicode);
642
643 onError:
644 return NULL;
645}
646
647int PyUnicode_GetSize(PyObject *unicode)
648{
649 if (!PyUnicode_Check(unicode)) {
650 PyErr_BadArgument();
651 goto onError;
652 }
653 return PyUnicode_GET_SIZE(unicode);
654
655 onError:
656 return -1;
657}
658
Thomas Wouters78890102000-07-22 19:25:51 +0000659const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000660{
661 return unicode_default_encoding;
662}
663
664int PyUnicode_SetDefaultEncoding(const char *encoding)
665{
666 PyObject *v;
667
668 /* Make sure the encoding is valid. As side effect, this also
669 loads the encoding into the codec registry cache. */
670 v = _PyCodec_Lookup(encoding);
671 if (v == NULL)
672 goto onError;
673 Py_DECREF(v);
674 strncpy(unicode_default_encoding,
675 encoding,
676 sizeof(unicode_default_encoding));
677 return 0;
678
679 onError:
680 return -1;
681}
682
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000683/* --- UTF-7 Codec -------------------------------------------------------- */
684
685/* see RFC2152 for details */
686
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   i.e. whether a character can be written directly or has to be
   encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive character codes. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
705
/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be
   written directly in UTF-7: anything above 127, anything marked 1 in
   utf7_special, and optionally whitespace (class 2) and RFC2152 Set O
   characters (class 3). */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c) > 127 || utf7_special[(c)] == 1) || \
         ((encodeWS) && (utf7_special[(c)] == 2)) || \
         ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
        ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a base64 digit.  Explicit ASCII ranges
   are used instead of isalnum(), which depends on the current locale
   and is undefined for values that do not fit an unsigned char. */
#define B64CHAR(c) \
        (((c) >= 'A' && (c) <= 'Z') || \
         ((c) >= 'a' && (c) <= 'z') || \
         ((c) >= '0' && (c) <= '9') || \
         (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): emit base64 digits for the topmost complete
   6-bit groups of the `bits` pending bits in ch, advancing out and
   decreasing bits accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): emit one Py_UNICODE for every
   complete 16-bit group among the `bits` pending bits in ch.  Uses the
   enclosing function's `errmsg` variable and `utf7Error` label to
   report values in the 0xDC00-0xDFFF range. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
740
741static
742int utf7_decoding_error(Py_UNICODE **dest,
743 const char *errors,
744 const char *details)
745{
746 if ((errors == NULL) ||
747 (strcmp(errors,"strict") == 0)) {
748 PyErr_Format(PyExc_UnicodeError,
749 "UTF-7 decoding error: %.400s",
750 details);
751 return -1;
752 }
753 else if (strcmp(errors,"ignore") == 0) {
754 return 0;
755 }
756 else if (strcmp(errors,"replace") == 0) {
757 if (dest != NULL) {
758 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
759 (*dest)++;
760 }
761 return 0;
762 }
763 else {
764 PyErr_Format(PyExc_ValueError,
765 "UTF-7 decoding error; unknown error handling code: %.400s",
766 errors);
767 return -1;
768 }
769}
770
771PyObject *PyUnicode_DecodeUTF7(const char *s,
772 int size,
773 const char *errors)
774{
775 const char *e;
776 PyUnicodeObject *unicode;
777 Py_UNICODE *p;
778 const char *errmsg = "";
779 int inShift = 0;
780 unsigned int bitsleft = 0;
781 unsigned long charsleft = 0;
782 int surrogate = 0;
783
784 unicode = _PyUnicode_New(size);
785 if (!unicode)
786 return NULL;
787 if (size == 0)
788 return (PyObject *)unicode;
789
790 p = unicode->str;
791 e = s + size;
792
793 while (s < e) {
794 Py_UNICODE ch = *s;
795
796 if (inShift) {
797 if ((ch == '-') || !B64CHAR(ch)) {
798 inShift = 0;
799 s++;
800
801 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
802 if (bitsleft >= 6) {
803 /* The shift sequence has a partial character in it. If
804 bitsleft < 6 then we could just classify it as padding
805 but that is not the case here */
806
807 errmsg = "partial character in shift sequence";
808 goto utf7Error;
809 }
810 /* According to RFC2152 the remaining bits should be zero. We
811 choose to signal an error/insert a replacement character
812 here so indicate the potential of a misencoded character. */
813
814 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
815 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
816 errmsg = "non-zero padding bits in shift sequence";
817 goto utf7Error;
818 }
819
820 if (ch == '-') {
821 if ((s < e) && (*(s) == '-')) {
822 *p++ = '-';
823 inShift = 1;
824 }
825 } else if (SPECIAL(ch,0,0)) {
826 errmsg = "unexpected special character";
827 goto utf7Error;
828 } else {
829 *p++ = ch;
830 }
831 } else {
832 charsleft = (charsleft << 6) | UB64(ch);
833 bitsleft += 6;
834 s++;
835 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
836 }
837 }
838 else if ( ch == '+' ) {
839 s++;
840 if (s < e && *s == '-') {
841 s++;
842 *p++ = '+';
843 } else
844 {
845 inShift = 1;
846 bitsleft = 0;
847 }
848 }
849 else if (SPECIAL(ch,0,0)) {
850 errmsg = "unexpected special character";
851 s++;
852 goto utf7Error;
853 }
854 else {
855 *p++ = ch;
856 s++;
857 }
858 continue;
859 utf7Error:
860 if (utf7_decoding_error(&p, errors, errmsg))
861 goto onError;
862 }
863
864 if (inShift) {
865 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
866 goto onError;
867 }
868
869 if (_PyUnicode_Resize(&unicode, p - unicode->str))
870 goto onError;
871
872 return (PyObject *)unicode;
873
874onError:
875 Py_DECREF(unicode);
876 return NULL;
877}
878
879
880PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
881 int size,
882 int encodeSetO,
883 int encodeWhiteSpace,
884 const char *errors)
885{
886 PyObject *v;
887 /* It might be possible to tighten this worst case */
888 unsigned int cbAllocated = 5 * size;
889 int inShift = 0;
890 int i = 0;
891 unsigned int bitsleft = 0;
892 unsigned long charsleft = 0;
893 char * out;
894 char * start;
895
896 if (size == 0)
897 return PyString_FromStringAndSize(NULL, 0);
898
899 v = PyString_FromStringAndSize(NULL, cbAllocated);
900 if (v == NULL)
901 return NULL;
902
903 start = out = PyString_AS_STRING(v);
904 for (;i < size; ++i) {
905 Py_UNICODE ch = s[i];
906
907 if (!inShift) {
908 if (ch == '+') {
909 *out++ = '+';
910 *out++ = '-';
911 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
912 charsleft = ch;
913 bitsleft = 16;
914 *out++ = '+';
915 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
916 inShift = bitsleft > 0;
917 } else {
918 *out++ = (char) ch;
919 }
920 } else {
921 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
922 *out++ = B64(charsleft << (6-bitsleft));
923 charsleft = 0;
924 bitsleft = 0;
925 /* Characters not in the BASE64 set implicitly unshift the sequence
926 so no '-' is required, except if the character is itself a '-' */
927 if (B64CHAR(ch) || ch == '-') {
928 *out++ = '-';
929 }
930 inShift = 0;
931 *out++ = (char) ch;
932 } else {
933 bitsleft += 16;
934 charsleft = (charsleft << 16) | ch;
935 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
936
937 /* If the next character is special then we dont' need to terminate
938 the shift sequence. If the next character is not a BASE64 character
939 or '-' then the shift sequence will be terminated implicitly and we
940 don't have to insert a '-'. */
941
942 if (bitsleft == 0) {
943 if (i + 1 < size) {
944 Py_UNICODE ch2 = s[i+1];
945
946 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
947
948 } else if (B64CHAR(ch2) || ch2 == '-') {
949 *out++ = '-';
950 inShift = 0;
951 } else {
952 inShift = 0;
953 }
954
955 }
956 else {
957 *out++ = '-';
958 inShift = 0;
959 }
960 }
961 }
962 }
963 }
964 if (bitsleft) {
965 *out++= B64(charsleft << (6-bitsleft) );
966 *out++ = '-';
967 }
968
Tim Peters5de98422002-04-27 18:44:32 +0000969 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000970 return v;
971}
972
973#undef SPECIAL
974#undef B64
975#undef B64CHAR
976#undef UB64
977#undef ENCODE
978#undef DECODE
979
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980/* --- UTF-8 Codec -------------------------------------------------------- */
981
/* Map a UTF-8 lead byte to the length in bytes of the sequence it starts.
   Zero means the byte is not a legal prefix (see RFC 2279 for details).
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xfd: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1003
1004static
1005int utf8_decoding_error(const char **source,
1006 Py_UNICODE **dest,
1007 const char *errors,
1008 const char *details)
1009{
1010 if ((errors == NULL) ||
1011 (strcmp(errors,"strict") == 0)) {
1012 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001013 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 details);
1015 return -1;
1016 }
1017 else if (strcmp(errors,"ignore") == 0) {
1018 (*source)++;
1019 return 0;
1020 }
1021 else if (strcmp(errors,"replace") == 0) {
1022 (*source)++;
1023 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1024 (*dest)++;
1025 return 0;
1026 }
1027 else {
1028 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001029 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 errors);
1031 return -1;
1032 }
1033}
1034
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035PyObject *PyUnicode_DecodeUTF8(const char *s,
1036 int size,
1037 const char *errors)
1038{
1039 int n;
1040 const char *e;
1041 PyUnicodeObject *unicode;
1042 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001043 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044
1045 /* Note: size will always be longer than the resulting Unicode
1046 character count */
1047 unicode = _PyUnicode_New(size);
1048 if (!unicode)
1049 return NULL;
1050 if (size == 0)
1051 return (PyObject *)unicode;
1052
1053 /* Unpack UTF-8 encoded data */
1054 p = unicode->str;
1055 e = s + size;
1056
1057 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001058 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001061 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062 s++;
1063 continue;
1064 }
1065
1066 n = utf8_code_length[ch];
1067
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001068 if (s + n > e) {
1069 errmsg = "unexpected end of data";
1070 goto utf8Error;
1071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072
1073 switch (n) {
1074
1075 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "unexpected code byte";
1077 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001080 errmsg = "internal error";
1081 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082
1083 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001084 if ((s[1] & 0xc0) != 0x80) {
1085 errmsg = "invalid data";
1086 goto utf8Error;
1087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001089 if (ch < 0x80) {
1090 errmsg = "illegal encoding";
1091 goto utf8Error;
1092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 break;
1096
1097 case 3:
1098 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001099 (s[2] & 0xc0) != 0x80) {
1100 errmsg = "invalid data";
1101 goto utf8Error;
1102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001104 if (ch < 0x0800) {
1105 /* Note: UTF-8 encodings of surrogates are considered
1106 legal UTF-8 sequences;
1107
1108 XXX For wide builds (UCS-4) we should probably try
1109 to recombine the surrogates into a single code
1110 unit.
1111 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001112 errmsg = "illegal encoding";
1113 goto utf8Error;
1114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001116 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001117 break;
1118
1119 case 4:
1120 if ((s[1] & 0xc0) != 0x80 ||
1121 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001122 (s[3] & 0xc0) != 0x80) {
1123 errmsg = "invalid data";
1124 goto utf8Error;
1125 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001126 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1127 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1128 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001129 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001130 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001131 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001132 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001133 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 errmsg = "illegal encoding";
1135 goto utf8Error;
1136 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001137#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001138 *p++ = (Py_UNICODE)ch;
1139#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001140 /* compute and append the two surrogates: */
1141
1142 /* translate from 10000..10FFFF to 0..FFFF */
1143 ch -= 0x10000;
1144
1145 /* high surrogate = top 10 bits added to D800 */
1146 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1147
1148 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001149 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001150#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 break;
1152
1153 default:
1154 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 errmsg = "unsupported Unicode code range";
1156 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 }
1158 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 continue;
1160
1161 utf8Error:
1162 if (utf8_decoding_error(&s, &p, errors, errmsg))
1163 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
1165
1166 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001167 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 goto onError;
1169
1170 return (PyObject *)unicode;
1171
1172onError:
1173 Py_DECREF(unicode);
1174 return NULL;
1175}
1176
Tim Peters602f7402002-04-27 18:03:26 +00001177/* Allocation strategy: if the string is short, convert into a stack buffer
1178 and allocate exactly as much space needed at the end. Else allocate the
1179 maximum possible needed (4 result bytes per Unicode character), and return
1180 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001181*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001182PyObject *
1183PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1184 int size,
1185 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186{
Tim Peters602f7402002-04-27 18:03:26 +00001187#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001188
Tim Peters602f7402002-04-27 18:03:26 +00001189 int i; /* index into s of next input byte */
1190 PyObject *v; /* result string object */
1191 char *p; /* next free byte in output buffer */
1192 int nallocated; /* number of result bytes allocated */
1193 int nneeded; /* number of result bytes needed */
1194 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001195
Tim Peters602f7402002-04-27 18:03:26 +00001196 assert(s != NULL);
1197 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
Tim Peters602f7402002-04-27 18:03:26 +00001199 if (size <= MAX_SHORT_UNICHARS) {
1200 /* Write into the stack buffer; nallocated can't overflow.
1201 * At the end, we'll allocate exactly as much heap space as it
1202 * turns out we need.
1203 */
1204 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1205 v = NULL; /* will allocate after we're done */
1206 p = stackbuf;
1207 }
1208 else {
1209 /* Overallocate on the heap, and give the excess back at the end. */
1210 nallocated = size * 4;
1211 if (nallocated / 4 != size) /* overflow! */
1212 return PyErr_NoMemory();
1213 v = PyString_FromStringAndSize(NULL, nallocated);
1214 if (v == NULL)
1215 return NULL;
1216 p = PyString_AS_STRING(v);
1217 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001218
Tim Peters602f7402002-04-27 18:03:26 +00001219 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001221
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001222 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001223 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001227 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001228 *p++ = (char)(0xc0 | (ch >> 6));
1229 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001230 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001231 else {
Tim Peters602f7402002-04-27 18:03:26 +00001232 /* Encode UCS2 Unicode ordinals */
1233 if (ch < 0x10000) {
1234 /* Special case: check for high surrogate */
1235 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1236 Py_UCS4 ch2 = s[i];
1237 /* Check for low surrogate and combine the two to
1238 form a UCS4 value */
1239 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001240 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001241 i++;
1242 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 }
Tim Peters602f7402002-04-27 18:03:26 +00001244 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001247 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1248 *p++ = (char)(0x80 | (ch & 0x3f));
1249 continue;
1250 }
1251encodeUCS4:
1252 /* Encode UCS4 Unicode ordinals */
1253 *p++ = (char)(0xf0 | (ch >> 18));
1254 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1255 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1256 *p++ = (char)(0x80 | (ch & 0x3f));
1257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001259
Tim Peters602f7402002-04-27 18:03:26 +00001260 if (v == NULL) {
1261 /* This was stack allocated. */
1262 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1263 assert(nneeded <= nallocated);
1264 v = PyString_FromStringAndSize(stackbuf, nneeded);
1265 }
1266 else {
1267 /* Cut back to size actually needed. */
1268 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1269 assert(nneeded <= nallocated);
1270 _PyString_Resize(&v, nneeded);
1271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001273
Tim Peters602f7402002-04-27 18:03:26 +00001274#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1278{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 if (!PyUnicode_Check(unicode)) {
1280 PyErr_BadArgument();
1281 return NULL;
1282 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001283 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1284 PyUnicode_GET_SIZE(unicode),
1285 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286}
1287
1288/* --- UTF-16 Codec ------------------------------------------------------- */
1289
1290static
Tim Peters772747b2001-08-09 22:21:55 +00001291int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 const char *errors,
1293 const char *details)
1294{
1295 if ((errors == NULL) ||
1296 (strcmp(errors,"strict") == 0)) {
1297 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001298 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 details);
1300 return -1;
1301 }
1302 else if (strcmp(errors,"ignore") == 0) {
1303 return 0;
1304 }
1305 else if (strcmp(errors,"replace") == 0) {
1306 if (dest) {
1307 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1308 (*dest)++;
1309 }
1310 return 0;
1311 }
1312 else {
1313 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001314 "UTF-16 decoding error; "
1315 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 errors);
1317 return -1;
1318 }
1319}
1320
Tim Peters772747b2001-08-09 22:21:55 +00001321PyObject *
1322PyUnicode_DecodeUTF16(const char *s,
1323 int size,
1324 const char *errors,
1325 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326{
1327 PyUnicodeObject *unicode;
1328 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001329 const unsigned char *q, *e;
1330 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001331 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001332 /* Offsets from q for retrieving byte pairs in the right order. */
1333#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1334 int ihi = 1, ilo = 0;
1335#else
1336 int ihi = 0, ilo = 1;
1337#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
1339 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001340 if (size & 1) {
1341 if (utf16_decoding_error(NULL, errors, "truncated data"))
1342 return NULL;
1343 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 }
1345
1346 /* Note: size will always be longer than the resulting Unicode
1347 character count */
1348 unicode = _PyUnicode_New(size);
1349 if (!unicode)
1350 return NULL;
1351 if (size == 0)
1352 return (PyObject *)unicode;
1353
1354 /* Unpack UTF-16 encoded data */
1355 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001356 q = (unsigned char *)s;
1357 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
1359 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001360 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001362 /* Check for BOM marks (U+FEFF) in the input and adjust current
1363 byte order setting accordingly. In native mode, the leading BOM
1364 mark is skipped, in all other modes, it is copied to the output
1365 stream as-is (giving a ZWNBSP character). */
1366 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001367 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001368#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001369 if (bom == 0xFEFF) {
1370 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001371 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001372 }
1373 else if (bom == 0xFFFE) {
1374 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001375 bo = 1;
1376 }
1377#else
Tim Peters772747b2001-08-09 22:21:55 +00001378 if (bom == 0xFEFF) {
1379 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001380 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001381 }
1382 else if (bom == 0xFFFE) {
1383 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001384 bo = -1;
1385 }
1386#endif
1387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388
Tim Peters772747b2001-08-09 22:21:55 +00001389 if (bo == -1) {
1390 /* force LE */
1391 ihi = 1;
1392 ilo = 0;
1393 }
1394 else if (bo == 1) {
1395 /* force BE */
1396 ihi = 0;
1397 ilo = 1;
1398 }
1399
1400 while (q < e) {
1401 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1402 q += 2;
1403
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 if (ch < 0xD800 || ch > 0xDFFF) {
1405 *p++ = ch;
1406 continue;
1407 }
1408
1409 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001410 if (q >= e) {
1411 errmsg = "unexpected end of data";
1412 goto utf16Error;
1413 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001414 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001415 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1416 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001417 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001418#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001419 *p++ = ch;
1420 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001421#else
1422 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001423#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001424 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001425 }
1426 else {
1427 errmsg = "illegal UTF-16 surrogate";
1428 goto utf16Error;
1429 }
1430
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001432 errmsg = "illegal encoding";
1433 /* Fall through to report the error */
1434
1435 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001436 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 }
1439
1440 if (byteorder)
1441 *byteorder = bo;
1442
1443 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001444 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 goto onError;
1446
1447 return (PyObject *)unicode;
1448
1449onError:
1450 Py_DECREF(unicode);
1451 return NULL;
1452}
1453
Tim Peters772747b2001-08-09 22:21:55 +00001454PyObject *
1455PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1456 int size,
1457 const char *errors,
1458 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459{
1460 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001461 unsigned char *p;
1462 int i, pairs;
1463 /* Offsets from p for storing byte pairs in the right order. */
1464#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1465 int ihi = 1, ilo = 0;
1466#else
1467 int ihi = 0, ilo = 1;
1468#endif
1469
1470#define STORECHAR(CH) \
1471 do { \
1472 p[ihi] = ((CH) >> 8) & 0xff; \
1473 p[ilo] = (CH) & 0xff; \
1474 p += 2; \
1475 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001477 for (i = pairs = 0; i < size; i++)
1478 if (s[i] >= 0x10000)
1479 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001481 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 if (v == NULL)
1483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Tim Peters772747b2001-08-09 22:21:55 +00001485 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001487 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001488 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001489 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001490
1491 if (byteorder == -1) {
1492 /* force LE */
1493 ihi = 1;
1494 ilo = 0;
1495 }
1496 else if (byteorder == 1) {
1497 /* force BE */
1498 ihi = 0;
1499 ilo = 1;
1500 }
1501
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001502 while (size-- > 0) {
1503 Py_UNICODE ch = *s++;
1504 Py_UNICODE ch2 = 0;
1505 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001506 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1507 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508 }
Tim Peters772747b2001-08-09 22:21:55 +00001509 STORECHAR(ch);
1510 if (ch2)
1511 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001514#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515}
1516
1517PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1518{
1519 if (!PyUnicode_Check(unicode)) {
1520 PyErr_BadArgument();
1521 return NULL;
1522 }
1523 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1524 PyUnicode_GET_SIZE(unicode),
1525 NULL,
1526 0);
1527}
1528
1529/* --- Unicode Escape Codec ----------------------------------------------- */
1530
1531static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001532int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 const char *errors,
1534 const char *details)
1535{
1536 if ((errors == NULL) ||
1537 (strcmp(errors,"strict") == 0)) {
1538 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001539 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 details);
1541 return -1;
1542 }
1543 else if (strcmp(errors,"ignore") == 0) {
1544 return 0;
1545 }
1546 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001547 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1548 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 return 0;
1550 }
1551 else {
1552 PyErr_Format(PyExc_ValueError,
1553 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001554 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 errors);
1556 return -1;
1557 }
1558}
1559
Fredrik Lundh06d12682001-01-24 07:59:11 +00001560static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001561
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1563 int size,
1564 const char *errors)
1565{
1566 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001567 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001569 char* message;
1570 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1571
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572 /* Escaped strings will always be longer than the resulting
1573 Unicode string, so we start with size here and then reduce the
1574 length after conversion to the true value. */
1575 v = _PyUnicode_New(size);
1576 if (v == NULL)
1577 goto onError;
1578 if (size == 0)
1579 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001580
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 p = buf = PyUnicode_AS_UNICODE(v);
1582 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001583
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 while (s < end) {
1585 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001586 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001587 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588
1589 /* Non-escape characters are interpreted as Unicode ordinals */
1590 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001591 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 continue;
1593 }
1594
1595 /* \ - Escapes */
1596 s++;
1597 switch (*s++) {
1598
1599 /* \x escapes */
1600 case '\n': break;
1601 case '\\': *p++ = '\\'; break;
1602 case '\'': *p++ = '\''; break;
1603 case '\"': *p++ = '\"'; break;
1604 case 'b': *p++ = '\b'; break;
1605 case 'f': *p++ = '\014'; break; /* FF */
1606 case 't': *p++ = '\t'; break;
1607 case 'n': *p++ = '\n'; break;
1608 case 'r': *p++ = '\r'; break;
1609 case 'v': *p++ = '\013'; break; /* VT */
1610 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1611
1612 /* \OOO (octal) escapes */
1613 case '0': case '1': case '2': case '3':
1614 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001615 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001617 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001619 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001621 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 break;
1623
Fredrik Lundhccc74732001-02-18 22:13:49 +00001624 /* hex escapes */
1625 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001627 digits = 2;
1628 message = "truncated \\xXX escape";
1629 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 digits = 4;
1634 message = "truncated \\uXXXX escape";
1635 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636
Fredrik Lundhccc74732001-02-18 22:13:49 +00001637 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001638 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001639 digits = 8;
1640 message = "truncated \\UXXXXXXXX escape";
1641 hexescape:
1642 chr = 0;
1643 for (i = 0; i < digits; i++) {
1644 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001646 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001648 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 i++;
1650 break;
1651 }
1652 chr = (chr<<4) & ~0xF;
1653 if (c >= '0' && c <= '9')
1654 chr += c - '0';
1655 else if (c >= 'a' && c <= 'f')
1656 chr += 10 + c - 'a';
1657 else
1658 chr += 10 + c - 'A';
1659 }
1660 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001661 if (chr == 0xffffffff)
1662 /* _decoding_error will have already written into the
1663 target buffer. */
1664 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001665 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001666 /* when we get here, chr is a 32-bit unicode character */
1667 if (chr <= 0xffff)
1668 /* UCS-2 character */
1669 *p++ = (Py_UNICODE) chr;
1670 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001671 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001672 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001673#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001674 *p++ = chr;
1675#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001676 chr -= 0x10000L;
1677 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001678 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001679#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001680 } else {
1681 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001682 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001683 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001684 )
1685 goto onError;
1686 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001687 break;
1688
1689 /* \N{name} */
1690 case 'N':
1691 message = "malformed \\N character escape";
1692 if (ucnhash_CAPI == NULL) {
1693 /* load the unicode data module */
1694 PyObject *m, *v;
1695 m = PyImport_ImportModule("unicodedata");
1696 if (m == NULL)
1697 goto ucnhashError;
1698 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1699 Py_DECREF(m);
1700 if (v == NULL)
1701 goto ucnhashError;
1702 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1703 Py_DECREF(v);
1704 if (ucnhash_CAPI == NULL)
1705 goto ucnhashError;
1706 }
1707 if (*s == '{') {
1708 const char *start = s+1;
1709 /* look for the closing brace */
1710 while (*s != '}' && s < end)
1711 s++;
1712 if (s > start && s < end && *s == '}') {
1713 /* found a name. look it up in the unicode database */
1714 message = "unknown Unicode character name";
1715 s++;
1716 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1717 goto store;
1718 }
1719 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001720 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722 break;
1723
1724 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001725 if (s > end) {
1726 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1727 goto onError;
1728 }
1729 else {
1730 *p++ = '\\';
1731 *p++ = (unsigned char)s[-1];
1732 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 }
1735 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001736 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001737 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001739
Fredrik Lundhccc74732001-02-18 22:13:49 +00001740ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001741 PyErr_SetString(
1742 PyExc_UnicodeError,
1743 "\\N escapes not supported (can't load unicodedata module)"
1744 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001745 return NULL;
1746
Fredrik Lundhccc74732001-02-18 22:13:49 +00001747onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 Py_XDECREF(v);
1749 return NULL;
1750}
1751
1752/* Return a Unicode-Escape string version of the Unicode object.
1753
1754 If quotes is true, the string is enclosed in u"" or u'' quotes as
1755 appropriate.
1756
1757*/
1758
Barry Warsaw51ac5802000-03-20 16:36:48 +00001759static const Py_UNICODE *findchar(const Py_UNICODE *s,
1760 int size,
1761 Py_UNICODE ch);
1762
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763static
1764PyObject *unicodeescape_string(const Py_UNICODE *s,
1765 int size,
1766 int quotes)
1767{
1768 PyObject *repr;
1769 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001771 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772
1773 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1774 if (repr == NULL)
1775 return NULL;
1776
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001777 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778
1779 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 *p++ = 'u';
1781 *p++ = (findchar(s, size, '\'') &&
1782 !findchar(s, size, '"')) ? '"' : '\'';
1783 }
1784 while (size-- > 0) {
1785 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001786
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001788 if (quotes &&
1789 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 *p++ = '\\';
1791 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001792 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001794
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001795#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001796 /* Map 21-bit characters to '\U00xxxxxx' */
1797 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001798 int offset = p - PyString_AS_STRING(repr);
1799
1800 /* Resize the string if necessary */
1801 if (offset + 12 > PyString_GET_SIZE(repr)) {
1802 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001803 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001804 p = PyString_AS_STRING(repr) + offset;
1805 }
1806
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001807 *p++ = '\\';
1808 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001809 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1810 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1811 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1812 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1813 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1814 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1815 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001816 *p++ = hexdigit[ch & 0x0000000F];
1817 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001818 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001819#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001820 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1821 else if (ch >= 0xD800 && ch < 0xDC00) {
1822 Py_UNICODE ch2;
1823 Py_UCS4 ucs;
1824
1825 ch2 = *s++;
1826 size--;
1827 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1828 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1829 *p++ = '\\';
1830 *p++ = 'U';
1831 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1832 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1833 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1834 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1835 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1836 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1837 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1838 *p++ = hexdigit[ucs & 0x0000000F];
1839 continue;
1840 }
1841 /* Fall through: isolated surrogates are copied as-is */
1842 s--;
1843 size++;
1844 }
1845
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001847 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 *p++ = '\\';
1849 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001850 *p++ = hexdigit[(ch >> 12) & 0x000F];
1851 *p++ = hexdigit[(ch >> 8) & 0x000F];
1852 *p++ = hexdigit[(ch >> 4) & 0x000F];
1853 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001855
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001856 /* Map special whitespace to '\t', \n', '\r' */
1857 else if (ch == '\t') {
1858 *p++ = '\\';
1859 *p++ = 't';
1860 }
1861 else if (ch == '\n') {
1862 *p++ = '\\';
1863 *p++ = 'n';
1864 }
1865 else if (ch == '\r') {
1866 *p++ = '\\';
1867 *p++ = 'r';
1868 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001870 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001871 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001873 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001874 *p++ = hexdigit[(ch >> 4) & 0x000F];
1875 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001877
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* Copy everything else as-is */
1879 else
1880 *p++ = (char) ch;
1881 }
1882 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001883 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884
1885 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001886 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 return repr;
1888}
1889
1890PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1891 int size)
1892{
1893 return unicodeescape_string(s, size, 0);
1894}
1895
1896PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1897{
1898 if (!PyUnicode_Check(unicode)) {
1899 PyErr_BadArgument();
1900 return NULL;
1901 }
1902 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1903 PyUnicode_GET_SIZE(unicode));
1904}
1905
1906/* --- Raw Unicode Escape Codec ------------------------------------------- */
1907
1908PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1909 int size,
1910 const char *errors)
1911{
1912 PyUnicodeObject *v;
1913 Py_UNICODE *p, *buf;
1914 const char *end;
1915 const char *bs;
1916
1917 /* Escaped strings will always be longer than the resulting
1918 Unicode string, so we start with size here and then reduce the
1919 length after conversion to the true value. */
1920 v = _PyUnicode_New(size);
1921 if (v == NULL)
1922 goto onError;
1923 if (size == 0)
1924 return (PyObject *)v;
1925 p = buf = PyUnicode_AS_UNICODE(v);
1926 end = s + size;
1927 while (s < end) {
1928 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001929 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 int i;
1931
1932 /* Non-escape characters are interpreted as Unicode ordinals */
1933 if (*s != '\\') {
1934 *p++ = (unsigned char)*s++;
1935 continue;
1936 }
1937
1938 /* \u-escapes are only interpreted iff the number of leading
1939 backslashes if odd */
1940 bs = s;
1941 for (;s < end;) {
1942 if (*s != '\\')
1943 break;
1944 *p++ = (unsigned char)*s++;
1945 }
1946 if (((s - bs) & 1) == 0 ||
1947 s >= end ||
1948 *s != 'u') {
1949 continue;
1950 }
1951 p--;
1952 s++;
1953
1954 /* \uXXXX with 4 hex digits */
1955 for (x = 0, i = 0; i < 4; i++) {
1956 c = (unsigned char)s[i];
1957 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001958 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 "truncated \\uXXXX"))
1960 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001961 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 i++;
1963 break;
1964 }
1965 x = (x<<4) & ~0xF;
1966 if (c >= '0' && c <= '9')
1967 x += c - '0';
1968 else if (c >= 'a' && c <= 'f')
1969 x += 10 + c - 'a';
1970 else
1971 x += 10 + c - 'A';
1972 }
1973 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001974 if (x != 0xffffffff)
1975 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001977 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return (PyObject *)v;
1980
1981 onError:
1982 Py_XDECREF(v);
1983 return NULL;
1984}
1985
1986PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1987 int size)
1988{
1989 PyObject *repr;
1990 char *p;
1991 char *q;
1992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001993 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 repr = PyString_FromStringAndSize(NULL, 6 * size);
1996 if (repr == NULL)
1997 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001998 if (size == 0)
1999 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
2001 p = q = PyString_AS_STRING(repr);
2002 while (size-- > 0) {
2003 Py_UNICODE ch = *s++;
2004 /* Map 16-bit characters to '\uxxxx' */
2005 if (ch >= 256) {
2006 *p++ = '\\';
2007 *p++ = 'u';
2008 *p++ = hexdigit[(ch >> 12) & 0xf];
2009 *p++ = hexdigit[(ch >> 8) & 0xf];
2010 *p++ = hexdigit[(ch >> 4) & 0xf];
2011 *p++ = hexdigit[ch & 15];
2012 }
2013 /* Copy everything else as-is */
2014 else
2015 *p++ = (char) ch;
2016 }
2017 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002018 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 return repr;
2020}
2021
2022PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2023{
2024 if (!PyUnicode_Check(unicode)) {
2025 PyErr_BadArgument();
2026 return NULL;
2027 }
2028 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2029 PyUnicode_GET_SIZE(unicode));
2030}
2031
2032/* --- Latin-1 Codec ------------------------------------------------------ */
2033
2034PyObject *PyUnicode_DecodeLatin1(const char *s,
2035 int size,
2036 const char *errors)
2037{
2038 PyUnicodeObject *v;
2039 Py_UNICODE *p;
2040
2041 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002042 if (size == 1 && *(unsigned char*)s < 256) {
2043 Py_UNICODE r = *(unsigned char*)s;
2044 return PyUnicode_FromUnicode(&r, 1);
2045 }
2046
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 v = _PyUnicode_New(size);
2048 if (v == NULL)
2049 goto onError;
2050 if (size == 0)
2051 return (PyObject *)v;
2052 p = PyUnicode_AS_UNICODE(v);
2053 while (size-- > 0)
2054 *p++ = (unsigned char)*s++;
2055 return (PyObject *)v;
2056
2057 onError:
2058 Py_XDECREF(v);
2059 return NULL;
2060}
2061
2062static
2063int latin1_encoding_error(const Py_UNICODE **source,
2064 char **dest,
2065 const char *errors,
2066 const char *details)
2067{
2068 if ((errors == NULL) ||
2069 (strcmp(errors,"strict") == 0)) {
2070 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002071 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 details);
2073 return -1;
2074 }
2075 else if (strcmp(errors,"ignore") == 0) {
2076 return 0;
2077 }
2078 else if (strcmp(errors,"replace") == 0) {
2079 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002080 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return 0;
2082 }
2083 else {
2084 PyErr_Format(PyExc_ValueError,
2085 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002086 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 errors);
2088 return -1;
2089 }
2090}
2091
2092PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2093 int size,
2094 const char *errors)
2095{
2096 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002097 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 repr = PyString_FromStringAndSize(NULL, size);
2100 if (repr == NULL)
2101 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002102 if (size == 0)
2103 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
2105 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002106 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 while (size-- > 0) {
2108 Py_UNICODE ch = *p++;
2109 if (ch >= 256) {
2110 if (latin1_encoding_error(&p, &s, errors,
2111 "ordinal not in range(256)"))
2112 goto onError;
2113 }
2114 else
2115 *s++ = (char)ch;
2116 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002117 /* Resize if error handling skipped some characters */
2118 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002119 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 return repr;
2121
2122 onError:
2123 Py_DECREF(repr);
2124 return NULL;
2125}
2126
2127PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2128{
2129 if (!PyUnicode_Check(unicode)) {
2130 PyErr_BadArgument();
2131 return NULL;
2132 }
2133 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2134 PyUnicode_GET_SIZE(unicode),
2135 NULL);
2136}
2137
2138/* --- 7-bit ASCII Codec -------------------------------------------------- */
2139
2140static
2141int ascii_decoding_error(const char **source,
2142 Py_UNICODE **dest,
2143 const char *errors,
2144 const char *details)
2145{
2146 if ((errors == NULL) ||
2147 (strcmp(errors,"strict") == 0)) {
2148 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002149 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 details);
2151 return -1;
2152 }
2153 else if (strcmp(errors,"ignore") == 0) {
2154 return 0;
2155 }
2156 else if (strcmp(errors,"replace") == 0) {
2157 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2158 (*dest)++;
2159 return 0;
2160 }
2161 else {
2162 PyErr_Format(PyExc_ValueError,
2163 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 errors);
2166 return -1;
2167 }
2168}
2169
2170PyObject *PyUnicode_DecodeASCII(const char *s,
2171 int size,
2172 const char *errors)
2173{
2174 PyUnicodeObject *v;
2175 Py_UNICODE *p;
2176
2177 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002178 if (size == 1 && *(unsigned char*)s < 128) {
2179 Py_UNICODE r = *(unsigned char*)s;
2180 return PyUnicode_FromUnicode(&r, 1);
2181 }
2182
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 v = _PyUnicode_New(size);
2184 if (v == NULL)
2185 goto onError;
2186 if (size == 0)
2187 return (PyObject *)v;
2188 p = PyUnicode_AS_UNICODE(v);
2189 while (size-- > 0) {
2190 register unsigned char c;
2191
2192 c = (unsigned char)*s++;
2193 if (c < 128)
2194 *p++ = c;
2195 else if (ascii_decoding_error(&s, &p, errors,
2196 "ordinal not in range(128)"))
2197 goto onError;
2198 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002199 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 return (PyObject *)v;
2203
2204 onError:
2205 Py_XDECREF(v);
2206 return NULL;
2207}
2208
2209static
2210int ascii_encoding_error(const Py_UNICODE **source,
2211 char **dest,
2212 const char *errors,
2213 const char *details)
2214{
2215 if ((errors == NULL) ||
2216 (strcmp(errors,"strict") == 0)) {
2217 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002218 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 details);
2220 return -1;
2221 }
2222 else if (strcmp(errors,"ignore") == 0) {
2223 return 0;
2224 }
2225 else if (strcmp(errors,"replace") == 0) {
2226 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002227 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return 0;
2229 }
2230 else {
2231 PyErr_Format(PyExc_ValueError,
2232 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002233 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 errors);
2235 return -1;
2236 }
2237}
2238
2239PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2240 int size,
2241 const char *errors)
2242{
2243 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002244 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002245
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 repr = PyString_FromStringAndSize(NULL, size);
2247 if (repr == NULL)
2248 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002249 if (size == 0)
2250 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251
2252 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002253 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 while (size-- > 0) {
2255 Py_UNICODE ch = *p++;
2256 if (ch >= 128) {
2257 if (ascii_encoding_error(&p, &s, errors,
2258 "ordinal not in range(128)"))
2259 goto onError;
2260 }
2261 else
2262 *s++ = (char)ch;
2263 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002264 /* Resize if error handling skipped some characters */
2265 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002266 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 return repr;
2268
2269 onError:
2270 Py_DECREF(repr);
2271 return NULL;
2272}
2273
2274PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode),
2282 NULL);
2283}
2284
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002285#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002286
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002287/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002288
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002289PyObject *PyUnicode_DecodeMBCS(const char *s,
2290 int size,
2291 const char *errors)
2292{
2293 PyUnicodeObject *v;
2294 Py_UNICODE *p;
2295
2296 /* First get the size of the result */
2297 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002298 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002299 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2300
2301 v = _PyUnicode_New(usize);
2302 if (v == NULL)
2303 return NULL;
2304 if (usize == 0)
2305 return (PyObject *)v;
2306 p = PyUnicode_AS_UNICODE(v);
2307 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2308 Py_DECREF(v);
2309 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2310 }
2311
2312 return (PyObject *)v;
2313}
2314
2315PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2316 int size,
2317 const char *errors)
2318{
2319 PyObject *repr;
2320 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002321 DWORD mbcssize;
2322
2323 /* If there are no characters, bail now! */
2324 if (size==0)
2325 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002326
2327 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002328 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002329 if (mbcssize==0)
2330 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2331
2332 repr = PyString_FromStringAndSize(NULL, mbcssize);
2333 if (repr == NULL)
2334 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002335 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002336 return repr;
2337
2338 /* Do the conversion */
2339 s = PyString_AS_STRING(repr);
2340 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2341 Py_DECREF(repr);
2342 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2343 }
2344 return repr;
2345}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002346
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002347#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349/* --- Character Mapping Codec -------------------------------------------- */
2350
2351static
2352int charmap_decoding_error(const char **source,
2353 Py_UNICODE **dest,
2354 const char *errors,
2355 const char *details)
2356{
2357 if ((errors == NULL) ||
2358 (strcmp(errors,"strict") == 0)) {
2359 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002360 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361 details);
2362 return -1;
2363 }
2364 else if (strcmp(errors,"ignore") == 0) {
2365 return 0;
2366 }
2367 else if (strcmp(errors,"replace") == 0) {
2368 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2369 (*dest)++;
2370 return 0;
2371 }
2372 else {
2373 PyErr_Format(PyExc_ValueError,
2374 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002375 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 errors);
2377 return -1;
2378 }
2379}
2380
2381PyObject *PyUnicode_DecodeCharmap(const char *s,
2382 int size,
2383 PyObject *mapping,
2384 const char *errors)
2385{
2386 PyUnicodeObject *v;
2387 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002388 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
2390 /* Default to Latin-1 */
2391 if (mapping == NULL)
2392 return PyUnicode_DecodeLatin1(s, size, errors);
2393
2394 v = _PyUnicode_New(size);
2395 if (v == NULL)
2396 goto onError;
2397 if (size == 0)
2398 return (PyObject *)v;
2399 p = PyUnicode_AS_UNICODE(v);
2400 while (size-- > 0) {
2401 unsigned char ch = *s++;
2402 PyObject *w, *x;
2403
2404 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2405 w = PyInt_FromLong((long)ch);
2406 if (w == NULL)
2407 goto onError;
2408 x = PyObject_GetItem(mapping, w);
2409 Py_DECREF(w);
2410 if (x == NULL) {
2411 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002412 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002414 x = Py_None;
2415 Py_INCREF(x);
2416 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
2419
2420 /* Apply mapping */
2421 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002422 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 if (value < 0 || value > 65535) {
2424 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002425 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 Py_DECREF(x);
2427 goto onError;
2428 }
2429 *p++ = (Py_UNICODE)value;
2430 }
2431 else if (x == Py_None) {
2432 /* undefined mapping */
2433 if (charmap_decoding_error(&s, &p, errors,
2434 "character maps to <undefined>")) {
2435 Py_DECREF(x);
2436 goto onError;
2437 }
2438 }
2439 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002440 int targetsize = PyUnicode_GET_SIZE(x);
2441
2442 if (targetsize == 1)
2443 /* 1-1 mapping */
2444 *p++ = *PyUnicode_AS_UNICODE(x);
2445
2446 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002448 if (targetsize > extrachars) {
2449 /* resize first */
2450 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2451 int needed = (targetsize - extrachars) + \
2452 (targetsize << 2);
2453 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002454 if (_PyUnicode_Resize(&v,
2455 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002456 Py_DECREF(x);
2457 goto onError;
2458 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002459 p = PyUnicode_AS_UNICODE(v) + oldpos;
2460 }
2461 Py_UNICODE_COPY(p,
2462 PyUnicode_AS_UNICODE(x),
2463 targetsize);
2464 p += targetsize;
2465 extrachars -= targetsize;
2466 }
2467 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
2469 else {
2470 /* wrong return value */
2471 PyErr_SetString(PyExc_TypeError,
2472 "character mapping must return integer, None or unicode");
2473 Py_DECREF(x);
2474 goto onError;
2475 }
2476 Py_DECREF(x);
2477 }
2478 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002479 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 goto onError;
2481 return (PyObject *)v;
2482
2483 onError:
2484 Py_XDECREF(v);
2485 return NULL;
2486}
2487
2488static
2489int charmap_encoding_error(const Py_UNICODE **source,
2490 char **dest,
2491 const char *errors,
2492 const char *details)
2493{
2494 if ((errors == NULL) ||
2495 (strcmp(errors,"strict") == 0)) {
2496 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002497 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 details);
2499 return -1;
2500 }
2501 else if (strcmp(errors,"ignore") == 0) {
2502 return 0;
2503 }
2504 else if (strcmp(errors,"replace") == 0) {
2505 **dest = '?';
2506 (*dest)++;
2507 return 0;
2508 }
2509 else {
2510 PyErr_Format(PyExc_ValueError,
2511 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002512 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 errors);
2514 return -1;
2515 }
2516}
2517
2518PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2519 int size,
2520 PyObject *mapping,
2521 const char *errors)
2522{
2523 PyObject *v;
2524 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002525 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526
2527 /* Default to Latin-1 */
2528 if (mapping == NULL)
2529 return PyUnicode_EncodeLatin1(p, size, errors);
2530
2531 v = PyString_FromStringAndSize(NULL, size);
2532 if (v == NULL)
2533 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002534 if (size == 0)
2535 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 s = PyString_AS_STRING(v);
2537 while (size-- > 0) {
2538 Py_UNICODE ch = *p++;
2539 PyObject *w, *x;
2540
2541 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2542 w = PyInt_FromLong((long)ch);
2543 if (w == NULL)
2544 goto onError;
2545 x = PyObject_GetItem(mapping, w);
2546 Py_DECREF(w);
2547 if (x == NULL) {
2548 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002549 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002551 x = Py_None;
2552 Py_INCREF(x);
2553 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002554 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 }
2556
2557 /* Apply mapping */
2558 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002559 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 if (value < 0 || value > 255) {
2561 PyErr_SetString(PyExc_TypeError,
2562 "character mapping must be in range(256)");
2563 Py_DECREF(x);
2564 goto onError;
2565 }
2566 *s++ = (char)value;
2567 }
2568 else if (x == Py_None) {
2569 /* undefined mapping */
2570 if (charmap_encoding_error(&p, &s, errors,
2571 "character maps to <undefined>")) {
2572 Py_DECREF(x);
2573 goto onError;
2574 }
2575 }
2576 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002577 int targetsize = PyString_GET_SIZE(x);
2578
2579 if (targetsize == 1)
2580 /* 1-1 mapping */
2581 *s++ = *PyString_AS_STRING(x);
2582
2583 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002585 if (targetsize > extrachars) {
2586 /* resize first */
2587 int oldpos = (int)(s - PyString_AS_STRING(v));
2588 int needed = (targetsize - extrachars) + \
2589 (targetsize << 2);
2590 extrachars += needed;
2591 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002592 Py_DECREF(x);
2593 goto onError;
2594 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002595 s = PyString_AS_STRING(v) + oldpos;
2596 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002597 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002598 s += targetsize;
2599 extrachars -= targetsize;
2600 }
2601 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
2603 else {
2604 /* wrong return value */
2605 PyErr_SetString(PyExc_TypeError,
2606 "character mapping must return integer, None or unicode");
2607 Py_DECREF(x);
2608 goto onError;
2609 }
2610 Py_DECREF(x);
2611 }
2612 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
Tim Peters5de98422002-04-27 18:44:32 +00002613 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 return v;
2615
2616 onError:
Tim Peters5de98422002-04-27 18:44:32 +00002617 Py_XDECREF(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 return NULL;
2619}
2620
2621PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2622 PyObject *mapping)
2623{
2624 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2625 PyErr_BadArgument();
2626 return NULL;
2627 }
2628 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2629 PyUnicode_GET_SIZE(unicode),
2630 mapping,
2631 NULL);
2632}
2633
2634static
2635int translate_error(const Py_UNICODE **source,
2636 Py_UNICODE **dest,
2637 const char *errors,
2638 const char *details)
2639{
2640 if ((errors == NULL) ||
2641 (strcmp(errors,"strict") == 0)) {
2642 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002643 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 details);
2645 return -1;
2646 }
2647 else if (strcmp(errors,"ignore") == 0) {
2648 return 0;
2649 }
2650 else if (strcmp(errors,"replace") == 0) {
2651 **dest = '?';
2652 (*dest)++;
2653 return 0;
2654 }
2655 else {
2656 PyErr_Format(PyExc_ValueError,
2657 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002658 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 errors);
2660 return -1;
2661 }
2662}
2663
2664PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2665 int size,
2666 PyObject *mapping,
2667 const char *errors)
2668{
2669 PyUnicodeObject *v;
2670 Py_UNICODE *p;
2671
2672 if (mapping == NULL) {
2673 PyErr_BadArgument();
2674 return NULL;
2675 }
2676
2677 /* Output will never be longer than input */
2678 v = _PyUnicode_New(size);
2679 if (v == NULL)
2680 goto onError;
2681 if (size == 0)
2682 goto done;
2683 p = PyUnicode_AS_UNICODE(v);
2684 while (size-- > 0) {
2685 Py_UNICODE ch = *s++;
2686 PyObject *w, *x;
2687
2688 /* Get mapping */
2689 w = PyInt_FromLong(ch);
2690 if (w == NULL)
2691 goto onError;
2692 x = PyObject_GetItem(mapping, w);
2693 Py_DECREF(w);
2694 if (x == NULL) {
2695 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2696 /* No mapping found: default to 1-1 mapping */
2697 PyErr_Clear();
2698 *p++ = ch;
2699 continue;
2700 }
2701 goto onError;
2702 }
2703
2704 /* Apply mapping */
2705 if (PyInt_Check(x))
2706 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2707 else if (x == Py_None) {
2708 /* undefined mapping */
2709 if (translate_error(&s, &p, errors,
2710 "character maps to <undefined>")) {
2711 Py_DECREF(x);
2712 goto onError;
2713 }
2714 }
2715 else if (PyUnicode_Check(x)) {
2716 if (PyUnicode_GET_SIZE(x) != 1) {
2717 /* 1-n mapping */
2718 PyErr_SetString(PyExc_NotImplementedError,
2719 "1-n mappings are currently not implemented");
2720 Py_DECREF(x);
2721 goto onError;
2722 }
2723 *p++ = *PyUnicode_AS_UNICODE(x);
2724 }
2725 else {
2726 /* wrong return value */
2727 PyErr_SetString(PyExc_TypeError,
2728 "translate mapping must return integer, None or unicode");
2729 Py_DECREF(x);
2730 goto onError;
2731 }
2732 Py_DECREF(x);
2733 }
2734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002735 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
2738 done:
2739 return (PyObject *)v;
2740
2741 onError:
2742 Py_XDECREF(v);
2743 return NULL;
2744}
2745
2746PyObject *PyUnicode_Translate(PyObject *str,
2747 PyObject *mapping,
2748 const char *errors)
2749{
2750 PyObject *result;
2751
2752 str = PyUnicode_FromObject(str);
2753 if (str == NULL)
2754 goto onError;
2755 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2756 PyUnicode_GET_SIZE(str),
2757 mapping,
2758 errors);
2759 Py_DECREF(str);
2760 return result;
2761
2762 onError:
2763 Py_XDECREF(str);
2764 return NULL;
2765}
2766
Guido van Rossum9e896b32000-04-05 20:11:21 +00002767/* --- Decimal Encoder ---------------------------------------------------- */
2768
2769int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2770 int length,
2771 char *output,
2772 const char *errors)
2773{
2774 Py_UNICODE *p, *end;
2775
2776 if (output == NULL) {
2777 PyErr_BadArgument();
2778 return -1;
2779 }
2780
2781 p = s;
2782 end = s + length;
2783 while (p < end) {
2784 register Py_UNICODE ch = *p++;
2785 int decimal;
2786
2787 if (Py_UNICODE_ISSPACE(ch)) {
2788 *output++ = ' ';
2789 continue;
2790 }
2791 decimal = Py_UNICODE_TODECIMAL(ch);
2792 if (decimal >= 0) {
2793 *output++ = '0' + decimal;
2794 continue;
2795 }
Guido van Rossumba477042000-04-06 18:18:10 +00002796 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002797 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002798 continue;
2799 }
2800 /* All other characters are considered invalid */
2801 if (errors == NULL || strcmp(errors, "strict") == 0) {
2802 PyErr_SetString(PyExc_ValueError,
2803 "invalid decimal Unicode string");
2804 goto onError;
2805 }
2806 else if (strcmp(errors, "ignore") == 0)
2807 continue;
2808 else if (strcmp(errors, "replace") == 0) {
2809 *output++ = '?';
2810 continue;
2811 }
2812 }
2813 /* 0-terminate the output string */
2814 *output++ = '\0';
2815 return 0;
2816
2817 onError:
2818 return -1;
2819}
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821/* --- Helpers ------------------------------------------------------------ */
2822
2823static
2824int count(PyUnicodeObject *self,
2825 int start,
2826 int end,
2827 PyUnicodeObject *substring)
2828{
2829 int count = 0;
2830
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002831 if (start < 0)
2832 start += self->length;
2833 if (start < 0)
2834 start = 0;
2835 if (end > self->length)
2836 end = self->length;
2837 if (end < 0)
2838 end += self->length;
2839 if (end < 0)
2840 end = 0;
2841
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002842 if (substring->length == 0)
2843 return (end - start + 1);
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 end -= substring->length;
2846
2847 while (start <= end)
2848 if (Py_UNICODE_MATCH(self, start, substring)) {
2849 count++;
2850 start += substring->length;
2851 } else
2852 start++;
2853
2854 return count;
2855}
2856
2857int PyUnicode_Count(PyObject *str,
2858 PyObject *substr,
2859 int start,
2860 int end)
2861{
2862 int result;
2863
2864 str = PyUnicode_FromObject(str);
2865 if (str == NULL)
2866 return -1;
2867 substr = PyUnicode_FromObject(substr);
2868 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002869 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 return -1;
2871 }
2872
2873 result = count((PyUnicodeObject *)str,
2874 start, end,
2875 (PyUnicodeObject *)substr);
2876
2877 Py_DECREF(str);
2878 Py_DECREF(substr);
2879 return result;
2880}
2881
2882static
2883int findstring(PyUnicodeObject *self,
2884 PyUnicodeObject *substring,
2885 int start,
2886 int end,
2887 int direction)
2888{
2889 if (start < 0)
2890 start += self->length;
2891 if (start < 0)
2892 start = 0;
2893
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 if (end > self->length)
2895 end = self->length;
2896 if (end < 0)
2897 end += self->length;
2898 if (end < 0)
2899 end = 0;
2900
Guido van Rossum76afbd92002-08-20 17:29:29 +00002901 if (substring->length == 0)
2902 return (direction > 0) ? start : end;
2903
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 end -= substring->length;
2905
2906 if (direction < 0) {
2907 for (; end >= start; end--)
2908 if (Py_UNICODE_MATCH(self, end, substring))
2909 return end;
2910 } else {
2911 for (; start <= end; start++)
2912 if (Py_UNICODE_MATCH(self, start, substring))
2913 return start;
2914 }
2915
2916 return -1;
2917}
2918
2919int PyUnicode_Find(PyObject *str,
2920 PyObject *substr,
2921 int start,
2922 int end,
2923 int direction)
2924{
2925 int result;
2926
2927 str = PyUnicode_FromObject(str);
2928 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002929 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 substr = PyUnicode_FromObject(substr);
2931 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00002932 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002933 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 }
2935
2936 result = findstring((PyUnicodeObject *)str,
2937 (PyUnicodeObject *)substr,
2938 start, end, direction);
2939 Py_DECREF(str);
2940 Py_DECREF(substr);
2941 return result;
2942}
2943
2944static
2945int tailmatch(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int start,
2948 int end,
2949 int direction)
2950{
2951 if (start < 0)
2952 start += self->length;
2953 if (start < 0)
2954 start = 0;
2955
2956 if (substring->length == 0)
2957 return 1;
2958
2959 if (end > self->length)
2960 end = self->length;
2961 if (end < 0)
2962 end += self->length;
2963 if (end < 0)
2964 end = 0;
2965
2966 end -= substring->length;
2967 if (end < start)
2968 return 0;
2969
2970 if (direction > 0) {
2971 if (Py_UNICODE_MATCH(self, end, substring))
2972 return 1;
2973 } else {
2974 if (Py_UNICODE_MATCH(self, start, substring))
2975 return 1;
2976 }
2977
2978 return 0;
2979}
2980
2981int PyUnicode_Tailmatch(PyObject *str,
2982 PyObject *substr,
2983 int start,
2984 int end,
2985 int direction)
2986{
2987 int result;
2988
2989 str = PyUnicode_FromObject(str);
2990 if (str == NULL)
2991 return -1;
2992 substr = PyUnicode_FromObject(substr);
2993 if (substr == NULL) {
2994 Py_DECREF(substr);
2995 return -1;
2996 }
2997
2998 result = tailmatch((PyUnicodeObject *)str,
2999 (PyUnicodeObject *)substr,
3000 start, end, direction);
3001 Py_DECREF(str);
3002 Py_DECREF(substr);
3003 return result;
3004}
3005
3006static
3007const Py_UNICODE *findchar(const Py_UNICODE *s,
3008 int size,
3009 Py_UNICODE ch)
3010{
3011 /* like wcschr, but doesn't stop at NULL characters */
3012
3013 while (size-- > 0) {
3014 if (*s == ch)
3015 return s;
3016 s++;
3017 }
3018
3019 return NULL;
3020}
3021
3022/* Apply fixfct filter to the Unicode object self and return a
3023 reference to the modified object */
3024
3025static
3026PyObject *fixup(PyUnicodeObject *self,
3027 int (*fixfct)(PyUnicodeObject *s))
3028{
3029
3030 PyUnicodeObject *u;
3031
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003032 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 if (u == NULL)
3034 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003035
3036 Py_UNICODE_COPY(u->str, self->str, self->length);
3037
Tim Peters7a29bd52001-09-12 03:03:31 +00003038 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 /* fixfct should return TRUE if it modified the buffer. If
3040 FALSE, return a reference to the original buffer instead
3041 (to save space, not time) */
3042 Py_INCREF(self);
3043 Py_DECREF(u);
3044 return (PyObject*) self;
3045 }
3046 return (PyObject*) u;
3047}
3048
3049static
3050int fixupper(PyUnicodeObject *self)
3051{
3052 int len = self->length;
3053 Py_UNICODE *s = self->str;
3054 int status = 0;
3055
3056 while (len-- > 0) {
3057 register Py_UNICODE ch;
3058
3059 ch = Py_UNICODE_TOUPPER(*s);
3060 if (ch != *s) {
3061 status = 1;
3062 *s = ch;
3063 }
3064 s++;
3065 }
3066
3067 return status;
3068}
3069
3070static
3071int fixlower(PyUnicodeObject *self)
3072{
3073 int len = self->length;
3074 Py_UNICODE *s = self->str;
3075 int status = 0;
3076
3077 while (len-- > 0) {
3078 register Py_UNICODE ch;
3079
3080 ch = Py_UNICODE_TOLOWER(*s);
3081 if (ch != *s) {
3082 status = 1;
3083 *s = ch;
3084 }
3085 s++;
3086 }
3087
3088 return status;
3089}
3090
3091static
3092int fixswapcase(PyUnicodeObject *self)
3093{
3094 int len = self->length;
3095 Py_UNICODE *s = self->str;
3096 int status = 0;
3097
3098 while (len-- > 0) {
3099 if (Py_UNICODE_ISUPPER(*s)) {
3100 *s = Py_UNICODE_TOLOWER(*s);
3101 status = 1;
3102 } else if (Py_UNICODE_ISLOWER(*s)) {
3103 *s = Py_UNICODE_TOUPPER(*s);
3104 status = 1;
3105 }
3106 s++;
3107 }
3108
3109 return status;
3110}
3111
3112static
3113int fixcapitalize(PyUnicodeObject *self)
3114{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003115 int len = self->length;
3116 Py_UNICODE *s = self->str;
3117 int status = 0;
3118
3119 if (len == 0)
3120 return 0;
3121 if (Py_UNICODE_ISLOWER(*s)) {
3122 *s = Py_UNICODE_TOUPPER(*s);
3123 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003125 s++;
3126 while (--len > 0) {
3127 if (Py_UNICODE_ISUPPER(*s)) {
3128 *s = Py_UNICODE_TOLOWER(*s);
3129 status = 1;
3130 }
3131 s++;
3132 }
3133 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134}
3135
3136static
3137int fixtitle(PyUnicodeObject *self)
3138{
3139 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3140 register Py_UNICODE *e;
3141 int previous_is_cased;
3142
3143 /* Shortcut for single character strings */
3144 if (PyUnicode_GET_SIZE(self) == 1) {
3145 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3146 if (*p != ch) {
3147 *p = ch;
3148 return 1;
3149 }
3150 else
3151 return 0;
3152 }
3153
3154 e = p + PyUnicode_GET_SIZE(self);
3155 previous_is_cased = 0;
3156 for (; p < e; p++) {
3157 register const Py_UNICODE ch = *p;
3158
3159 if (previous_is_cased)
3160 *p = Py_UNICODE_TOLOWER(ch);
3161 else
3162 *p = Py_UNICODE_TOTITLE(ch);
3163
3164 if (Py_UNICODE_ISLOWER(ch) ||
3165 Py_UNICODE_ISUPPER(ch) ||
3166 Py_UNICODE_ISTITLE(ch))
3167 previous_is_cased = 1;
3168 else
3169 previous_is_cased = 0;
3170 }
3171 return 1;
3172}
3173
3174PyObject *PyUnicode_Join(PyObject *separator,
3175 PyObject *seq)
3176{
3177 Py_UNICODE *sep;
3178 int seplen;
3179 PyUnicodeObject *res = NULL;
3180 int reslen = 0;
3181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 int sz = 100;
3183 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003184 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
Tim Peters2cfe3682001-05-05 05:36:48 +00003186 it = PyObject_GetIter(seq);
3187 if (it == NULL)
3188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189
3190 if (separator == NULL) {
3191 Py_UNICODE blank = ' ';
3192 sep = &blank;
3193 seplen = 1;
3194 }
3195 else {
3196 separator = PyUnicode_FromObject(separator);
3197 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 sep = PyUnicode_AS_UNICODE(separator);
3200 seplen = PyUnicode_GET_SIZE(separator);
3201 }
3202
3203 res = _PyUnicode_New(sz);
3204 if (res == NULL)
3205 goto onError;
3206 p = PyUnicode_AS_UNICODE(res);
3207 reslen = 0;
3208
Tim Peters2cfe3682001-05-05 05:36:48 +00003209 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003211 PyObject *item = PyIter_Next(it);
3212 if (item == NULL) {
3213 if (PyErr_Occurred())
3214 goto onError;
3215 break;
3216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 if (!PyUnicode_Check(item)) {
3218 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003219 if (!PyString_Check(item)) {
3220 PyErr_Format(PyExc_TypeError,
3221 "sequence item %i: expected string or Unicode,"
3222 " %.80s found",
3223 i, item->ob_type->tp_name);
3224 Py_DECREF(item);
3225 goto onError;
3226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 v = PyUnicode_FromObject(item);
3228 Py_DECREF(item);
3229 item = v;
3230 if (item == NULL)
3231 goto onError;
3232 }
3233 itemlen = PyUnicode_GET_SIZE(item);
3234 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003235 if (_PyUnicode_Resize(&res, sz*2)) {
3236 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 sz *= 2;
3240 p = PyUnicode_AS_UNICODE(res) + reslen;
3241 }
3242 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003243 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 p += seplen;
3245 reslen += seplen;
3246 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003247 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 p += itemlen;
3249 reslen += itemlen;
3250 Py_DECREF(item);
3251 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003252 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 goto onError;
3254
3255 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003256 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 return (PyObject *)res;
3258
3259 onError:
3260 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003261 Py_XDECREF(res);
3262 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 return NULL;
3264}
3265
3266static
3267PyUnicodeObject *pad(PyUnicodeObject *self,
3268 int left,
3269 int right,
3270 Py_UNICODE fill)
3271{
3272 PyUnicodeObject *u;
3273
3274 if (left < 0)
3275 left = 0;
3276 if (right < 0)
3277 right = 0;
3278
Tim Peters7a29bd52001-09-12 03:03:31 +00003279 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 Py_INCREF(self);
3281 return self;
3282 }
3283
3284 u = _PyUnicode_New(left + self->length + right);
3285 if (u) {
3286 if (left)
3287 Py_UNICODE_FILL(u->str, fill, left);
3288 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3289 if (right)
3290 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3291 }
3292
3293 return u;
3294}
3295
/* Append the slice data[left:right] as a new Unicode object to `list`.

   The expansion relies on the enclosing function providing:
     - `str`  : a PyObject * scratch variable
     - `list` : the result list
     - an `onError` label, jumped to when the slice cannot be created or
       appended.

   The body is wrapped in do { } while (0) so that the macro behaves as
   a single statement, and the arguments are parenthesized so that
   arbitrary expressions can be passed.  Use it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3306
3307static
3308PyObject *split_whitespace(PyUnicodeObject *self,
3309 PyObject *list,
3310 int maxcount)
3311{
3312 register int i;
3313 register int j;
3314 int len = self->length;
3315 PyObject *str;
3316
3317 for (i = j = 0; i < len; ) {
3318 /* find a token */
3319 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3320 i++;
3321 j = i;
3322 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3323 i++;
3324 if (j < i) {
3325 if (maxcount-- <= 0)
3326 break;
3327 SPLIT_APPEND(self->str, j, i);
3328 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3329 i++;
3330 j = i;
3331 }
3332 }
3333 if (j < len) {
3334 SPLIT_APPEND(self->str, j, len);
3335 }
3336 return list;
3337
3338 onError:
3339 Py_DECREF(list);
3340 return NULL;
3341}
3342
3343PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003344 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345{
3346 register int i;
3347 register int j;
3348 int len;
3349 PyObject *list;
3350 PyObject *str;
3351 Py_UNICODE *data;
3352
3353 string = PyUnicode_FromObject(string);
3354 if (string == NULL)
3355 return NULL;
3356 data = PyUnicode_AS_UNICODE(string);
3357 len = PyUnicode_GET_SIZE(string);
3358
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 list = PyList_New(0);
3360 if (!list)
3361 goto onError;
3362
3363 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003364 int eol;
3365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 /* Find a line and append it */
3367 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3368 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369
3370 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003371 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 if (i < len) {
3373 if (data[i] == '\r' && i + 1 < len &&
3374 data[i+1] == '\n')
3375 i += 2;
3376 else
3377 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003378 if (keepends)
3379 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
Guido van Rossum86662912000-04-11 15:38:46 +00003381 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 j = i;
3383 }
3384 if (j < len) {
3385 SPLIT_APPEND(data, j, len);
3386 }
3387
3388 Py_DECREF(string);
3389 return list;
3390
3391 onError:
3392 Py_DECREF(list);
3393 Py_DECREF(string);
3394 return NULL;
3395}
3396
3397static
3398PyObject *split_char(PyUnicodeObject *self,
3399 PyObject *list,
3400 Py_UNICODE ch,
3401 int maxcount)
3402{
3403 register int i;
3404 register int j;
3405 int len = self->length;
3406 PyObject *str;
3407
3408 for (i = j = 0; i < len; ) {
3409 if (self->str[i] == ch) {
3410 if (maxcount-- <= 0)
3411 break;
3412 SPLIT_APPEND(self->str, j, i);
3413 i = j = i + 1;
3414 } else
3415 i++;
3416 }
3417 if (j <= len) {
3418 SPLIT_APPEND(self->str, j, len);
3419 }
3420 return list;
3421
3422 onError:
3423 Py_DECREF(list);
3424 return NULL;
3425}
3426
3427static
3428PyObject *split_substring(PyUnicodeObject *self,
3429 PyObject *list,
3430 PyUnicodeObject *substring,
3431 int maxcount)
3432{
3433 register int i;
3434 register int j;
3435 int len = self->length;
3436 int sublen = substring->length;
3437 PyObject *str;
3438
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003439 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 if (Py_UNICODE_MATCH(self, i, substring)) {
3441 if (maxcount-- <= 0)
3442 break;
3443 SPLIT_APPEND(self->str, j, i);
3444 i = j = i + sublen;
3445 } else
3446 i++;
3447 }
3448 if (j <= len) {
3449 SPLIT_APPEND(self->str, j, len);
3450 }
3451 return list;
3452
3453 onError:
3454 Py_DECREF(list);
3455 return NULL;
3456}
3457
3458#undef SPLIT_APPEND
3459
3460static
3461PyObject *split(PyUnicodeObject *self,
3462 PyUnicodeObject *substring,
3463 int maxcount)
3464{
3465 PyObject *list;
3466
3467 if (maxcount < 0)
3468 maxcount = INT_MAX;
3469
3470 list = PyList_New(0);
3471 if (!list)
3472 return NULL;
3473
3474 if (substring == NULL)
3475 return split_whitespace(self,list,maxcount);
3476
3477 else if (substring->length == 1)
3478 return split_char(self,list,substring->str[0],maxcount);
3479
3480 else if (substring->length == 0) {
3481 Py_DECREF(list);
3482 PyErr_SetString(PyExc_ValueError, "empty separator");
3483 return NULL;
3484 }
3485 else
3486 return split_substring(self,list,substring,maxcount);
3487}
3488
3489static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490PyObject *replace(PyUnicodeObject *self,
3491 PyUnicodeObject *str1,
3492 PyUnicodeObject *str2,
3493 int maxcount)
3494{
3495 PyUnicodeObject *u;
3496
3497 if (maxcount < 0)
3498 maxcount = INT_MAX;
3499
3500 if (str1->length == 1 && str2->length == 1) {
3501 int i;
3502
3503 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003504 if (!findchar(self->str, self->length, str1->str[0]) &&
3505 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 /* nothing to replace, return original string */
3507 Py_INCREF(self);
3508 u = self;
3509 } else {
3510 Py_UNICODE u1 = str1->str[0];
3511 Py_UNICODE u2 = str2->str[0];
3512
3513 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003514 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 self->length
3516 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003517 if (u != NULL) {
3518 Py_UNICODE_COPY(u->str, self->str,
3519 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 for (i = 0; i < u->length; i++)
3521 if (u->str[i] == u1) {
3522 if (--maxcount < 0)
3523 break;
3524 u->str[i] = u2;
3525 }
3526 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528
3529 } else {
3530 int n, i;
3531 Py_UNICODE *p;
3532
3533 /* replace strings */
3534 n = count(self, 0, self->length, str1);
3535 if (n > maxcount)
3536 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003537 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 /* nothing to replace, return original string */
3539 Py_INCREF(self);
3540 u = self;
3541 } else {
3542 u = _PyUnicode_New(
3543 self->length + n * (str2->length - str1->length));
3544 if (u) {
3545 i = 0;
3546 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00003547 if (str1->length > 0) {
3548 while (i <= self->length - str1->length)
3549 if (Py_UNICODE_MATCH(self, i, str1)) {
3550 /* replace string segment */
3551 Py_UNICODE_COPY(p, str2->str, str2->length);
3552 p += str2->length;
3553 i += str1->length;
3554 if (--n <= 0) {
3555 /* copy remaining part */
3556 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3557 break;
3558 }
3559 } else
3560 *p++ = self->str[i++];
3561 } else {
3562 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 Py_UNICODE_COPY(p, str2->str, str2->length);
3564 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00003565 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00003568 }
3569 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 }
3572 }
3573 }
3574
3575 return (PyObject *) u;
3576}
3577
3578/* --- Unicode Object Methods --------------------------------------------- */
3579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003580PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581"S.title() -> unicode\n\
3582\n\
3583Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003584characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585
3586static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003587unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 return fixup(self, fixtitle);
3590}
3591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003592PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593"S.capitalize() -> unicode\n\
3594\n\
3595Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003596have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
3598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003599unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 return fixup(self, fixcapitalize);
3602}
3603
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split at whitespace,
   capitalize every word and join the words with single blanks. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
3641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003642PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643"S.center(width) -> unicode\n\
3644\n\
3645Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003646using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647
3648static PyObject *
3649unicode_center(PyUnicodeObject *self, PyObject *args)
3650{
3651 int marg, left;
3652 int width;
3653
3654 if (!PyArg_ParseTuple(args, "i:center", &width))
3655 return NULL;
3656
Tim Peters7a29bd52001-09-12 03:03:31 +00003657 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 Py_INCREF(self);
3659 return (PyObject*) self;
3660 }
3661
3662 marg = width - self->length;
3663 left = marg / 2 + (marg & width & 1);
3664
3665 return (PyObject*) pad(self, left, marg - left, ' ');
3666}
3667
Marc-André Lemburge5034372000-08-08 08:04:29 +00003668#if 0
3669
3670/* This code should go into some future Unicode collation support
3671 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003672 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003673
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003674/* speedy UTF-16 code point order comparison */
3675/* gleaned from: */
3676/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3677
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003678static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003679{
3680 0, 0, 0, 0, 0, 0, 0, 0,
3681 0, 0, 0, 0, 0, 0, 0, 0,
3682 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003683 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003684};
3685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686static int
3687unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3688{
3689 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003690
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 Py_UNICODE *s1 = str1->str;
3692 Py_UNICODE *s2 = str2->str;
3693
3694 len1 = str1->length;
3695 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003698 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003699
3700 c1 = *s1++;
3701 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003702
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003703 if (c1 > (1<<11) * 26)
3704 c1 += utf16Fixup[c1>>11];
3705 if (c2 > (1<<11) * 26)
3706 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003707 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003708
3709 if (c1 != c2)
3710 return (c1 < c2) ? -1 : 1;
3711
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003712 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 }
3714
3715 return (len1 < len2) ? -1 : (len1 != len2);
3716}
3717
Marc-André Lemburge5034372000-08-08 08:04:29 +00003718#else
3719
3720static int
3721unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3722{
3723 register int len1, len2;
3724
3725 Py_UNICODE *s1 = str1->str;
3726 Py_UNICODE *s2 = str2->str;
3727
3728 len1 = str1->length;
3729 len2 = str2->length;
3730
3731 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003732 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003733
Fredrik Lundh45714e92001-06-26 16:39:36 +00003734 c1 = *s1++;
3735 c2 = *s2++;
3736
3737 if (c1 != c2)
3738 return (c1 < c2) ? -1 : 1;
3739
Marc-André Lemburge5034372000-08-08 08:04:29 +00003740 len1--; len2--;
3741 }
3742
3743 return (len1 < len2) ? -1 : (len1 != len2);
3744}
3745
3746#endif
3747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748int PyUnicode_Compare(PyObject *left,
3749 PyObject *right)
3750{
3751 PyUnicodeObject *u = NULL, *v = NULL;
3752 int result;
3753
3754 /* Coerce the two arguments */
3755 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3756 if (u == NULL)
3757 goto onError;
3758 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3759 if (v == NULL)
3760 goto onError;
3761
Thomas Wouters7e474022000-07-16 12:04:32 +00003762 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 if (v == u) {
3764 Py_DECREF(u);
3765 Py_DECREF(v);
3766 return 0;
3767 }
3768
3769 result = unicode_compare(u, v);
3770
3771 Py_DECREF(u);
3772 Py_DECREF(v);
3773 return result;
3774
3775onError:
3776 Py_XDECREF(u);
3777 Py_XDECREF(v);
3778 return -1;
3779}
3780
Guido van Rossum403d68b2000-03-13 15:55:09 +00003781int PyUnicode_Contains(PyObject *container,
3782 PyObject *element)
3783{
3784 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00003785 int result, size;
3786 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00003787
3788 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003789 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003790 if (v == NULL) {
3791 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00003792 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003793 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003794 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003795 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3796 if (u == NULL) {
3797 Py_DECREF(v);
3798 goto onError;
3799 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003800
Barry Warsaw817918c2002-08-06 16:58:21 +00003801 size = PyUnicode_GET_SIZE(v);
3802 rhs = PyUnicode_AS_UNICODE(v);
3803 lhs = PyUnicode_AS_UNICODE(u);
3804
Guido van Rossum403d68b2000-03-13 15:55:09 +00003805 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00003806 if (size == 1) {
3807 end = lhs + PyUnicode_GET_SIZE(u);
3808 while (lhs < end) {
3809 if (*lhs++ == *rhs) {
3810 result = 1;
3811 break;
3812 }
3813 }
3814 }
3815 else {
3816 end = lhs + (PyUnicode_GET_SIZE(u) - size);
3817 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00003818 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00003819 result = 1;
3820 break;
3821 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003822 }
3823 }
3824
3825 Py_DECREF(u);
3826 Py_DECREF(v);
3827 return result;
3828
3829onError:
3830 Py_XDECREF(u);
3831 Py_XDECREF(v);
3832 return -1;
3833}
3834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835/* Concat to string or Unicode object giving a new Unicode object. */
3836
3837PyObject *PyUnicode_Concat(PyObject *left,
3838 PyObject *right)
3839{
3840 PyUnicodeObject *u = NULL, *v = NULL, *w;
3841
3842 /* Coerce the two arguments */
3843 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3844 if (u == NULL)
3845 goto onError;
3846 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3847 if (v == NULL)
3848 goto onError;
3849
3850 /* Shortcuts */
3851 if (v == unicode_empty) {
3852 Py_DECREF(v);
3853 return (PyObject *)u;
3854 }
3855 if (u == unicode_empty) {
3856 Py_DECREF(u);
3857 return (PyObject *)v;
3858 }
3859
3860 /* Concat the two Unicode strings */
3861 w = _PyUnicode_New(u->length + v->length);
3862 if (w == NULL)
3863 goto onError;
3864 Py_UNICODE_COPY(w->str, u->str, u->length);
3865 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3866
3867 Py_DECREF(u);
3868 Py_DECREF(v);
3869 return (PyObject *)w;
3870
3871onError:
3872 Py_XDECREF(u);
3873 Py_XDECREF(v);
3874 return NULL;
3875}
3876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003877PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878"S.count(sub[, start[, end]]) -> int\n\
3879\n\
3880Return the number of occurrences of substring sub in Unicode string\n\
3881S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003882interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883
3884static PyObject *
3885unicode_count(PyUnicodeObject *self, PyObject *args)
3886{
3887 PyUnicodeObject *substring;
3888 int start = 0;
3889 int end = INT_MAX;
3890 PyObject *result;
3891
Guido van Rossumb8872e62000-05-09 14:14:27 +00003892 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3893 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 return NULL;
3895
3896 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3897 (PyObject *)substring);
3898 if (substring == NULL)
3899 return NULL;
3900
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 if (start < 0)
3902 start += self->length;
3903 if (start < 0)
3904 start = 0;
3905 if (end > self->length)
3906 end = self->length;
3907 if (end < 0)
3908 end += self->length;
3909 if (end < 0)
3910 end = 0;
3911
3912 result = PyInt_FromLong((long) count(self, start, end, substring));
3913
3914 Py_DECREF(substring);
3915 return result;
3916}
3917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003918PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919"S.encode([encoding[,errors]]) -> string\n\
3920\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003921Return an encoded string version of S. Default encoding is the current\n\
3922default string encoding. errors may be given to set a different error\n\
3923handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003924a ValueError. Other possible values are 'ignore' and 'replace'.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925
3926static PyObject *
3927unicode_encode(PyUnicodeObject *self, PyObject *args)
3928{
3929 char *encoding = NULL;
3930 char *errors = NULL;
3931 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3932 return NULL;
3933 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3934}
3935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003936PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937"S.expandtabs([tabsize]) -> unicode\n\
3938\n\
3939Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003940If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941
3942static PyObject*
3943unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3944{
3945 Py_UNICODE *e;
3946 Py_UNICODE *p;
3947 Py_UNICODE *q;
3948 int i, j;
3949 PyUnicodeObject *u;
3950 int tabsize = 8;
3951
3952 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3953 return NULL;
3954
Thomas Wouters7e474022000-07-16 12:04:32 +00003955 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 i = j = 0;
3957 e = self->str + self->length;
3958 for (p = self->str; p < e; p++)
3959 if (*p == '\t') {
3960 if (tabsize > 0)
3961 j += tabsize - (j % tabsize);
3962 }
3963 else {
3964 j++;
3965 if (*p == '\n' || *p == '\r') {
3966 i += j;
3967 j = 0;
3968 }
3969 }
3970
3971 /* Second pass: create output string and fill it */
3972 u = _PyUnicode_New(i + j);
3973 if (!u)
3974 return NULL;
3975
3976 j = 0;
3977 q = u->str;
3978
3979 for (p = self->str; p < e; p++)
3980 if (*p == '\t') {
3981 if (tabsize > 0) {
3982 i = tabsize - (j % tabsize);
3983 j += i;
3984 while (i--)
3985 *q++ = ' ';
3986 }
3987 }
3988 else {
3989 j++;
3990 *q++ = *p;
3991 if (*p == '\n' || *p == '\r')
3992 j = 0;
3993 }
3994
3995 return (PyObject*) u;
3996}
3997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003998PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999"S.find(sub [,start [,end]]) -> int\n\
4000\n\
4001Return the lowest index in S where substring sub is found,\n\
4002such that sub is contained within s[start,end]. Optional\n\
4003arguments start and end are interpreted as in slice notation.\n\
4004\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004005Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006
4007static PyObject *
4008unicode_find(PyUnicodeObject *self, PyObject *args)
4009{
4010 PyUnicodeObject *substring;
4011 int start = 0;
4012 int end = INT_MAX;
4013 PyObject *result;
4014
Guido van Rossumb8872e62000-05-09 14:14:27 +00004015 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4016 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 return NULL;
4018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4019 (PyObject *)substring);
4020 if (substring == NULL)
4021 return NULL;
4022
4023 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4024
4025 Py_DECREF(substring);
4026 return result;
4027}
4028
4029static PyObject *
4030unicode_getitem(PyUnicodeObject *self, int index)
4031{
4032 if (index < 0 || index >= self->length) {
4033 PyErr_SetString(PyExc_IndexError, "string index out of range");
4034 return NULL;
4035 }
4036
4037 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4038}
4039
4040static long
4041unicode_hash(PyUnicodeObject *self)
4042{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004043 /* Since Unicode objects compare equal to their ASCII string
4044 counterparts, they should use the individual character values
4045 as basis for their hash value. This is needed to assure that
4046 strings and Unicode objects behave in the same way as
4047 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048
Fredrik Lundhdde61642000-07-10 18:27:47 +00004049 register int len;
4050 register Py_UNICODE *p;
4051 register long x;
4052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (self->hash != -1)
4054 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004055 len = PyUnicode_GET_SIZE(self);
4056 p = PyUnicode_AS_UNICODE(self);
4057 x = *p << 7;
4058 while (--len >= 0)
4059 x = (1000003*x) ^ *p++;
4060 x ^= PyUnicode_GET_SIZE(self);
4061 if (x == -1)
4062 x = -2;
4063 self->hash = x;
4064 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065}
4066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004067PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068"S.index(sub [,start [,end]]) -> int\n\
4069\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004070Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071
4072static PyObject *
4073unicode_index(PyUnicodeObject *self, PyObject *args)
4074{
4075 int result;
4076 PyUnicodeObject *substring;
4077 int start = 0;
4078 int end = INT_MAX;
4079
Guido van Rossumb8872e62000-05-09 14:14:27 +00004080 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4081 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 return NULL;
4083
4084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4085 (PyObject *)substring);
4086 if (substring == NULL)
4087 return NULL;
4088
4089 result = findstring(self, substring, start, end, 1);
4090
4091 Py_DECREF(substring);
4092 if (result < 0) {
4093 PyErr_SetString(PyExc_ValueError, "substring not found");
4094 return NULL;
4095 }
4096 return PyInt_FromLong(result);
4097}
4098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004099PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004100"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004102Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004103at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104
4105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004106unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107{
4108 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4109 register const Py_UNICODE *e;
4110 int cased;
4111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 /* Shortcut for single character strings */
4113 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004114 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004116 /* Special case for empty strings */
4117 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004118 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004119
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 e = p + PyUnicode_GET_SIZE(self);
4121 cased = 0;
4122 for (; p < e; p++) {
4123 register const Py_UNICODE ch = *p;
4124
4125 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004126 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 else if (!cased && Py_UNICODE_ISLOWER(ch))
4128 cased = 1;
4129 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004130 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131}
4132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004133PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004134"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004136Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004137at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138
4139static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004140unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141{
4142 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4143 register const Py_UNICODE *e;
4144 int cased;
4145
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 /* Shortcut for single character strings */
4147 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004148 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004150 /* Special case for empty strings */
4151 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004152 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004153
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 e = p + PyUnicode_GET_SIZE(self);
4155 cased = 0;
4156 for (; p < e; p++) {
4157 register const Py_UNICODE ch = *p;
4158
4159 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004160 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 else if (!cased && Py_UNICODE_ISUPPER(ch))
4162 cased = 1;
4163 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004164 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165}
4166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004167PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004168"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004170Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4171characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004172only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173
4174static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004175unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176{
4177 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4178 register const Py_UNICODE *e;
4179 int cased, previous_is_cased;
4180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 /* Shortcut for single character strings */
4182 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004183 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4184 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004186 /* Special case for empty strings */
4187 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004189
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 e = p + PyUnicode_GET_SIZE(self);
4191 cased = 0;
4192 previous_is_cased = 0;
4193 for (; p < e; p++) {
4194 register const Py_UNICODE ch = *p;
4195
4196 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4197 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004198 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 previous_is_cased = 1;
4200 cased = 1;
4201 }
4202 else if (Py_UNICODE_ISLOWER(ch)) {
4203 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 previous_is_cased = 1;
4206 cased = 1;
4207 }
4208 else
4209 previous_is_cased = 0;
4210 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004211 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212}
4213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004214PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004215"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004218False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219
4220static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004221unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222{
4223 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4224 register const Py_UNICODE *e;
4225
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 /* Shortcut for single character strings */
4227 if (PyUnicode_GET_SIZE(self) == 1 &&
4228 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004229 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004231 /* Special case for empty strings */
4232 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004233 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004234
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 e = p + PyUnicode_GET_SIZE(self);
4236 for (; p < e; p++) {
4237 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004238 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004240 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241}
4242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004243PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004244"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004245\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004247and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004248
4249static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004250unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004251{
4252 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4253 register const Py_UNICODE *e;
4254
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004255 /* Shortcut for single character strings */
4256 if (PyUnicode_GET_SIZE(self) == 1 &&
4257 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004258 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004259
4260 /* Special case for empty strings */
4261 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004262 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004263
4264 e = p + PyUnicode_GET_SIZE(self);
4265 for (; p < e; p++) {
4266 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004267 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004269 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004270}
4271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004272PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004273"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004276and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004277
4278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004279unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004280{
4281 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4282 register const Py_UNICODE *e;
4283
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004284 /* Shortcut for single character strings */
4285 if (PyUnicode_GET_SIZE(self) == 1 &&
4286 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004287 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004288
4289 /* Special case for empty strings */
4290 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004291 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004292
4293 e = p + PyUnicode_GET_SIZE(self);
4294 for (; p < e; p++) {
4295 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004296 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004297 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004298 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004299}
4300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004301PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004302"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004305False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306
4307static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004308unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
4310 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4311 register const Py_UNICODE *e;
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 /* Shortcut for single character strings */
4314 if (PyUnicode_GET_SIZE(self) == 1 &&
4315 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004316 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004318 /* Special case for empty strings */
4319 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004320 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004321
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 e = p + PyUnicode_GET_SIZE(self);
4323 for (; p < e; p++) {
4324 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004325 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004327 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328}
4329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004330PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004331"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004333Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004334False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335
4336static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004337unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338{
4339 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4340 register const Py_UNICODE *e;
4341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 /* Shortcut for single character strings */
4343 if (PyUnicode_GET_SIZE(self) == 1 &&
4344 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004345 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004347 /* Special case for empty strings */
4348 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004349 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 e = p + PyUnicode_GET_SIZE(self);
4352 for (; p < e; p++) {
4353 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004354 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004356 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357}
4358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004359PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004360"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004362Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004363False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364
4365static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004366unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367{
4368 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4369 register const Py_UNICODE *e;
4370
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 /* Shortcut for single character strings */
4372 if (PyUnicode_GET_SIZE(self) == 1 &&
4373 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004374 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004376 /* Special case for empty strings */
4377 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004378 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004379
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 e = p + PyUnicode_GET_SIZE(self);
4381 for (; p < e; p++) {
4382 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004383 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004385 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386}
4387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004388PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389"S.join(sequence) -> unicode\n\
4390\n\
4391Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004392sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393
4394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004395unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004397 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398}
4399
4400static int
4401unicode_length(PyUnicodeObject *self)
4402{
4403 return self->length;
4404}
4405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004406PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407"S.ljust(width) -> unicode\n\
4408\n\
4409Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004410done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411
4412static PyObject *
4413unicode_ljust(PyUnicodeObject *self, PyObject *args)
4414{
4415 int width;
4416 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4417 return NULL;
4418
Tim Peters7a29bd52001-09-12 03:03:31 +00004419 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 Py_INCREF(self);
4421 return (PyObject*) self;
4422 }
4423
4424 return (PyObject*) pad(self, 0, width - self->length, ' ');
4425}
4426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004427PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428"S.lower() -> unicode\n\
4429\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004430Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431
4432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004433unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 return fixup(self, fixlower);
4436}
4437
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string without its leading
   "|O:" (three characters). */
#define STRIPNAME(i) (stripformat[(i)] + 3)
4446
4447static const Py_UNICODE *
4448unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4449{
Tim Peters030a5ce2002-04-22 19:00:10 +00004450 size_t i;
4451 for (i = 0; i < n; ++i)
4452 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004453 return s+i;
4454 return NULL;
4455}
4456
4457/* externally visible for str.strip(unicode) */
4458PyObject *
4459_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4460{
4461 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4462 int len = PyUnicode_GET_SIZE(self);
4463 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4464 int seplen = PyUnicode_GET_SIZE(sepobj);
4465 int i, j;
4466
4467 i = 0;
4468 if (striptype != RIGHTSTRIP) {
4469 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4470 i++;
4471 }
4472 }
4473
4474 j = len;
4475 if (striptype != LEFTSTRIP) {
4476 do {
4477 j--;
4478 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4479 j++;
4480 }
4481
4482 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4483 Py_INCREF(self);
4484 return (PyObject*)self;
4485 }
4486 else
4487 return PyUnicode_FromUnicode(s+i, j-i);
4488}
4489
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490
4491static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004492do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004494 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4495 int len = PyUnicode_GET_SIZE(self), i, j;
4496
4497 i = 0;
4498 if (striptype != RIGHTSTRIP) {
4499 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4500 i++;
4501 }
4502 }
4503
4504 j = len;
4505 if (striptype != LEFTSTRIP) {
4506 do {
4507 j--;
4508 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4509 j++;
4510 }
4511
4512 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4513 Py_INCREF(self);
4514 return (PyObject*)self;
4515 }
4516 else
4517 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518}
4519
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004520
4521static PyObject *
4522do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4523{
4524 PyObject *sep = NULL;
4525
4526 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4527 return NULL;
4528
4529 if (sep != NULL && sep != Py_None) {
4530 if (PyUnicode_Check(sep))
4531 return _PyUnicode_XStrip(self, striptype, sep);
4532 else if (PyString_Check(sep)) {
4533 PyObject *res;
4534 sep = PyUnicode_FromObject(sep);
4535 if (sep==NULL)
4536 return NULL;
4537 res = _PyUnicode_XStrip(self, striptype, sep);
4538 Py_DECREF(sep);
4539 return res;
4540 }
4541 else {
4542 PyErr_Format(PyExc_TypeError,
4543 "%s arg must be None, unicode or str",
4544 STRIPNAME(striptype));
4545 return NULL;
4546 }
4547 }
4548
4549 return do_strip(self, striptype);
4550}
4551
4552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004553PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004554"S.strip([sep]) -> unicode\n\
4555\n\
4556Return a copy of the string S with leading and trailing\n\
4557whitespace removed.\n\
4558If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004559If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004560
4561static PyObject *
4562unicode_strip(PyUnicodeObject *self, PyObject *args)
4563{
4564 if (PyTuple_GET_SIZE(args) == 0)
4565 return do_strip(self, BOTHSTRIP); /* Common case */
4566 else
4567 return do_argstrip(self, BOTHSTRIP, args);
4568}
4569
4570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004571PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004572"S.lstrip([sep]) -> unicode\n\
4573\n\
4574Return a copy of the string S with leading whitespace removed.\n\
4575If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004576If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004577
4578static PyObject *
4579unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4580{
4581 if (PyTuple_GET_SIZE(args) == 0)
4582 return do_strip(self, LEFTSTRIP); /* Common case */
4583 else
4584 return do_argstrip(self, LEFTSTRIP, args);
4585}
4586
4587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004588PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004589"S.rstrip([sep]) -> unicode\n\
4590\n\
4591Return a copy of the string S with trailing whitespace removed.\n\
4592If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004593If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004594
4595static PyObject *
4596unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4597{
4598 if (PyTuple_GET_SIZE(args) == 0)
4599 return do_strip(self, RIGHTSTRIP); /* Common case */
4600 else
4601 return do_argstrip(self, RIGHTSTRIP, args);
4602}
4603
4604
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605static PyObject*
4606unicode_repeat(PyUnicodeObject *str, int len)
4607{
4608 PyUnicodeObject *u;
4609 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004610 int nchars;
4611 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
4613 if (len < 0)
4614 len = 0;
4615
Tim Peters7a29bd52001-09-12 03:03:31 +00004616 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617 /* no repeat, return original string */
4618 Py_INCREF(str);
4619 return (PyObject*) str;
4620 }
Tim Peters8f422462000-09-09 06:13:41 +00004621
4622 /* ensure # of chars needed doesn't overflow int and # of bytes
4623 * needed doesn't overflow size_t
4624 */
4625 nchars = len * str->length;
4626 if (len && nchars / len != str->length) {
4627 PyErr_SetString(PyExc_OverflowError,
4628 "repeated string is too long");
4629 return NULL;
4630 }
4631 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4632 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4633 PyErr_SetString(PyExc_OverflowError,
4634 "repeated string is too long");
4635 return NULL;
4636 }
4637 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 if (!u)
4639 return NULL;
4640
4641 p = u->str;
4642
4643 while (len-- > 0) {
4644 Py_UNICODE_COPY(p, str->str, str->length);
4645 p += str->length;
4646 }
4647
4648 return (PyObject*) u;
4649}
4650
4651PyObject *PyUnicode_Replace(PyObject *obj,
4652 PyObject *subobj,
4653 PyObject *replobj,
4654 int maxcount)
4655{
4656 PyObject *self;
4657 PyObject *str1;
4658 PyObject *str2;
4659 PyObject *result;
4660
4661 self = PyUnicode_FromObject(obj);
4662 if (self == NULL)
4663 return NULL;
4664 str1 = PyUnicode_FromObject(subobj);
4665 if (str1 == NULL) {
4666 Py_DECREF(self);
4667 return NULL;
4668 }
4669 str2 = PyUnicode_FromObject(replobj);
4670 if (str2 == NULL) {
4671 Py_DECREF(self);
4672 Py_DECREF(str1);
4673 return NULL;
4674 }
4675 result = replace((PyUnicodeObject *)self,
4676 (PyUnicodeObject *)str1,
4677 (PyUnicodeObject *)str2,
4678 maxcount);
4679 Py_DECREF(self);
4680 Py_DECREF(str1);
4681 Py_DECREF(str2);
4682 return result;
4683}
4684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004685PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686"S.replace (old, new[, maxsplit]) -> unicode\n\
4687\n\
4688Return a copy of S with all occurrences of substring\n\
4689old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004690given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691
4692static PyObject*
4693unicode_replace(PyUnicodeObject *self, PyObject *args)
4694{
4695 PyUnicodeObject *str1;
4696 PyUnicodeObject *str2;
4697 int maxcount = -1;
4698 PyObject *result;
4699
4700 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4701 return NULL;
4702 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4703 if (str1 == NULL)
4704 return NULL;
4705 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4706 if (str2 == NULL)
4707 return NULL;
4708
4709 result = replace(self, str1, str2, maxcount);
4710
4711 Py_DECREF(str1);
4712 Py_DECREF(str2);
4713 return result;
4714}
4715
4716static
4717PyObject *unicode_repr(PyObject *unicode)
4718{
4719 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4720 PyUnicode_GET_SIZE(unicode),
4721 1);
4722}
4723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004724PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725"S.rfind(sub [,start [,end]]) -> int\n\
4726\n\
4727Return the highest index in S where substring sub is found,\n\
4728such that sub is contained within s[start,end]. Optional\n\
4729arguments start and end are interpreted as in slice notation.\n\
4730\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004731Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732
4733static PyObject *
4734unicode_rfind(PyUnicodeObject *self, PyObject *args)
4735{
4736 PyUnicodeObject *substring;
4737 int start = 0;
4738 int end = INT_MAX;
4739 PyObject *result;
4740
Guido van Rossumb8872e62000-05-09 14:14:27 +00004741 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4742 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 return NULL;
4744 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4745 (PyObject *)substring);
4746 if (substring == NULL)
4747 return NULL;
4748
4749 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4750
4751 Py_DECREF(substring);
4752 return result;
4753}
4754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004755PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756"S.rindex(sub [,start [,end]]) -> int\n\
4757\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004758Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759
4760static PyObject *
4761unicode_rindex(PyUnicodeObject *self, PyObject *args)
4762{
4763 int result;
4764 PyUnicodeObject *substring;
4765 int start = 0;
4766 int end = INT_MAX;
4767
Guido van Rossumb8872e62000-05-09 14:14:27 +00004768 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4769 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 return NULL;
4771 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4772 (PyObject *)substring);
4773 if (substring == NULL)
4774 return NULL;
4775
4776 result = findstring(self, substring, start, end, -1);
4777
4778 Py_DECREF(substring);
4779 if (result < 0) {
4780 PyErr_SetString(PyExc_ValueError, "substring not found");
4781 return NULL;
4782 }
4783 return PyInt_FromLong(result);
4784}
4785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004786PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787"S.rjust(width) -> unicode\n\
4788\n\
4789Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004790done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791
4792static PyObject *
4793unicode_rjust(PyUnicodeObject *self, PyObject *args)
4794{
4795 int width;
4796 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4797 return NULL;
4798
Tim Peters7a29bd52001-09-12 03:03:31 +00004799 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 Py_INCREF(self);
4801 return (PyObject*) self;
4802 }
4803
4804 return (PyObject*) pad(self, width - self->length, 0, ' ');
4805}
4806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807static PyObject*
4808unicode_slice(PyUnicodeObject *self, int start, int end)
4809{
4810 /* standard clamping */
4811 if (start < 0)
4812 start = 0;
4813 if (end < 0)
4814 end = 0;
4815 if (end > self->length)
4816 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004817 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 /* full slice, return original string */
4819 Py_INCREF(self);
4820 return (PyObject*) self;
4821 }
4822 if (start > end)
4823 start = end;
4824 /* copy slice */
4825 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4826 end - start);
4827}
4828
4829PyObject *PyUnicode_Split(PyObject *s,
4830 PyObject *sep,
4831 int maxsplit)
4832{
4833 PyObject *result;
4834
4835 s = PyUnicode_FromObject(s);
4836 if (s == NULL)
4837 return NULL;
4838 if (sep != NULL) {
4839 sep = PyUnicode_FromObject(sep);
4840 if (sep == NULL) {
4841 Py_DECREF(s);
4842 return NULL;
4843 }
4844 }
4845
4846 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4847
4848 Py_DECREF(s);
4849 Py_XDECREF(sep);
4850 return result;
4851}
4852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004853PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854"S.split([sep [,maxsplit]]) -> list of strings\n\
4855\n\
4856Return a list of the words in S, using sep as the\n\
4857delimiter string. If maxsplit is given, at most maxsplit\n\
4858splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004859is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860
4861static PyObject*
4862unicode_split(PyUnicodeObject *self, PyObject *args)
4863{
4864 PyObject *substring = Py_None;
4865 int maxcount = -1;
4866
4867 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4868 return NULL;
4869
4870 if (substring == Py_None)
4871 return split(self, NULL, maxcount);
4872 else if (PyUnicode_Check(substring))
4873 return split(self, (PyUnicodeObject *)substring, maxcount);
4874 else
4875 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4876}
4877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004878PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00004879"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880\n\
4881Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004882Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004883is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884
4885static PyObject*
4886unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4887{
Guido van Rossum86662912000-04-11 15:38:46 +00004888 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889
Guido van Rossum86662912000-04-11 15:38:46 +00004890 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 return NULL;
4892
Guido van Rossum86662912000-04-11 15:38:46 +00004893 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894}
4895
4896static
4897PyObject *unicode_str(PyUnicodeObject *self)
4898{
Fred Drakee4315f52000-05-09 19:53:39 +00004899 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900}
4901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004902PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903"S.swapcase() -> unicode\n\
4904\n\
4905Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004906and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907
4908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004909unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 return fixup(self, fixswapcase);
4912}
4913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915"S.translate(table) -> unicode\n\
4916\n\
4917Return a copy of the string S, where all characters have been mapped\n\
4918through the given translation table, which must be a mapping of\n\
4919Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004920are left untouched. Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921
4922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004923unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 return PyUnicode_TranslateCharmap(self->str,
4926 self->length,
4927 table,
4928 "ignore");
4929}
4930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004931PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932"S.upper() -> unicode\n\
4933\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004934Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935
4936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004937unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 return fixup(self, fixupper);
4940}
4941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004942PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943"S.zfill(width) -> unicode\n\
4944\n\
4945Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004946of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947
4948static PyObject *
4949unicode_zfill(PyUnicodeObject *self, PyObject *args)
4950{
4951 int fill;
4952 PyUnicodeObject *u;
4953
4954 int width;
4955 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4956 return NULL;
4957
4958 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004959 if (PyUnicode_CheckExact(self)) {
4960 Py_INCREF(self);
4961 return (PyObject*) self;
4962 }
4963 else
4964 return PyUnicode_FromUnicode(
4965 PyUnicode_AS_UNICODE(self),
4966 PyUnicode_GET_SIZE(self)
4967 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 }
4969
4970 fill = width - self->length;
4971
4972 u = pad(self, fill, 0, '0');
4973
Walter Dörwald068325e2002-04-15 13:36:47 +00004974 if (u == NULL)
4975 return NULL;
4976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 if (u->str[fill] == '+' || u->str[fill] == '-') {
4978 /* move sign to beginning of string */
4979 u->str[0] = u->str[fill];
4980 u->str[fill] = '0';
4981 }
4982
4983 return (PyObject*) u;
4984}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985
#if 0
/* Disabled debugging helper: returns unicode_freelist_size as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004994PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004995"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004997Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004999comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000
5001static PyObject *
5002unicode_startswith(PyUnicodeObject *self,
5003 PyObject *args)
5004{
5005 PyUnicodeObject *substring;
5006 int start = 0;
5007 int end = INT_MAX;
5008 PyObject *result;
5009
Guido van Rossumb8872e62000-05-09 14:14:27 +00005010 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5011 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 return NULL;
5013 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5014 (PyObject *)substring);
5015 if (substring == NULL)
5016 return NULL;
5017
Guido van Rossum77f6a652002-04-03 22:41:51 +00005018 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019
5020 Py_DECREF(substring);
5021 return result;
5022}
5023
5024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005025PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005026"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005028Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005030comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031
5032static PyObject *
5033unicode_endswith(PyUnicodeObject *self,
5034 PyObject *args)
5035{
5036 PyUnicodeObject *substring;
5037 int start = 0;
5038 int end = INT_MAX;
5039 PyObject *result;
5040
Guido van Rossumb8872e62000-05-09 14:14:27 +00005041 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5042 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 return NULL;
5044 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5045 (PyObject *)substring);
5046 if (substring == NULL)
5047 return NULL;
5048
Guido van Rossum77f6a652002-04-03 22:41:51 +00005049 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050
5051 Py_DECREF(substring);
5052 return result;
5053}
5054
5055
5056static PyMethodDef unicode_methods[] = {
5057
5058 /* Order is according to common usage: often used methods should
5059 appear first, since lookup is done sequentially. */
5060
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005061 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5062 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5063 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5064 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5065 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5066 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5067 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5068 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5069 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5070 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5071 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5072 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5073 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005074 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005075/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5076 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5077 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5078 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005079 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005080 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005081 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005082 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5083 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5084 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5085 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5086 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5087 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5088 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5089 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5090 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5091 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5092 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5093 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5094 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5095 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005096 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005097#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005098 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099#endif
5100
5101#if 0
5102 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005103 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104#endif
5105
5106 {NULL, NULL}
5107};
5108
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109static PySequenceMethods unicode_as_sequence = {
5110 (inquiry) unicode_length, /* sq_length */
5111 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5112 (intargfunc) unicode_repeat, /* sq_repeat */
5113 (intargfunc) unicode_getitem, /* sq_item */
5114 (intintargfunc) unicode_slice, /* sq_slice */
5115 0, /* sq_ass_item */
5116 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005117 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118};
5119
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005120static PyObject*
5121unicode_subscript(PyUnicodeObject* self, PyObject* item)
5122{
5123 if (PyInt_Check(item)) {
5124 long i = PyInt_AS_LONG(item);
5125 if (i < 0)
5126 i += PyString_GET_SIZE(self);
5127 return unicode_getitem(self, i);
5128 } else if (PyLong_Check(item)) {
5129 long i = PyLong_AsLong(item);
5130 if (i == -1 && PyErr_Occurred())
5131 return NULL;
5132 if (i < 0)
5133 i += PyString_GET_SIZE(self);
5134 return unicode_getitem(self, i);
5135 } else if (PySlice_Check(item)) {
5136 int start, stop, step, slicelength, cur, i;
5137 Py_UNICODE* source_buf;
5138 Py_UNICODE* result_buf;
5139 PyObject* result;
5140
5141 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5142 &start, &stop, &step, &slicelength) < 0) {
5143 return NULL;
5144 }
5145
5146 if (slicelength <= 0) {
5147 return PyUnicode_FromUnicode(NULL, 0);
5148 } else {
5149 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5150 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5151
5152 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5153 result_buf[i] = source_buf[cur];
5154 }
5155
5156 result = PyUnicode_FromUnicode(result_buf, slicelength);
5157 PyMem_FREE(result_buf);
5158 return result;
5159 }
5160 } else {
5161 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5162 return NULL;
5163 }
5164}
5165
5166static PyMappingMethods unicode_as_mapping = {
5167 (inquiry)unicode_length, /* mp_length */
5168 (binaryfunc)unicode_subscript, /* mp_subscript */
5169 (objobjargproc)0, /* mp_ass_subscript */
5170};
5171
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172static int
5173unicode_buffer_getreadbuf(PyUnicodeObject *self,
5174 int index,
5175 const void **ptr)
5176{
5177 if (index != 0) {
5178 PyErr_SetString(PyExc_SystemError,
5179 "accessing non-existent unicode segment");
5180 return -1;
5181 }
5182 *ptr = (void *) self->str;
5183 return PyUnicode_GET_DATA_SIZE(self);
5184}
5185
5186static int
5187unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5188 const void **ptr)
5189{
5190 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005191 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 return -1;
5193}
5194
5195static int
5196unicode_buffer_getsegcount(PyUnicodeObject *self,
5197 int *lenp)
5198{
5199 if (lenp)
5200 *lenp = PyUnicode_GET_DATA_SIZE(self);
5201 return 1;
5202}
5203
5204static int
5205unicode_buffer_getcharbuf(PyUnicodeObject *self,
5206 int index,
5207 const void **ptr)
5208{
5209 PyObject *str;
5210
5211 if (index != 0) {
5212 PyErr_SetString(PyExc_SystemError,
5213 "accessing non-existent unicode segment");
5214 return -1;
5215 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005216 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 if (str == NULL)
5218 return -1;
5219 *ptr = (void *) PyString_AS_STRING(str);
5220 return PyString_GET_SIZE(str);
5221}
5222
5223/* Helpers for PyUnicode_Format() */
5224
5225static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005226getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
5228 int argidx = *p_argidx;
5229 if (argidx < arglen) {
5230 (*p_argidx)++;
5231 if (arglen < 0)
5232 return args;
5233 else
5234 return PyTuple_GetItem(args, argidx);
5235 }
5236 PyErr_SetString(PyExc_TypeError,
5237 "not enough arguments for format string");
5238 return NULL;
5239}
5240
/* Bit flags for the `flags` argument of the formatting helpers; each one
   occupies its own bit so they can be combined with |.  F_ALT is the one
   tested by formatfloat()/formatint() to request the '#' form. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5246
5247static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249{
5250 register int i;
5251 int len;
5252 va_list va;
5253 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
5256 /* First, format the string as char array, then expand to Py_UNICODE
5257 array. */
5258 charbuffer = (char *)buffer;
5259 len = vsprintf(charbuffer, format, va);
5260 for (i = len - 1; i >= 0; i--)
5261 buffer[i] = (Py_UNICODE) charbuffer[i];
5262
5263 va_end(va);
5264 return len;
5265}
5266
Guido van Rossum078151d2002-08-11 04:24:12 +00005267/* XXX To save some code duplication, formatfloat/long/int could have been
5268 shared with stringobject.c, converting from 8-bit to Unicode after the
5269 formatting is done. */
5270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271static int
5272formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005273 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 int flags,
5275 int prec,
5276 int type,
5277 PyObject *v)
5278{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005279 /* fmt = '%#.' + `prec` + `type`
5280 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 char fmt[20];
5282 double x;
5283
5284 x = PyFloat_AsDouble(v);
5285 if (x == -1.0 && PyErr_Occurred())
5286 return -1;
5287 if (prec < 0)
5288 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5290 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005291 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5292 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005293 /* worst case length calc to ensure no buffer overrun:
5294 fmt = %#.<prec>g
5295 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5296 for any double rep.)
5297 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5298 If prec=0 the effective precision is 1 (the leading digit is
5299 always given), therefore increase by one to 10+prec. */
5300 if (buflen <= (size_t)10 + (size_t)prec) {
5301 PyErr_SetString(PyExc_OverflowError,
5302 "formatted float is too long (precision too long?)");
5303 return -1;
5304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 return usprintf(buf, fmt, x);
5306}
5307
Tim Peters38fd5b62000-09-21 05:43:11 +00005308static PyObject*
5309formatlong(PyObject *val, int flags, int prec, int type)
5310{
5311 char *buf;
5312 int i, len;
5313 PyObject *str; /* temporary string object. */
5314 PyUnicodeObject *result;
5315
5316 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5317 if (!str)
5318 return NULL;
5319 result = _PyUnicode_New(len);
5320 for (i = 0; i < len; i++)
5321 result->str[i] = buf[i];
5322 result->str[len] = 0;
5323 Py_DECREF(str);
5324 return (PyObject*)result;
5325}
5326
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327static int
5328formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005329 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 int flags,
5331 int prec,
5332 int type,
5333 PyObject *v)
5334{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005335 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005336 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5337 * + 1 + 1
5338 * = 24
5339 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005340 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 long x;
5342
5343 x = PyInt_AsLong(v);
5344 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005345 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00005346 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00005347 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00005348 "%u/%o/%x/%X of negative int will return "
5349 "a signed string in Python 2.4 and up") < 0)
5350 return -1;
5351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005353 prec = 1;
5354
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005355 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005356 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5357 */
5358 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005359 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005360 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005361 return -1;
5362 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005363
5364 if ((flags & F_ALT) &&
5365 (type == 'x' || type == 'X')) {
5366 /* When converting under %#x or %#X, there are a number
5367 * of issues that cause pain:
5368 * - when 0 is being converted, the C standard leaves off
5369 * the '0x' or '0X', which is inconsistent with other
5370 * %#x/%#X conversions and inconsistent with Python's
5371 * hex() function
5372 * - there are platforms that violate the standard and
5373 * convert 0 with the '0x' or '0X'
5374 * (Metrowerks, Compaq Tru64)
5375 * - there are platforms that give '0x' when converting
5376 * under %#X, but convert 0 in accordance with the
5377 * standard (OS/2 EMX)
5378 *
5379 * We can achieve the desired consistency by inserting our
5380 * own '0x' or '0X' prefix, and substituting %x/%X in place
5381 * of %#x/%#X.
5382 *
5383 * Note that this is the same approach as used in
5384 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005385 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005386 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5387 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005388 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005389 else {
5390 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5391 (flags&F_ALT) ? "#" : "",
5392 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 return usprintf(buf, fmt, x);
5395}
5396
5397static int
5398formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005399 size_t buflen,
5400 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005402 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005403 if (PyUnicode_Check(v)) {
5404 if (PyUnicode_GET_SIZE(v) != 1)
5405 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005409 else if (PyString_Check(v)) {
5410 if (PyString_GET_SIZE(v) != 1)
5411 goto onError;
5412 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
5415 else {
5416 /* Integer input truncated to a character */
5417 long x;
5418 x = PyInt_AsLong(v);
5419 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005420 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00005421#ifdef Py_UNICODE_WIDE
5422 if (x < 0 || x > 0x10ffff) {
5423 PyErr_SetString(PyExc_ValueError,
5424 "%c arg not in range(0x110000) "
5425 "(wide Python build)");
5426 return -1;
5427 }
5428#else
5429 if (x < 0 || x > 0xffff) {
5430 PyErr_SetString(PyExc_ValueError,
5431 "%c arg not in range(0x10000) "
5432 "(narrow Python build)");
5433 return -1;
5434 }
5435#endif
5436 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 }
5438 buf[1] = '\0';
5439 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005440
5441 onError:
5442 PyErr_SetString(PyExc_TypeError,
5443 "%c requires int or char");
5444 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445}
5446
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
5456
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457PyObject *PyUnicode_Format(PyObject *format,
5458 PyObject *args)
5459{
5460 Py_UNICODE *fmt, *res;
5461 int fmtcnt, rescnt, reslen, arglen, argidx;
5462 int args_owned = 0;
5463 PyUnicodeObject *result = NULL;
5464 PyObject *dict = NULL;
5465 PyObject *uformat;
5466
5467 if (format == NULL || args == NULL) {
5468 PyErr_BadInternalCall();
5469 return NULL;
5470 }
5471 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005472 if (uformat == NULL)
5473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 fmt = PyUnicode_AS_UNICODE(uformat);
5475 fmtcnt = PyUnicode_GET_SIZE(uformat);
5476
5477 reslen = rescnt = fmtcnt + 100;
5478 result = _PyUnicode_New(reslen);
5479 if (result == NULL)
5480 goto onError;
5481 res = PyUnicode_AS_UNICODE(result);
5482
5483 if (PyTuple_Check(args)) {
5484 arglen = PyTuple_Size(args);
5485 argidx = 0;
5486 }
5487 else {
5488 arglen = -1;
5489 argidx = -2;
5490 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005491 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 dict = args;
5493
5494 while (--fmtcnt >= 0) {
5495 if (*fmt != '%') {
5496 if (--rescnt < 0) {
5497 rescnt = fmtcnt + 100;
5498 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005499 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 return NULL;
5501 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5502 --rescnt;
5503 }
5504 *res++ = *fmt++;
5505 }
5506 else {
5507 /* Got a format specifier */
5508 int flags = 0;
5509 int width = -1;
5510 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 Py_UNICODE c = '\0';
5512 Py_UNICODE fill;
5513 PyObject *v = NULL;
5514 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005515 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 Py_UNICODE sign;
5517 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005518 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519
5520 fmt++;
5521 if (*fmt == '(') {
5522 Py_UNICODE *keystart;
5523 int keylen;
5524 PyObject *key;
5525 int pcount = 1;
5526
5527 if (dict == NULL) {
5528 PyErr_SetString(PyExc_TypeError,
5529 "format requires a mapping");
5530 goto onError;
5531 }
5532 ++fmt;
5533 --fmtcnt;
5534 keystart = fmt;
5535 /* Skip over balanced parentheses */
5536 while (pcount > 0 && --fmtcnt >= 0) {
5537 if (*fmt == ')')
5538 --pcount;
5539 else if (*fmt == '(')
5540 ++pcount;
5541 fmt++;
5542 }
5543 keylen = fmt - keystart - 1;
5544 if (fmtcnt < 0 || pcount > 0) {
5545 PyErr_SetString(PyExc_ValueError,
5546 "incomplete format key");
5547 goto onError;
5548 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005549#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005550 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 then looked up since Python uses strings to hold
5552 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005553 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 key = PyUnicode_EncodeUTF8(keystart,
5555 keylen,
5556 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005557#else
5558 key = PyUnicode_FromUnicode(keystart, keylen);
5559#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 if (key == NULL)
5561 goto onError;
5562 if (args_owned) {
5563 Py_DECREF(args);
5564 args_owned = 0;
5565 }
5566 args = PyObject_GetItem(dict, key);
5567 Py_DECREF(key);
5568 if (args == NULL) {
5569 goto onError;
5570 }
5571 args_owned = 1;
5572 arglen = -1;
5573 argidx = -2;
5574 }
5575 while (--fmtcnt >= 0) {
5576 switch (c = *fmt++) {
5577 case '-': flags |= F_LJUST; continue;
5578 case '+': flags |= F_SIGN; continue;
5579 case ' ': flags |= F_BLANK; continue;
5580 case '#': flags |= F_ALT; continue;
5581 case '0': flags |= F_ZERO; continue;
5582 }
5583 break;
5584 }
5585 if (c == '*') {
5586 v = getnextarg(args, arglen, &argidx);
5587 if (v == NULL)
5588 goto onError;
5589 if (!PyInt_Check(v)) {
5590 PyErr_SetString(PyExc_TypeError,
5591 "* wants int");
5592 goto onError;
5593 }
5594 width = PyInt_AsLong(v);
5595 if (width < 0) {
5596 flags |= F_LJUST;
5597 width = -width;
5598 }
5599 if (--fmtcnt >= 0)
5600 c = *fmt++;
5601 }
5602 else if (c >= '0' && c <= '9') {
5603 width = c - '0';
5604 while (--fmtcnt >= 0) {
5605 c = *fmt++;
5606 if (c < '0' || c > '9')
5607 break;
5608 if ((width*10) / 10 != width) {
5609 PyErr_SetString(PyExc_ValueError,
5610 "width too big");
5611 goto onError;
5612 }
5613 width = width*10 + (c - '0');
5614 }
5615 }
5616 if (c == '.') {
5617 prec = 0;
5618 if (--fmtcnt >= 0)
5619 c = *fmt++;
5620 if (c == '*') {
5621 v = getnextarg(args, arglen, &argidx);
5622 if (v == NULL)
5623 goto onError;
5624 if (!PyInt_Check(v)) {
5625 PyErr_SetString(PyExc_TypeError,
5626 "* wants int");
5627 goto onError;
5628 }
5629 prec = PyInt_AsLong(v);
5630 if (prec < 0)
5631 prec = 0;
5632 if (--fmtcnt >= 0)
5633 c = *fmt++;
5634 }
5635 else if (c >= '0' && c <= '9') {
5636 prec = c - '0';
5637 while (--fmtcnt >= 0) {
5638 c = Py_CHARMASK(*fmt++);
5639 if (c < '0' || c > '9')
5640 break;
5641 if ((prec*10) / 10 != prec) {
5642 PyErr_SetString(PyExc_ValueError,
5643 "prec too big");
5644 goto onError;
5645 }
5646 prec = prec*10 + (c - '0');
5647 }
5648 }
5649 } /* prec */
5650 if (fmtcnt >= 0) {
5651 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (--fmtcnt >= 0)
5653 c = *fmt++;
5654 }
5655 }
5656 if (fmtcnt < 0) {
5657 PyErr_SetString(PyExc_ValueError,
5658 "incomplete format");
5659 goto onError;
5660 }
5661 if (c != '%') {
5662 v = getnextarg(args, arglen, &argidx);
5663 if (v == NULL)
5664 goto onError;
5665 }
5666 sign = 0;
5667 fill = ' ';
5668 switch (c) {
5669
5670 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005671 pbuf = formatbuf;
5672 /* presume that buffer length is at least 1 */
5673 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 len = 1;
5675 break;
5676
5677 case 's':
5678 case 'r':
5679 if (PyUnicode_Check(v) && c == 's') {
5680 temp = v;
5681 Py_INCREF(temp);
5682 }
5683 else {
5684 PyObject *unicode;
5685 if (c == 's')
5686 temp = PyObject_Str(v);
5687 else
5688 temp = PyObject_Repr(v);
5689 if (temp == NULL)
5690 goto onError;
5691 if (!PyString_Check(temp)) {
5692 /* XXX Note: this should never happen, since
5693 PyObject_Repr() and PyObject_Str() assure
5694 this */
5695 Py_DECREF(temp);
5696 PyErr_SetString(PyExc_TypeError,
5697 "%s argument has non-string str()");
5698 goto onError;
5699 }
Fred Drakee4315f52000-05-09 19:53:39 +00005700 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005702 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 "strict");
5704 Py_DECREF(temp);
5705 temp = unicode;
5706 if (temp == NULL)
5707 goto onError;
5708 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005709 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 len = PyUnicode_GET_SIZE(temp);
5711 if (prec >= 0 && len > prec)
5712 len = prec;
5713 break;
5714
5715 case 'i':
5716 case 'd':
5717 case 'u':
5718 case 'o':
5719 case 'x':
5720 case 'X':
5721 if (c == 'i')
5722 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005723 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005724 temp = formatlong(v, flags, prec, c);
5725 if (!temp)
5726 goto onError;
5727 pbuf = PyUnicode_AS_UNICODE(temp);
5728 len = PyUnicode_GET_SIZE(temp);
5729 /* unbounded ints can always produce
5730 a sign character! */
5731 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005733 else {
5734 pbuf = formatbuf;
5735 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5736 flags, prec, c, v);
5737 if (len < 0)
5738 goto onError;
5739 /* only d conversion is signed */
5740 sign = c == 'd';
5741 }
5742 if (flags & F_ZERO)
5743 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 break;
5745
5746 case 'e':
5747 case 'E':
5748 case 'f':
5749 case 'g':
5750 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005751 pbuf = formatbuf;
5752 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5753 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 if (len < 0)
5755 goto onError;
5756 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005757 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 fill = '0';
5759 break;
5760
5761 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005762 pbuf = formatbuf;
5763 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 if (len < 0)
5765 goto onError;
5766 break;
5767
5768 default:
5769 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005770 "unsupported format character '%c' (0x%x) "
5771 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005772 (31<=c && c<=126) ? c : '?',
5773 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 goto onError;
5775 }
5776 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005777 if (*pbuf == '-' || *pbuf == '+') {
5778 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 len--;
5780 }
5781 else if (flags & F_SIGN)
5782 sign = '+';
5783 else if (flags & F_BLANK)
5784 sign = ' ';
5785 else
5786 sign = 0;
5787 }
5788 if (width < len)
5789 width = len;
5790 if (rescnt < width + (sign != 0)) {
5791 reslen -= rescnt;
5792 rescnt = width + fmtcnt + 100;
5793 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005794 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 return NULL;
5796 res = PyUnicode_AS_UNICODE(result)
5797 + reslen - rescnt;
5798 }
5799 if (sign) {
5800 if (fill != ' ')
5801 *res++ = sign;
5802 rescnt--;
5803 if (width > len)
5804 width--;
5805 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005806 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5807 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005808 assert(pbuf[1] == c);
5809 if (fill != ' ') {
5810 *res++ = *pbuf++;
5811 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005812 }
Tim Petersfff53252001-04-12 18:38:48 +00005813 rescnt -= 2;
5814 width -= 2;
5815 if (width < 0)
5816 width = 0;
5817 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 if (width > len && !(flags & F_LJUST)) {
5820 do {
5821 --rescnt;
5822 *res++ = fill;
5823 } while (--width > len);
5824 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005825 if (fill == ' ') {
5826 if (sign)
5827 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005828 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005829 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005830 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005831 *res++ = *pbuf++;
5832 *res++ = *pbuf++;
5833 }
5834 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005835 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 res += len;
5837 rescnt -= len;
5838 while (--width >= len) {
5839 --rescnt;
5840 *res++ = ' ';
5841 }
5842 if (dict && (argidx < arglen) && c != '%') {
5843 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005844 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 goto onError;
5846 }
5847 Py_XDECREF(temp);
5848 } /* '%' */
5849 } /* until end */
5850 if (argidx < arglen && !dict) {
5851 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005852 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 goto onError;
5854 }
5855
5856 if (args_owned) {
5857 Py_DECREF(args);
5858 }
5859 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005860 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 return (PyObject *)result;
5863
5864 onError:
5865 Py_XDECREF(result);
5866 Py_DECREF(uformat);
5867 if (args_owned) {
5868 Py_DECREF(args);
5869 }
5870 return NULL;
5871}
5872
5873static PyBufferProcs unicode_as_buffer = {
5874 (getreadbufferproc) unicode_buffer_getreadbuf,
5875 (getwritebufferproc) unicode_buffer_getwritebuf,
5876 (getsegcountproc) unicode_buffer_getsegcount,
5877 (getcharbufferproc) unicode_buffer_getcharbuf,
5878};
5879
Jeremy Hylton938ace62002-07-17 16:30:39 +00005880static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00005881unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5882
Tim Peters6d6c1a32001-08-02 04:15:00 +00005883static PyObject *
5884unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5885{
5886 PyObject *x = NULL;
5887 static char *kwlist[] = {"string", "encoding", "errors", 0};
5888 char *encoding = NULL;
5889 char *errors = NULL;
5890
Guido van Rossume023fe02001-08-30 03:12:59 +00005891 if (type != &PyUnicode_Type)
5892 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005893 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5894 kwlist, &x, &encoding, &errors))
5895 return NULL;
5896 if (x == NULL)
5897 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005898 if (encoding == NULL && errors == NULL)
5899 return PyObject_Unicode(x);
5900 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005901 return PyUnicode_FromEncodedObject(x, encoding, errors);
5902}
5903
Guido van Rossume023fe02001-08-30 03:12:59 +00005904static PyObject *
5905unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5906{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005907 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005908 int n;
5909
5910 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5911 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5912 if (tmp == NULL)
5913 return NULL;
5914 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005915 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5916 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005917 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005918 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5919 if (pnew->str == NULL) {
5920 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005921 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005922 return NULL;
5923 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005924 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5925 pnew->length = n;
5926 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005927 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005928 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005929}
5930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005931PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00005932"unicode(string [, encoding[, errors]]) -> object\n\
5933\n\
5934Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00005935encoding defaults to the current default string encoding.\n\
5936errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00005937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938PyTypeObject PyUnicode_Type = {
5939 PyObject_HEAD_INIT(&PyType_Type)
5940 0, /* ob_size */
5941 "unicode", /* tp_name */
5942 sizeof(PyUnicodeObject), /* tp_size */
5943 0, /* tp_itemsize */
5944 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005945 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005947 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 0, /* tp_setattr */
5949 (cmpfunc) unicode_compare, /* tp_compare */
5950 (reprfunc) unicode_repr, /* tp_repr */
5951 0, /* tp_as_number */
5952 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005953 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 (hashfunc) unicode_hash, /* tp_hash*/
5955 0, /* tp_call*/
5956 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005957 PyObject_GenericGetAttr, /* tp_getattro */
5958 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005960 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005961 unicode_doc, /* tp_doc */
5962 0, /* tp_traverse */
5963 0, /* tp_clear */
5964 0, /* tp_richcompare */
5965 0, /* tp_weaklistoffset */
5966 0, /* tp_iter */
5967 0, /* tp_iternext */
5968 unicode_methods, /* tp_methods */
5969 0, /* tp_members */
5970 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00005971 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005972 0, /* tp_dict */
5973 0, /* tp_descr_get */
5974 0, /* tp_descr_set */
5975 0, /* tp_dictoffset */
5976 0, /* tp_init */
5977 0, /* tp_alloc */
5978 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005979 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980};
5981
5982/* Initialize the Unicode implementation */
5983
Thomas Wouters78890102000-07-22 19:25:51 +00005984void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005986 int i;
5987
Fred Drakee4315f52000-05-09 19:53:39 +00005988 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005989 unicode_freelist = NULL;
5990 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005992 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005993 for (i = 0; i < 256; i++)
5994 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00005995 if (PyType_Ready(&PyUnicode_Type) < 0)
5996 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997}
5998
5999/* Finalize the Unicode implementation */
6000
6001void
Thomas Wouters78890102000-07-22 19:25:51 +00006002_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006004 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006005 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006007 Py_XDECREF(unicode_empty);
6008 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006009
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006010 for (i = 0; i < 256; i++) {
6011 if (unicode_latin1[i]) {
6012 Py_DECREF(unicode_latin1[i]);
6013 unicode_latin1[i] = NULL;
6014 }
6015 }
6016
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006017 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 PyUnicodeObject *v = u;
6019 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006020 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006021 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006022 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006023 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006025 unicode_freelist = NULL;
6026 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027}