blob: 920f9ea2d86449b468dde94e567d59be683261dd [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their character buffer as long
   as their length is below this limit; longer buffers are released when
   the object is deallocated.  This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature !  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT            9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 int owned = 0;
456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Greg Steinaf36a3a2000-07-17 09:04:43 +0000513 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000515 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000519 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523}
524
525PyObject *PyUnicode_Decode(const char *s,
526 int size,
527 const char *encoding,
528 const char *errors)
529{
530 PyObject *buffer = NULL, *unicode;
531
Fred Drakee4315f52000-05-09 19:53:39 +0000532 if (encoding == NULL)
533 encoding = PyUnicode_GetDefaultEncoding();
534
535 /* Shortcuts for common default encodings */
536 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "latin-1") == 0)
539 return PyUnicode_DecodeLatin1(s, size, errors);
540 else if (strcmp(encoding, "ascii") == 0)
541 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000542
543 /* Decode via the codec registry */
544 buffer = PyBuffer_FromMemory((void *)s, size);
545 if (buffer == NULL)
546 goto onError;
547 unicode = PyCodec_Decode(buffer, encoding, errors);
548 if (unicode == NULL)
549 goto onError;
550 if (!PyUnicode_Check(unicode)) {
551 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000552 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 unicode->ob_type->tp_name);
554 Py_DECREF(unicode);
555 goto onError;
556 }
557 Py_DECREF(buffer);
558 return unicode;
559
560 onError:
561 Py_XDECREF(buffer);
562 return NULL;
563}
564
565PyObject *PyUnicode_Encode(const Py_UNICODE *s,
566 int size,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v, *unicode;
571
572 unicode = PyUnicode_FromUnicode(s, size);
573 if (unicode == NULL)
574 return NULL;
575 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
576 Py_DECREF(unicode);
577 return v;
578}
579
580PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
581 const char *encoding,
582 const char *errors)
583{
584 PyObject *v;
585
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
Fred Drakee4315f52000-05-09 19:53:39 +0000590
591 if (encoding == NULL)
592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (errors == NULL) {
596 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000597 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "latin-1") == 0)
599 return PyUnicode_AsLatin1String(unicode);
600 else if (strcmp(encoding, "ascii") == 0)
601 return PyUnicode_AsASCIIString(unicode);
602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603
604 /* Encode via the codec registry */
605 v = PyCodec_Encode(unicode, encoding, errors);
606 if (v == NULL)
607 goto onError;
608 /* XXX Should we really enforce this ? */
609 if (!PyString_Check(v)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 v->ob_type->tp_name);
613 Py_DECREF(v);
614 goto onError;
615 }
616 return v;
617
618 onError:
619 return NULL;
620}
621
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
623 const char *errors)
624{
625 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
626
627 if (v)
628 return v;
629 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
630 if (v && errors == NULL)
631 ((PyUnicodeObject *)unicode)->defenc = v;
632 return v;
633}
634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
636{
637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
641 return PyUnicode_AS_UNICODE(unicode);
642
643 onError:
644 return NULL;
645}
646
647int PyUnicode_GetSize(PyObject *unicode)
648{
649 if (!PyUnicode_Check(unicode)) {
650 PyErr_BadArgument();
651 goto onError;
652 }
653 return PyUnicode_GET_SIZE(unicode);
654
655 onError:
656 return -1;
657}
658
Thomas Wouters78890102000-07-22 19:25:51 +0000659const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000660{
661 return unicode_default_encoding;
662}
663
664int PyUnicode_SetDefaultEncoding(const char *encoding)
665{
666 PyObject *v;
667
668 /* Make sure the encoding is valid. As side effect, this also
669 loads the encoding into the codec registry cache. */
670 v = _PyCodec_Lookup(encoding);
671 if (v == NULL)
672 goto onError;
673 Py_DECREF(v);
674 strncpy(unicode_default_encoding,
675 encoding,
676 sizeof(unicode_default_encoding));
677 return 0;
678
679 onError:
680 return -1;
681}
682
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification of the 128 ASCII characters for UTF-7; the table is
   only ever read, hence const.  Each entry indicates whether the
   character is special, i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
*/
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
705
/* True if character c must be base64-encoded in UTF-7: anything above
   127, anything classified 1 in utf7_special, and - depending on the
   two flags - whitespace (class 2) and RFC2152 Set O (class 3). */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  The range test keeps values outside
   0..127 away from isalnum(), whose argument must be representable as
   unsigned char (or be EOF). */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value; c must satisfy B64CHAR. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least 6 of the `bits`
   pending bits in ch remain; bits is reduced accordingly. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit characters to out for as long as at least 16 of the
   `bits` pending bits in ch remain.  Expects a variable `errmsg` and a
   label `utf7Error` in the enclosing function. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high */ \
                /* surrogate so let's not bother seeing if the low */ \
                /* surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't */ \
                /* represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

740
741static
742int utf7_decoding_error(Py_UNICODE **dest,
743 const char *errors,
744 const char *details)
745{
746 if ((errors == NULL) ||
747 (strcmp(errors,"strict") == 0)) {
748 PyErr_Format(PyExc_UnicodeError,
749 "UTF-7 decoding error: %.400s",
750 details);
751 return -1;
752 }
753 else if (strcmp(errors,"ignore") == 0) {
754 return 0;
755 }
756 else if (strcmp(errors,"replace") == 0) {
757 if (dest != NULL) {
758 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
759 (*dest)++;
760 }
761 return 0;
762 }
763 else {
764 PyErr_Format(PyExc_ValueError,
765 "UTF-7 decoding error; unknown error handling code: %.400s",
766 errors);
767 return -1;
768 }
769}
770
771PyObject *PyUnicode_DecodeUTF7(const char *s,
772 int size,
773 const char *errors)
774{
775 const char *e;
776 PyUnicodeObject *unicode;
777 Py_UNICODE *p;
778 const char *errmsg = "";
779 int inShift = 0;
780 unsigned int bitsleft = 0;
781 unsigned long charsleft = 0;
782 int surrogate = 0;
783
784 unicode = _PyUnicode_New(size);
785 if (!unicode)
786 return NULL;
787 if (size == 0)
788 return (PyObject *)unicode;
789
790 p = unicode->str;
791 e = s + size;
792
793 while (s < e) {
794 Py_UNICODE ch = *s;
795
796 if (inShift) {
797 if ((ch == '-') || !B64CHAR(ch)) {
798 inShift = 0;
799 s++;
800
801 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
802 if (bitsleft >= 6) {
803 /* The shift sequence has a partial character in it. If
804 bitsleft < 6 then we could just classify it as padding
805 but that is not the case here */
806
807 errmsg = "partial character in shift sequence";
808 goto utf7Error;
809 }
810 /* According to RFC2152 the remaining bits should be zero. We
811 choose to signal an error/insert a replacement character
812 here so indicate the potential of a misencoded character. */
813
814 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
815 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
816 errmsg = "non-zero padding bits in shift sequence";
817 goto utf7Error;
818 }
819
820 if (ch == '-') {
821 if ((s < e) && (*(s) == '-')) {
822 *p++ = '-';
823 inShift = 1;
824 }
825 } else if (SPECIAL(ch,0,0)) {
826 errmsg = "unexpected special character";
827 goto utf7Error;
828 } else {
829 *p++ = ch;
830 }
831 } else {
832 charsleft = (charsleft << 6) | UB64(ch);
833 bitsleft += 6;
834 s++;
835 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
836 }
837 }
838 else if ( ch == '+' ) {
839 s++;
840 if (s < e && *s == '-') {
841 s++;
842 *p++ = '+';
843 } else
844 {
845 inShift = 1;
846 bitsleft = 0;
847 }
848 }
849 else if (SPECIAL(ch,0,0)) {
850 errmsg = "unexpected special character";
851 s++;
852 goto utf7Error;
853 }
854 else {
855 *p++ = ch;
856 s++;
857 }
858 continue;
859 utf7Error:
860 if (utf7_decoding_error(&p, errors, errmsg))
861 goto onError;
862 }
863
864 if (inShift) {
865 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
866 goto onError;
867 }
868
869 if (_PyUnicode_Resize(&unicode, p - unicode->str))
870 goto onError;
871
872 return (PyObject *)unicode;
873
874onError:
875 Py_DECREF(unicode);
876 return NULL;
877}
878
879
880PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
881 int size,
882 int encodeSetO,
883 int encodeWhiteSpace,
884 const char *errors)
885{
886 PyObject *v;
887 /* It might be possible to tighten this worst case */
888 unsigned int cbAllocated = 5 * size;
889 int inShift = 0;
890 int i = 0;
891 unsigned int bitsleft = 0;
892 unsigned long charsleft = 0;
893 char * out;
894 char * start;
895
896 if (size == 0)
897 return PyString_FromStringAndSize(NULL, 0);
898
899 v = PyString_FromStringAndSize(NULL, cbAllocated);
900 if (v == NULL)
901 return NULL;
902
903 start = out = PyString_AS_STRING(v);
904 for (;i < size; ++i) {
905 Py_UNICODE ch = s[i];
906
907 if (!inShift) {
908 if (ch == '+') {
909 *out++ = '+';
910 *out++ = '-';
911 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
912 charsleft = ch;
913 bitsleft = 16;
914 *out++ = '+';
915 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
916 inShift = bitsleft > 0;
917 } else {
918 *out++ = (char) ch;
919 }
920 } else {
921 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
922 *out++ = B64(charsleft << (6-bitsleft));
923 charsleft = 0;
924 bitsleft = 0;
925 /* Characters not in the BASE64 set implicitly unshift the sequence
926 so no '-' is required, except if the character is itself a '-' */
927 if (B64CHAR(ch) || ch == '-') {
928 *out++ = '-';
929 }
930 inShift = 0;
931 *out++ = (char) ch;
932 } else {
933 bitsleft += 16;
934 charsleft = (charsleft << 16) | ch;
935 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
936
937 /* If the next character is special then we dont' need to terminate
938 the shift sequence. If the next character is not a BASE64 character
939 or '-' then the shift sequence will be terminated implicitly and we
940 don't have to insert a '-'. */
941
942 if (bitsleft == 0) {
943 if (i + 1 < size) {
944 Py_UNICODE ch2 = s[i+1];
945
946 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
947
948 } else if (B64CHAR(ch2) || ch2 == '-') {
949 *out++ = '-';
950 inShift = 0;
951 } else {
952 inShift = 0;
953 }
954
955 }
956 else {
957 *out++ = '-';
958 inShift = 0;
959 }
960 }
961 }
962 }
963 }
964 if (bitsleft) {
965 *out++= B64(charsleft << (6-bitsleft) );
966 *out++ = '-';
967 }
968
Tim Peters5de98422002-04-27 18:44:32 +0000969 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000970 return v;
971}
972
973#undef SPECIAL
974#undef B64
975#undef B64CHAR
976#undef UB64
977#undef ENCODE
978#undef DECODE
979
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980/* --- UTF-8 Codec -------------------------------------------------------- */
981
/* Sequence length announced by each possible UTF-8 lead byte (see
   RFC 2279).  A zero entry marks a byte that is not a legal prefix. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: not a legal prefix */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1003
1004static
1005int utf8_decoding_error(const char **source,
1006 Py_UNICODE **dest,
1007 const char *errors,
1008 const char *details)
1009{
1010 if ((errors == NULL) ||
1011 (strcmp(errors,"strict") == 0)) {
1012 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001013 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 details);
1015 return -1;
1016 }
1017 else if (strcmp(errors,"ignore") == 0) {
1018 (*source)++;
1019 return 0;
1020 }
1021 else if (strcmp(errors,"replace") == 0) {
1022 (*source)++;
1023 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1024 (*dest)++;
1025 return 0;
1026 }
1027 else {
1028 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001029 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 errors);
1031 return -1;
1032 }
1033}
1034
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035PyObject *PyUnicode_DecodeUTF8(const char *s,
1036 int size,
1037 const char *errors)
1038{
1039 int n;
1040 const char *e;
1041 PyUnicodeObject *unicode;
1042 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001043 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044
1045 /* Note: size will always be longer than the resulting Unicode
1046 character count */
1047 unicode = _PyUnicode_New(size);
1048 if (!unicode)
1049 return NULL;
1050 if (size == 0)
1051 return (PyObject *)unicode;
1052
1053 /* Unpack UTF-8 encoded data */
1054 p = unicode->str;
1055 e = s + size;
1056
1057 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001058 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001061 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062 s++;
1063 continue;
1064 }
1065
1066 n = utf8_code_length[ch];
1067
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001068 if (s + n > e) {
1069 errmsg = "unexpected end of data";
1070 goto utf8Error;
1071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072
1073 switch (n) {
1074
1075 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "unexpected code byte";
1077 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001080 errmsg = "internal error";
1081 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082
1083 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001084 if ((s[1] & 0xc0) != 0x80) {
1085 errmsg = "invalid data";
1086 goto utf8Error;
1087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001089 if (ch < 0x80) {
1090 errmsg = "illegal encoding";
1091 goto utf8Error;
1092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 break;
1096
1097 case 3:
1098 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001099 (s[2] & 0xc0) != 0x80) {
1100 errmsg = "invalid data";
1101 goto utf8Error;
1102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001104 if (ch < 0x0800) {
1105 /* Note: UTF-8 encodings of surrogates are considered
1106 legal UTF-8 sequences;
1107
1108 XXX For wide builds (UCS-4) we should probably try
1109 to recombine the surrogates into a single code
1110 unit.
1111 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001112 errmsg = "illegal encoding";
1113 goto utf8Error;
1114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001116 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001117 break;
1118
1119 case 4:
1120 if ((s[1] & 0xc0) != 0x80 ||
1121 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001122 (s[3] & 0xc0) != 0x80) {
1123 errmsg = "invalid data";
1124 goto utf8Error;
1125 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001126 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1127 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1128 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001129 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001130 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001131 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001132 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001133 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 errmsg = "illegal encoding";
1135 goto utf8Error;
1136 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001137#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001138 *p++ = (Py_UNICODE)ch;
1139#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001140 /* compute and append the two surrogates: */
1141
1142 /* translate from 10000..10FFFF to 0..FFFF */
1143 ch -= 0x10000;
1144
1145 /* high surrogate = top 10 bits added to D800 */
1146 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1147
1148 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001149 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001150#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 break;
1152
1153 default:
1154 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 errmsg = "unsupported Unicode code range";
1156 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 }
1158 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 continue;
1160
1161 utf8Error:
1162 if (utf8_decoding_error(&s, &p, errors, errmsg))
1163 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
1165
1166 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001167 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 goto onError;
1169
1170 return (PyObject *)unicode;
1171
1172onError:
1173 Py_DECREF(unicode);
1174 return NULL;
1175}
1176
Tim Peters602f7402002-04-27 18:03:26 +00001177/* Allocation strategy: if the string is short, convert into a stack buffer
1178 and allocate exactly as much space needed at the end. Else allocate the
1179 maximum possible needed (4 result bytes per Unicode character), and return
1180 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001181*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001182PyObject *
1183PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1184 int size,
1185 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186{
Tim Peters602f7402002-04-27 18:03:26 +00001187#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001188
Tim Peters602f7402002-04-27 18:03:26 +00001189 int i; /* index into s of next input byte */
1190 PyObject *v; /* result string object */
1191 char *p; /* next free byte in output buffer */
1192 int nallocated; /* number of result bytes allocated */
1193 int nneeded; /* number of result bytes needed */
1194 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001195
Tim Peters602f7402002-04-27 18:03:26 +00001196 assert(s != NULL);
1197 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
Tim Peters602f7402002-04-27 18:03:26 +00001199 if (size <= MAX_SHORT_UNICHARS) {
1200 /* Write into the stack buffer; nallocated can't overflow.
1201 * At the end, we'll allocate exactly as much heap space as it
1202 * turns out we need.
1203 */
1204 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1205 v = NULL; /* will allocate after we're done */
1206 p = stackbuf;
1207 }
1208 else {
1209 /* Overallocate on the heap, and give the excess back at the end. */
1210 nallocated = size * 4;
1211 if (nallocated / 4 != size) /* overflow! */
1212 return PyErr_NoMemory();
1213 v = PyString_FromStringAndSize(NULL, nallocated);
1214 if (v == NULL)
1215 return NULL;
1216 p = PyString_AS_STRING(v);
1217 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001218
Tim Peters602f7402002-04-27 18:03:26 +00001219 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001221
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001222 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001223 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001227 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001228 *p++ = (char)(0xc0 | (ch >> 6));
1229 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001230 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001231 else {
Tim Peters602f7402002-04-27 18:03:26 +00001232 /* Encode UCS2 Unicode ordinals */
1233 if (ch < 0x10000) {
1234 /* Special case: check for high surrogate */
1235 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1236 Py_UCS4 ch2 = s[i];
1237 /* Check for low surrogate and combine the two to
1238 form a UCS4 value */
1239 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001240 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001241 i++;
1242 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 }
Tim Peters602f7402002-04-27 18:03:26 +00001244 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001247 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1248 *p++ = (char)(0x80 | (ch & 0x3f));
1249 continue;
1250 }
1251encodeUCS4:
1252 /* Encode UCS4 Unicode ordinals */
1253 *p++ = (char)(0xf0 | (ch >> 18));
1254 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1255 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1256 *p++ = (char)(0x80 | (ch & 0x3f));
1257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001259
Tim Peters602f7402002-04-27 18:03:26 +00001260 if (v == NULL) {
1261 /* This was stack allocated. */
1262 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1263 assert(nneeded <= nallocated);
1264 v = PyString_FromStringAndSize(stackbuf, nneeded);
1265 }
1266 else {
1267 /* Cut back to size actually needed. */
1268 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1269 assert(nneeded <= nallocated);
1270 _PyString_Resize(&v, nneeded);
1271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001273
Tim Peters602f7402002-04-27 18:03:26 +00001274#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1278{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 if (!PyUnicode_Check(unicode)) {
1280 PyErr_BadArgument();
1281 return NULL;
1282 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001283 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1284 PyUnicode_GET_SIZE(unicode),
1285 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286}
1287
1288/* --- UTF-16 Codec ------------------------------------------------------- */
1289
1290static
Tim Peters772747b2001-08-09 22:21:55 +00001291int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 const char *errors,
1293 const char *details)
1294{
1295 if ((errors == NULL) ||
1296 (strcmp(errors,"strict") == 0)) {
1297 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001298 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 details);
1300 return -1;
1301 }
1302 else if (strcmp(errors,"ignore") == 0) {
1303 return 0;
1304 }
1305 else if (strcmp(errors,"replace") == 0) {
1306 if (dest) {
1307 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1308 (*dest)++;
1309 }
1310 return 0;
1311 }
1312 else {
1313 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001314 "UTF-16 decoding error; "
1315 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 errors);
1317 return -1;
1318 }
1319}
1320
Tim Peters772747b2001-08-09 22:21:55 +00001321PyObject *
1322PyUnicode_DecodeUTF16(const char *s,
1323 int size,
1324 const char *errors,
1325 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326{
1327 PyUnicodeObject *unicode;
1328 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001329 const unsigned char *q, *e;
1330 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001331 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001332 /* Offsets from q for retrieving byte pairs in the right order. */
1333#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1334 int ihi = 1, ilo = 0;
1335#else
1336 int ihi = 0, ilo = 1;
1337#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
1339 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001340 if (size & 1) {
1341 if (utf16_decoding_error(NULL, errors, "truncated data"))
1342 return NULL;
1343 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 }
1345
1346 /* Note: size will always be longer than the resulting Unicode
1347 character count */
1348 unicode = _PyUnicode_New(size);
1349 if (!unicode)
1350 return NULL;
1351 if (size == 0)
1352 return (PyObject *)unicode;
1353
1354 /* Unpack UTF-16 encoded data */
1355 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001356 q = (unsigned char *)s;
1357 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
1359 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001360 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001362 /* Check for BOM marks (U+FEFF) in the input and adjust current
1363 byte order setting accordingly. In native mode, the leading BOM
1364 mark is skipped, in all other modes, it is copied to the output
1365 stream as-is (giving a ZWNBSP character). */
1366 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001367 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001368#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001369 if (bom == 0xFEFF) {
1370 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001371 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001372 }
1373 else if (bom == 0xFFFE) {
1374 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001375 bo = 1;
1376 }
1377#else
Tim Peters772747b2001-08-09 22:21:55 +00001378 if (bom == 0xFEFF) {
1379 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001380 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001381 }
1382 else if (bom == 0xFFFE) {
1383 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001384 bo = -1;
1385 }
1386#endif
1387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388
Tim Peters772747b2001-08-09 22:21:55 +00001389 if (bo == -1) {
1390 /* force LE */
1391 ihi = 1;
1392 ilo = 0;
1393 }
1394 else if (bo == 1) {
1395 /* force BE */
1396 ihi = 0;
1397 ilo = 1;
1398 }
1399
1400 while (q < e) {
1401 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1402 q += 2;
1403
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 if (ch < 0xD800 || ch > 0xDFFF) {
1405 *p++ = ch;
1406 continue;
1407 }
1408
1409 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001410 if (q >= e) {
1411 errmsg = "unexpected end of data";
1412 goto utf16Error;
1413 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001414 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001415 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1416 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001417 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001418#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001419 *p++ = ch;
1420 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001421#else
1422 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001423#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001424 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001425 }
1426 else {
1427 errmsg = "illegal UTF-16 surrogate";
1428 goto utf16Error;
1429 }
1430
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001432 errmsg = "illegal encoding";
1433 /* Fall through to report the error */
1434
1435 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001436 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 }
1439
1440 if (byteorder)
1441 *byteorder = bo;
1442
1443 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001444 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 goto onError;
1446
1447 return (PyObject *)unicode;
1448
1449onError:
1450 Py_DECREF(unicode);
1451 return NULL;
1452}
1453
Tim Peters772747b2001-08-09 22:21:55 +00001454PyObject *
1455PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1456 int size,
1457 const char *errors,
1458 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459{
1460 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001461 unsigned char *p;
1462 int i, pairs;
1463 /* Offsets from p for storing byte pairs in the right order. */
1464#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1465 int ihi = 1, ilo = 0;
1466#else
1467 int ihi = 0, ilo = 1;
1468#endif
1469
1470#define STORECHAR(CH) \
1471 do { \
1472 p[ihi] = ((CH) >> 8) & 0xff; \
1473 p[ilo] = (CH) & 0xff; \
1474 p += 2; \
1475 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001477 for (i = pairs = 0; i < size; i++)
1478 if (s[i] >= 0x10000)
1479 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001481 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 if (v == NULL)
1483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Tim Peters772747b2001-08-09 22:21:55 +00001485 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001487 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001488 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001489 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001490
1491 if (byteorder == -1) {
1492 /* force LE */
1493 ihi = 1;
1494 ilo = 0;
1495 }
1496 else if (byteorder == 1) {
1497 /* force BE */
1498 ihi = 0;
1499 ilo = 1;
1500 }
1501
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001502 while (size-- > 0) {
1503 Py_UNICODE ch = *s++;
1504 Py_UNICODE ch2 = 0;
1505 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001506 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1507 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508 }
Tim Peters772747b2001-08-09 22:21:55 +00001509 STORECHAR(ch);
1510 if (ch2)
1511 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001514#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515}
1516
1517PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1518{
1519 if (!PyUnicode_Check(unicode)) {
1520 PyErr_BadArgument();
1521 return NULL;
1522 }
1523 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1524 PyUnicode_GET_SIZE(unicode),
1525 NULL,
1526 0);
1527}
1528
1529/* --- Unicode Escape Codec ----------------------------------------------- */
1530
1531static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001532int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 const char *errors,
1534 const char *details)
1535{
1536 if ((errors == NULL) ||
1537 (strcmp(errors,"strict") == 0)) {
1538 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001539 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 details);
1541 return -1;
1542 }
1543 else if (strcmp(errors,"ignore") == 0) {
1544 return 0;
1545 }
1546 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001547 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1548 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 return 0;
1550 }
1551 else {
1552 PyErr_Format(PyExc_ValueError,
1553 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001554 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 errors);
1556 return -1;
1557 }
1558}
1559
Fredrik Lundh06d12682001-01-24 07:59:11 +00001560static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001561
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1563 int size,
1564 const char *errors)
1565{
1566 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001567 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001569 char* message;
1570 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1571
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572 /* Escaped strings will always be longer than the resulting
1573 Unicode string, so we start with size here and then reduce the
1574 length after conversion to the true value. */
1575 v = _PyUnicode_New(size);
1576 if (v == NULL)
1577 goto onError;
1578 if (size == 0)
1579 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001580
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 p = buf = PyUnicode_AS_UNICODE(v);
1582 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001583
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 while (s < end) {
1585 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001586 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001587 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588
1589 /* Non-escape characters are interpreted as Unicode ordinals */
1590 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001591 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 continue;
1593 }
1594
1595 /* \ - Escapes */
1596 s++;
1597 switch (*s++) {
1598
1599 /* \x escapes */
1600 case '\n': break;
1601 case '\\': *p++ = '\\'; break;
1602 case '\'': *p++ = '\''; break;
1603 case '\"': *p++ = '\"'; break;
1604 case 'b': *p++ = '\b'; break;
1605 case 'f': *p++ = '\014'; break; /* FF */
1606 case 't': *p++ = '\t'; break;
1607 case 'n': *p++ = '\n'; break;
1608 case 'r': *p++ = '\r'; break;
1609 case 'v': *p++ = '\013'; break; /* VT */
1610 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1611
1612 /* \OOO (octal) escapes */
1613 case '0': case '1': case '2': case '3':
1614 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001615 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001617 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001619 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001621 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 break;
1623
Fredrik Lundhccc74732001-02-18 22:13:49 +00001624 /* hex escapes */
1625 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001627 digits = 2;
1628 message = "truncated \\xXX escape";
1629 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 digits = 4;
1634 message = "truncated \\uXXXX escape";
1635 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636
Fredrik Lundhccc74732001-02-18 22:13:49 +00001637 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001638 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001639 digits = 8;
1640 message = "truncated \\UXXXXXXXX escape";
1641 hexescape:
1642 chr = 0;
1643 for (i = 0; i < digits; i++) {
1644 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001646 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001648 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 i++;
1650 break;
1651 }
1652 chr = (chr<<4) & ~0xF;
1653 if (c >= '0' && c <= '9')
1654 chr += c - '0';
1655 else if (c >= 'a' && c <= 'f')
1656 chr += 10 + c - 'a';
1657 else
1658 chr += 10 + c - 'A';
1659 }
1660 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001661 if (chr == 0xffffffff)
1662 /* _decoding_error will have already written into the
1663 target buffer. */
1664 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001665 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001666 /* when we get here, chr is a 32-bit unicode character */
1667 if (chr <= 0xffff)
1668 /* UCS-2 character */
1669 *p++ = (Py_UNICODE) chr;
1670 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001671 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001672 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001673#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001674 *p++ = chr;
1675#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001676 chr -= 0x10000L;
1677 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001678 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001679#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001680 } else {
1681 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001682 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001683 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001684 )
1685 goto onError;
1686 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001687 break;
1688
1689 /* \N{name} */
1690 case 'N':
1691 message = "malformed \\N character escape";
1692 if (ucnhash_CAPI == NULL) {
1693 /* load the unicode data module */
1694 PyObject *m, *v;
1695 m = PyImport_ImportModule("unicodedata");
1696 if (m == NULL)
1697 goto ucnhashError;
1698 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1699 Py_DECREF(m);
1700 if (v == NULL)
1701 goto ucnhashError;
1702 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1703 Py_DECREF(v);
1704 if (ucnhash_CAPI == NULL)
1705 goto ucnhashError;
1706 }
1707 if (*s == '{') {
1708 const char *start = s+1;
1709 /* look for the closing brace */
1710 while (*s != '}' && s < end)
1711 s++;
1712 if (s > start && s < end && *s == '}') {
1713 /* found a name. look it up in the unicode database */
1714 message = "unknown Unicode character name";
1715 s++;
1716 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1717 goto store;
1718 }
1719 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001720 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722 break;
1723
1724 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001725 if (s > end) {
1726 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1727 goto onError;
1728 }
1729 else {
1730 *p++ = '\\';
1731 *p++ = (unsigned char)s[-1];
1732 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 }
1735 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001736 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001737 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001739
Fredrik Lundhccc74732001-02-18 22:13:49 +00001740ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001741 PyErr_SetString(
1742 PyExc_UnicodeError,
1743 "\\N escapes not supported (can't load unicodedata module)"
1744 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001745 return NULL;
1746
Fredrik Lundhccc74732001-02-18 22:13:49 +00001747onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 Py_XDECREF(v);
1749 return NULL;
1750}
1751
1752/* Return a Unicode-Escape string version of the Unicode object.
1753
1754 If quotes is true, the string is enclosed in u"" or u'' quotes as
1755 appropriate.
1756
1757*/
1758
Barry Warsaw51ac5802000-03-20 16:36:48 +00001759static const Py_UNICODE *findchar(const Py_UNICODE *s,
1760 int size,
1761 Py_UNICODE ch);
1762
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763static
1764PyObject *unicodeescape_string(const Py_UNICODE *s,
1765 int size,
1766 int quotes)
1767{
1768 PyObject *repr;
1769 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001771 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772
1773 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1774 if (repr == NULL)
1775 return NULL;
1776
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001777 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778
1779 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 *p++ = 'u';
1781 *p++ = (findchar(s, size, '\'') &&
1782 !findchar(s, size, '"')) ? '"' : '\'';
1783 }
1784 while (size-- > 0) {
1785 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001786
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001788 if (quotes &&
1789 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 *p++ = '\\';
1791 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001792 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001794
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001795#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001796 /* Map 21-bit characters to '\U00xxxxxx' */
1797 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001798 int offset = p - PyString_AS_STRING(repr);
1799
1800 /* Resize the string if necessary */
1801 if (offset + 12 > PyString_GET_SIZE(repr)) {
1802 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001803 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001804 p = PyString_AS_STRING(repr) + offset;
1805 }
1806
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001807 *p++ = '\\';
1808 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001809 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1810 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1811 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1812 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1813 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1814 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1815 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001816 *p++ = hexdigit[ch & 0x0000000F];
1817 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001818 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001819#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001820 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1821 else if (ch >= 0xD800 && ch < 0xDC00) {
1822 Py_UNICODE ch2;
1823 Py_UCS4 ucs;
1824
1825 ch2 = *s++;
1826 size--;
1827 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1828 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1829 *p++ = '\\';
1830 *p++ = 'U';
1831 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1832 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1833 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1834 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1835 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1836 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1837 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1838 *p++ = hexdigit[ucs & 0x0000000F];
1839 continue;
1840 }
1841 /* Fall through: isolated surrogates are copied as-is */
1842 s--;
1843 size++;
1844 }
1845
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001847 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 *p++ = '\\';
1849 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001850 *p++ = hexdigit[(ch >> 12) & 0x000F];
1851 *p++ = hexdigit[(ch >> 8) & 0x000F];
1852 *p++ = hexdigit[(ch >> 4) & 0x000F];
1853 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001855
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001856 /* Map special whitespace to '\t', \n', '\r' */
1857 else if (ch == '\t') {
1858 *p++ = '\\';
1859 *p++ = 't';
1860 }
1861 else if (ch == '\n') {
1862 *p++ = '\\';
1863 *p++ = 'n';
1864 }
1865 else if (ch == '\r') {
1866 *p++ = '\\';
1867 *p++ = 'r';
1868 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001870 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001871 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001873 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001874 *p++ = hexdigit[(ch >> 4) & 0x000F];
1875 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001877
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* Copy everything else as-is */
1879 else
1880 *p++ = (char) ch;
1881 }
1882 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001883 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884
1885 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001886 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 return repr;
1888}
1889
1890PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1891 int size)
1892{
1893 return unicodeescape_string(s, size, 0);
1894}
1895
1896PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1897{
1898 if (!PyUnicode_Check(unicode)) {
1899 PyErr_BadArgument();
1900 return NULL;
1901 }
1902 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1903 PyUnicode_GET_SIZE(unicode));
1904}
1905
1906/* --- Raw Unicode Escape Codec ------------------------------------------- */
1907
1908PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1909 int size,
1910 const char *errors)
1911{
1912 PyUnicodeObject *v;
1913 Py_UNICODE *p, *buf;
1914 const char *end;
1915 const char *bs;
1916
1917 /* Escaped strings will always be longer than the resulting
1918 Unicode string, so we start with size here and then reduce the
1919 length after conversion to the true value. */
1920 v = _PyUnicode_New(size);
1921 if (v == NULL)
1922 goto onError;
1923 if (size == 0)
1924 return (PyObject *)v;
1925 p = buf = PyUnicode_AS_UNICODE(v);
1926 end = s + size;
1927 while (s < end) {
1928 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001929 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 int i;
1931
1932 /* Non-escape characters are interpreted as Unicode ordinals */
1933 if (*s != '\\') {
1934 *p++ = (unsigned char)*s++;
1935 continue;
1936 }
1937
1938 /* \u-escapes are only interpreted iff the number of leading
1939 backslashes if odd */
1940 bs = s;
1941 for (;s < end;) {
1942 if (*s != '\\')
1943 break;
1944 *p++ = (unsigned char)*s++;
1945 }
1946 if (((s - bs) & 1) == 0 ||
1947 s >= end ||
1948 *s != 'u') {
1949 continue;
1950 }
1951 p--;
1952 s++;
1953
1954 /* \uXXXX with 4 hex digits */
1955 for (x = 0, i = 0; i < 4; i++) {
1956 c = (unsigned char)s[i];
1957 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001958 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 "truncated \\uXXXX"))
1960 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001961 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 i++;
1963 break;
1964 }
1965 x = (x<<4) & ~0xF;
1966 if (c >= '0' && c <= '9')
1967 x += c - '0';
1968 else if (c >= 'a' && c <= 'f')
1969 x += 10 + c - 'a';
1970 else
1971 x += 10 + c - 'A';
1972 }
1973 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001974 if (x != 0xffffffff)
1975 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001977 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return (PyObject *)v;
1980
1981 onError:
1982 Py_XDECREF(v);
1983 return NULL;
1984}
1985
1986PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1987 int size)
1988{
1989 PyObject *repr;
1990 char *p;
1991 char *q;
1992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001993 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 repr = PyString_FromStringAndSize(NULL, 6 * size);
1996 if (repr == NULL)
1997 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001998 if (size == 0)
1999 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
2001 p = q = PyString_AS_STRING(repr);
2002 while (size-- > 0) {
2003 Py_UNICODE ch = *s++;
2004 /* Map 16-bit characters to '\uxxxx' */
2005 if (ch >= 256) {
2006 *p++ = '\\';
2007 *p++ = 'u';
2008 *p++ = hexdigit[(ch >> 12) & 0xf];
2009 *p++ = hexdigit[(ch >> 8) & 0xf];
2010 *p++ = hexdigit[(ch >> 4) & 0xf];
2011 *p++ = hexdigit[ch & 15];
2012 }
2013 /* Copy everything else as-is */
2014 else
2015 *p++ = (char) ch;
2016 }
2017 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002018 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 return repr;
2020}
2021
2022PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2023{
2024 if (!PyUnicode_Check(unicode)) {
2025 PyErr_BadArgument();
2026 return NULL;
2027 }
2028 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2029 PyUnicode_GET_SIZE(unicode));
2030}
2031
2032/* --- Latin-1 Codec ------------------------------------------------------ */
2033
2034PyObject *PyUnicode_DecodeLatin1(const char *s,
2035 int size,
2036 const char *errors)
2037{
2038 PyUnicodeObject *v;
2039 Py_UNICODE *p;
2040
2041 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002042 if (size == 1 && *(unsigned char*)s < 256) {
2043 Py_UNICODE r = *(unsigned char*)s;
2044 return PyUnicode_FromUnicode(&r, 1);
2045 }
2046
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 v = _PyUnicode_New(size);
2048 if (v == NULL)
2049 goto onError;
2050 if (size == 0)
2051 return (PyObject *)v;
2052 p = PyUnicode_AS_UNICODE(v);
2053 while (size-- > 0)
2054 *p++ = (unsigned char)*s++;
2055 return (PyObject *)v;
2056
2057 onError:
2058 Py_XDECREF(v);
2059 return NULL;
2060}
2061
2062static
2063int latin1_encoding_error(const Py_UNICODE **source,
2064 char **dest,
2065 const char *errors,
2066 const char *details)
2067{
2068 if ((errors == NULL) ||
2069 (strcmp(errors,"strict") == 0)) {
2070 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002071 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 details);
2073 return -1;
2074 }
2075 else if (strcmp(errors,"ignore") == 0) {
2076 return 0;
2077 }
2078 else if (strcmp(errors,"replace") == 0) {
2079 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002080 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return 0;
2082 }
2083 else {
2084 PyErr_Format(PyExc_ValueError,
2085 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002086 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 errors);
2088 return -1;
2089 }
2090}
2091
2092PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2093 int size,
2094 const char *errors)
2095{
2096 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002097 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 repr = PyString_FromStringAndSize(NULL, size);
2100 if (repr == NULL)
2101 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002102 if (size == 0)
2103 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
2105 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002106 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 while (size-- > 0) {
2108 Py_UNICODE ch = *p++;
2109 if (ch >= 256) {
2110 if (latin1_encoding_error(&p, &s, errors,
2111 "ordinal not in range(256)"))
2112 goto onError;
2113 }
2114 else
2115 *s++ = (char)ch;
2116 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002117 /* Resize if error handling skipped some characters */
2118 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002119 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 return repr;
2121
2122 onError:
2123 Py_DECREF(repr);
2124 return NULL;
2125}
2126
2127PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2128{
2129 if (!PyUnicode_Check(unicode)) {
2130 PyErr_BadArgument();
2131 return NULL;
2132 }
2133 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2134 PyUnicode_GET_SIZE(unicode),
2135 NULL);
2136}
2137
2138/* --- 7-bit ASCII Codec -------------------------------------------------- */
2139
2140static
2141int ascii_decoding_error(const char **source,
2142 Py_UNICODE **dest,
2143 const char *errors,
2144 const char *details)
2145{
2146 if ((errors == NULL) ||
2147 (strcmp(errors,"strict") == 0)) {
2148 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002149 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 details);
2151 return -1;
2152 }
2153 else if (strcmp(errors,"ignore") == 0) {
2154 return 0;
2155 }
2156 else if (strcmp(errors,"replace") == 0) {
2157 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2158 (*dest)++;
2159 return 0;
2160 }
2161 else {
2162 PyErr_Format(PyExc_ValueError,
2163 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 errors);
2166 return -1;
2167 }
2168}
2169
2170PyObject *PyUnicode_DecodeASCII(const char *s,
2171 int size,
2172 const char *errors)
2173{
2174 PyUnicodeObject *v;
2175 Py_UNICODE *p;
2176
2177 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002178 if (size == 1 && *(unsigned char*)s < 128) {
2179 Py_UNICODE r = *(unsigned char*)s;
2180 return PyUnicode_FromUnicode(&r, 1);
2181 }
2182
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 v = _PyUnicode_New(size);
2184 if (v == NULL)
2185 goto onError;
2186 if (size == 0)
2187 return (PyObject *)v;
2188 p = PyUnicode_AS_UNICODE(v);
2189 while (size-- > 0) {
2190 register unsigned char c;
2191
2192 c = (unsigned char)*s++;
2193 if (c < 128)
2194 *p++ = c;
2195 else if (ascii_decoding_error(&s, &p, errors,
2196 "ordinal not in range(128)"))
2197 goto onError;
2198 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002199 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 return (PyObject *)v;
2203
2204 onError:
2205 Py_XDECREF(v);
2206 return NULL;
2207}
2208
2209static
2210int ascii_encoding_error(const Py_UNICODE **source,
2211 char **dest,
2212 const char *errors,
2213 const char *details)
2214{
2215 if ((errors == NULL) ||
2216 (strcmp(errors,"strict") == 0)) {
2217 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002218 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 details);
2220 return -1;
2221 }
2222 else if (strcmp(errors,"ignore") == 0) {
2223 return 0;
2224 }
2225 else if (strcmp(errors,"replace") == 0) {
2226 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002227 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return 0;
2229 }
2230 else {
2231 PyErr_Format(PyExc_ValueError,
2232 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002233 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 errors);
2235 return -1;
2236 }
2237}
2238
2239PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2240 int size,
2241 const char *errors)
2242{
2243 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002244 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002245
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 repr = PyString_FromStringAndSize(NULL, size);
2247 if (repr == NULL)
2248 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002249 if (size == 0)
2250 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251
2252 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002253 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 while (size-- > 0) {
2255 Py_UNICODE ch = *p++;
2256 if (ch >= 128) {
2257 if (ascii_encoding_error(&p, &s, errors,
2258 "ordinal not in range(128)"))
2259 goto onError;
2260 }
2261 else
2262 *s++ = (char)ch;
2263 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002264 /* Resize if error handling skipped some characters */
2265 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002266 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 return repr;
2268
2269 onError:
2270 Py_DECREF(repr);
2271 return NULL;
2272}
2273
2274PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode),
2282 NULL);
2283}
2284
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002285#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002286
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002287/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002288
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002289PyObject *PyUnicode_DecodeMBCS(const char *s,
2290 int size,
2291 const char *errors)
2292{
2293 PyUnicodeObject *v;
2294 Py_UNICODE *p;
2295
2296 /* First get the size of the result */
2297 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002298 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002299 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2300
2301 v = _PyUnicode_New(usize);
2302 if (v == NULL)
2303 return NULL;
2304 if (usize == 0)
2305 return (PyObject *)v;
2306 p = PyUnicode_AS_UNICODE(v);
2307 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2308 Py_DECREF(v);
2309 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2310 }
2311
2312 return (PyObject *)v;
2313}
2314
2315PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2316 int size,
2317 const char *errors)
2318{
2319 PyObject *repr;
2320 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002321 DWORD mbcssize;
2322
2323 /* If there are no characters, bail now! */
2324 if (size==0)
2325 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002326
2327 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002328 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002329 if (mbcssize==0)
2330 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2331
2332 repr = PyString_FromStringAndSize(NULL, mbcssize);
2333 if (repr == NULL)
2334 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002335 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002336 return repr;
2337
2338 /* Do the conversion */
2339 s = PyString_AS_STRING(repr);
2340 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2341 Py_DECREF(repr);
2342 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2343 }
2344 return repr;
2345}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002346
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002347#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349/* --- Character Mapping Codec -------------------------------------------- */
2350
2351static
2352int charmap_decoding_error(const char **source,
2353 Py_UNICODE **dest,
2354 const char *errors,
2355 const char *details)
2356{
2357 if ((errors == NULL) ||
2358 (strcmp(errors,"strict") == 0)) {
2359 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002360 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361 details);
2362 return -1;
2363 }
2364 else if (strcmp(errors,"ignore") == 0) {
2365 return 0;
2366 }
2367 else if (strcmp(errors,"replace") == 0) {
2368 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2369 (*dest)++;
2370 return 0;
2371 }
2372 else {
2373 PyErr_Format(PyExc_ValueError,
2374 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002375 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 errors);
2377 return -1;
2378 }
2379}
2380
2381PyObject *PyUnicode_DecodeCharmap(const char *s,
2382 int size,
2383 PyObject *mapping,
2384 const char *errors)
2385{
2386 PyUnicodeObject *v;
2387 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002388 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
2390 /* Default to Latin-1 */
2391 if (mapping == NULL)
2392 return PyUnicode_DecodeLatin1(s, size, errors);
2393
2394 v = _PyUnicode_New(size);
2395 if (v == NULL)
2396 goto onError;
2397 if (size == 0)
2398 return (PyObject *)v;
2399 p = PyUnicode_AS_UNICODE(v);
2400 while (size-- > 0) {
2401 unsigned char ch = *s++;
2402 PyObject *w, *x;
2403
2404 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2405 w = PyInt_FromLong((long)ch);
2406 if (w == NULL)
2407 goto onError;
2408 x = PyObject_GetItem(mapping, w);
2409 Py_DECREF(w);
2410 if (x == NULL) {
2411 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002412 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002414 x = Py_None;
2415 Py_INCREF(x);
2416 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
2419
2420 /* Apply mapping */
2421 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002422 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 if (value < 0 || value > 65535) {
2424 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002425 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 Py_DECREF(x);
2427 goto onError;
2428 }
2429 *p++ = (Py_UNICODE)value;
2430 }
2431 else if (x == Py_None) {
2432 /* undefined mapping */
2433 if (charmap_decoding_error(&s, &p, errors,
2434 "character maps to <undefined>")) {
2435 Py_DECREF(x);
2436 goto onError;
2437 }
2438 }
2439 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002440 int targetsize = PyUnicode_GET_SIZE(x);
2441
2442 if (targetsize == 1)
2443 /* 1-1 mapping */
2444 *p++ = *PyUnicode_AS_UNICODE(x);
2445
2446 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002448 if (targetsize > extrachars) {
2449 /* resize first */
2450 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2451 int needed = (targetsize - extrachars) + \
2452 (targetsize << 2);
2453 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002454 if (_PyUnicode_Resize(&v,
2455 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002456 Py_DECREF(x);
2457 goto onError;
2458 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002459 p = PyUnicode_AS_UNICODE(v) + oldpos;
2460 }
2461 Py_UNICODE_COPY(p,
2462 PyUnicode_AS_UNICODE(x),
2463 targetsize);
2464 p += targetsize;
2465 extrachars -= targetsize;
2466 }
2467 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
2469 else {
2470 /* wrong return value */
2471 PyErr_SetString(PyExc_TypeError,
2472 "character mapping must return integer, None or unicode");
2473 Py_DECREF(x);
2474 goto onError;
2475 }
2476 Py_DECREF(x);
2477 }
2478 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002479 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 goto onError;
2481 return (PyObject *)v;
2482
2483 onError:
2484 Py_XDECREF(v);
2485 return NULL;
2486}
2487
2488static
2489int charmap_encoding_error(const Py_UNICODE **source,
2490 char **dest,
2491 const char *errors,
2492 const char *details)
2493{
2494 if ((errors == NULL) ||
2495 (strcmp(errors,"strict") == 0)) {
2496 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002497 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 details);
2499 return -1;
2500 }
2501 else if (strcmp(errors,"ignore") == 0) {
2502 return 0;
2503 }
2504 else if (strcmp(errors,"replace") == 0) {
2505 **dest = '?';
2506 (*dest)++;
2507 return 0;
2508 }
2509 else {
2510 PyErr_Format(PyExc_ValueError,
2511 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002512 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 errors);
2514 return -1;
2515 }
2516}
2517
2518PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2519 int size,
2520 PyObject *mapping,
2521 const char *errors)
2522{
2523 PyObject *v;
2524 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002525 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526
2527 /* Default to Latin-1 */
2528 if (mapping == NULL)
2529 return PyUnicode_EncodeLatin1(p, size, errors);
2530
2531 v = PyString_FromStringAndSize(NULL, size);
2532 if (v == NULL)
2533 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002534 if (size == 0)
2535 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 s = PyString_AS_STRING(v);
2537 while (size-- > 0) {
2538 Py_UNICODE ch = *p++;
2539 PyObject *w, *x;
2540
2541 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2542 w = PyInt_FromLong((long)ch);
2543 if (w == NULL)
2544 goto onError;
2545 x = PyObject_GetItem(mapping, w);
2546 Py_DECREF(w);
2547 if (x == NULL) {
2548 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002549 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002551 x = Py_None;
2552 Py_INCREF(x);
2553 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002554 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 }
2556
2557 /* Apply mapping */
2558 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002559 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 if (value < 0 || value > 255) {
2561 PyErr_SetString(PyExc_TypeError,
2562 "character mapping must be in range(256)");
2563 Py_DECREF(x);
2564 goto onError;
2565 }
2566 *s++ = (char)value;
2567 }
2568 else if (x == Py_None) {
2569 /* undefined mapping */
2570 if (charmap_encoding_error(&p, &s, errors,
2571 "character maps to <undefined>")) {
2572 Py_DECREF(x);
2573 goto onError;
2574 }
2575 }
2576 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002577 int targetsize = PyString_GET_SIZE(x);
2578
2579 if (targetsize == 1)
2580 /* 1-1 mapping */
2581 *s++ = *PyString_AS_STRING(x);
2582
2583 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002585 if (targetsize > extrachars) {
2586 /* resize first */
2587 int oldpos = (int)(s - PyString_AS_STRING(v));
2588 int needed = (targetsize - extrachars) + \
2589 (targetsize << 2);
2590 extrachars += needed;
2591 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002592 Py_DECREF(x);
2593 goto onError;
2594 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002595 s = PyString_AS_STRING(v) + oldpos;
2596 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002597 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002598 s += targetsize;
2599 extrachars -= targetsize;
2600 }
2601 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
2603 else {
2604 /* wrong return value */
2605 PyErr_SetString(PyExc_TypeError,
2606 "character mapping must return integer, None or unicode");
2607 Py_DECREF(x);
2608 goto onError;
2609 }
2610 Py_DECREF(x);
2611 }
2612 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
Tim Peters5de98422002-04-27 18:44:32 +00002613 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 return v;
2615
2616 onError:
Tim Peters5de98422002-04-27 18:44:32 +00002617 Py_XDECREF(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 return NULL;
2619}
2620
2621PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2622 PyObject *mapping)
2623{
2624 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2625 PyErr_BadArgument();
2626 return NULL;
2627 }
2628 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2629 PyUnicode_GET_SIZE(unicode),
2630 mapping,
2631 NULL);
2632}
2633
2634static
2635int translate_error(const Py_UNICODE **source,
2636 Py_UNICODE **dest,
2637 const char *errors,
2638 const char *details)
2639{
2640 if ((errors == NULL) ||
2641 (strcmp(errors,"strict") == 0)) {
2642 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002643 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 details);
2645 return -1;
2646 }
2647 else if (strcmp(errors,"ignore") == 0) {
2648 return 0;
2649 }
2650 else if (strcmp(errors,"replace") == 0) {
2651 **dest = '?';
2652 (*dest)++;
2653 return 0;
2654 }
2655 else {
2656 PyErr_Format(PyExc_ValueError,
2657 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002658 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 errors);
2660 return -1;
2661 }
2662}
2663
2664PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2665 int size,
2666 PyObject *mapping,
2667 const char *errors)
2668{
2669 PyUnicodeObject *v;
2670 Py_UNICODE *p;
2671
2672 if (mapping == NULL) {
2673 PyErr_BadArgument();
2674 return NULL;
2675 }
2676
2677 /* Output will never be longer than input */
2678 v = _PyUnicode_New(size);
2679 if (v == NULL)
2680 goto onError;
2681 if (size == 0)
2682 goto done;
2683 p = PyUnicode_AS_UNICODE(v);
2684 while (size-- > 0) {
2685 Py_UNICODE ch = *s++;
2686 PyObject *w, *x;
2687
2688 /* Get mapping */
2689 w = PyInt_FromLong(ch);
2690 if (w == NULL)
2691 goto onError;
2692 x = PyObject_GetItem(mapping, w);
2693 Py_DECREF(w);
2694 if (x == NULL) {
2695 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2696 /* No mapping found: default to 1-1 mapping */
2697 PyErr_Clear();
2698 *p++ = ch;
2699 continue;
2700 }
2701 goto onError;
2702 }
2703
2704 /* Apply mapping */
2705 if (PyInt_Check(x))
2706 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2707 else if (x == Py_None) {
2708 /* undefined mapping */
2709 if (translate_error(&s, &p, errors,
2710 "character maps to <undefined>")) {
2711 Py_DECREF(x);
2712 goto onError;
2713 }
2714 }
2715 else if (PyUnicode_Check(x)) {
2716 if (PyUnicode_GET_SIZE(x) != 1) {
2717 /* 1-n mapping */
2718 PyErr_SetString(PyExc_NotImplementedError,
2719 "1-n mappings are currently not implemented");
2720 Py_DECREF(x);
2721 goto onError;
2722 }
2723 *p++ = *PyUnicode_AS_UNICODE(x);
2724 }
2725 else {
2726 /* wrong return value */
2727 PyErr_SetString(PyExc_TypeError,
2728 "translate mapping must return integer, None or unicode");
2729 Py_DECREF(x);
2730 goto onError;
2731 }
2732 Py_DECREF(x);
2733 }
2734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002735 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
2738 done:
2739 return (PyObject *)v;
2740
2741 onError:
2742 Py_XDECREF(v);
2743 return NULL;
2744}
2745
2746PyObject *PyUnicode_Translate(PyObject *str,
2747 PyObject *mapping,
2748 const char *errors)
2749{
2750 PyObject *result;
2751
2752 str = PyUnicode_FromObject(str);
2753 if (str == NULL)
2754 goto onError;
2755 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2756 PyUnicode_GET_SIZE(str),
2757 mapping,
2758 errors);
2759 Py_DECREF(str);
2760 return result;
2761
2762 onError:
2763 Py_XDECREF(str);
2764 return NULL;
2765}
2766
Guido van Rossum9e896b32000-04-05 20:11:21 +00002767/* --- Decimal Encoder ---------------------------------------------------- */
2768
2769int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2770 int length,
2771 char *output,
2772 const char *errors)
2773{
2774 Py_UNICODE *p, *end;
2775
2776 if (output == NULL) {
2777 PyErr_BadArgument();
2778 return -1;
2779 }
2780
2781 p = s;
2782 end = s + length;
2783 while (p < end) {
2784 register Py_UNICODE ch = *p++;
2785 int decimal;
2786
2787 if (Py_UNICODE_ISSPACE(ch)) {
2788 *output++ = ' ';
2789 continue;
2790 }
2791 decimal = Py_UNICODE_TODECIMAL(ch);
2792 if (decimal >= 0) {
2793 *output++ = '0' + decimal;
2794 continue;
2795 }
Guido van Rossumba477042000-04-06 18:18:10 +00002796 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002797 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002798 continue;
2799 }
2800 /* All other characters are considered invalid */
2801 if (errors == NULL || strcmp(errors, "strict") == 0) {
2802 PyErr_SetString(PyExc_ValueError,
2803 "invalid decimal Unicode string");
2804 goto onError;
2805 }
2806 else if (strcmp(errors, "ignore") == 0)
2807 continue;
2808 else if (strcmp(errors, "replace") == 0) {
2809 *output++ = '?';
2810 continue;
2811 }
2812 }
2813 /* 0-terminate the output string */
2814 *output++ = '\0';
2815 return 0;
2816
2817 onError:
2818 return -1;
2819}
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821/* --- Helpers ------------------------------------------------------------ */
2822
2823static
2824int count(PyUnicodeObject *self,
2825 int start,
2826 int end,
2827 PyUnicodeObject *substring)
2828{
2829 int count = 0;
2830
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002831 if (start < 0)
2832 start += self->length;
2833 if (start < 0)
2834 start = 0;
2835 if (end > self->length)
2836 end = self->length;
2837 if (end < 0)
2838 end += self->length;
2839 if (end < 0)
2840 end = 0;
2841
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002842 if (substring->length == 0)
2843 return (end - start + 1);
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 end -= substring->length;
2846
2847 while (start <= end)
2848 if (Py_UNICODE_MATCH(self, start, substring)) {
2849 count++;
2850 start += substring->length;
2851 } else
2852 start++;
2853
2854 return count;
2855}
2856
2857int PyUnicode_Count(PyObject *str,
2858 PyObject *substr,
2859 int start,
2860 int end)
2861{
2862 int result;
2863
2864 str = PyUnicode_FromObject(str);
2865 if (str == NULL)
2866 return -1;
2867 substr = PyUnicode_FromObject(substr);
2868 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002869 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 return -1;
2871 }
2872
2873 result = count((PyUnicodeObject *)str,
2874 start, end,
2875 (PyUnicodeObject *)substr);
2876
2877 Py_DECREF(str);
2878 Py_DECREF(substr);
2879 return result;
2880}
2881
2882static
2883int findstring(PyUnicodeObject *self,
2884 PyUnicodeObject *substring,
2885 int start,
2886 int end,
2887 int direction)
2888{
2889 if (start < 0)
2890 start += self->length;
2891 if (start < 0)
2892 start = 0;
2893
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 if (end > self->length)
2895 end = self->length;
2896 if (end < 0)
2897 end += self->length;
2898 if (end < 0)
2899 end = 0;
2900
Guido van Rossum76afbd92002-08-20 17:29:29 +00002901 if (substring->length == 0)
2902 return (direction > 0) ? start : end;
2903
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 end -= substring->length;
2905
2906 if (direction < 0) {
2907 for (; end >= start; end--)
2908 if (Py_UNICODE_MATCH(self, end, substring))
2909 return end;
2910 } else {
2911 for (; start <= end; start++)
2912 if (Py_UNICODE_MATCH(self, start, substring))
2913 return start;
2914 }
2915
2916 return -1;
2917}
2918
2919int PyUnicode_Find(PyObject *str,
2920 PyObject *substr,
2921 int start,
2922 int end,
2923 int direction)
2924{
2925 int result;
2926
2927 str = PyUnicode_FromObject(str);
2928 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002929 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 substr = PyUnicode_FromObject(substr);
2931 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00002932 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002933 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 }
2935
2936 result = findstring((PyUnicodeObject *)str,
2937 (PyUnicodeObject *)substr,
2938 start, end, direction);
2939 Py_DECREF(str);
2940 Py_DECREF(substr);
2941 return result;
2942}
2943
2944static
2945int tailmatch(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int start,
2948 int end,
2949 int direction)
2950{
2951 if (start < 0)
2952 start += self->length;
2953 if (start < 0)
2954 start = 0;
2955
2956 if (substring->length == 0)
2957 return 1;
2958
2959 if (end > self->length)
2960 end = self->length;
2961 if (end < 0)
2962 end += self->length;
2963 if (end < 0)
2964 end = 0;
2965
2966 end -= substring->length;
2967 if (end < start)
2968 return 0;
2969
2970 if (direction > 0) {
2971 if (Py_UNICODE_MATCH(self, end, substring))
2972 return 1;
2973 } else {
2974 if (Py_UNICODE_MATCH(self, start, substring))
2975 return 1;
2976 }
2977
2978 return 0;
2979}
2980
2981int PyUnicode_Tailmatch(PyObject *str,
2982 PyObject *substr,
2983 int start,
2984 int end,
2985 int direction)
2986{
2987 int result;
2988
2989 str = PyUnicode_FromObject(str);
2990 if (str == NULL)
2991 return -1;
2992 substr = PyUnicode_FromObject(substr);
2993 if (substr == NULL) {
2994 Py_DECREF(substr);
2995 return -1;
2996 }
2997
2998 result = tailmatch((PyUnicodeObject *)str,
2999 (PyUnicodeObject *)substr,
3000 start, end, direction);
3001 Py_DECREF(str);
3002 Py_DECREF(substr);
3003 return result;
3004}
3005
3006static
3007const Py_UNICODE *findchar(const Py_UNICODE *s,
3008 int size,
3009 Py_UNICODE ch)
3010{
3011 /* like wcschr, but doesn't stop at NULL characters */
3012
3013 while (size-- > 0) {
3014 if (*s == ch)
3015 return s;
3016 s++;
3017 }
3018
3019 return NULL;
3020}
3021
3022/* Apply fixfct filter to the Unicode object self and return a
3023 reference to the modified object */
3024
3025static
3026PyObject *fixup(PyUnicodeObject *self,
3027 int (*fixfct)(PyUnicodeObject *s))
3028{
3029
3030 PyUnicodeObject *u;
3031
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003032 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 if (u == NULL)
3034 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003035
3036 Py_UNICODE_COPY(u->str, self->str, self->length);
3037
Tim Peters7a29bd52001-09-12 03:03:31 +00003038 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 /* fixfct should return TRUE if it modified the buffer. If
3040 FALSE, return a reference to the original buffer instead
3041 (to save space, not time) */
3042 Py_INCREF(self);
3043 Py_DECREF(u);
3044 return (PyObject*) self;
3045 }
3046 return (PyObject*) u;
3047}
3048
3049static
3050int fixupper(PyUnicodeObject *self)
3051{
3052 int len = self->length;
3053 Py_UNICODE *s = self->str;
3054 int status = 0;
3055
3056 while (len-- > 0) {
3057 register Py_UNICODE ch;
3058
3059 ch = Py_UNICODE_TOUPPER(*s);
3060 if (ch != *s) {
3061 status = 1;
3062 *s = ch;
3063 }
3064 s++;
3065 }
3066
3067 return status;
3068}
3069
3070static
3071int fixlower(PyUnicodeObject *self)
3072{
3073 int len = self->length;
3074 Py_UNICODE *s = self->str;
3075 int status = 0;
3076
3077 while (len-- > 0) {
3078 register Py_UNICODE ch;
3079
3080 ch = Py_UNICODE_TOLOWER(*s);
3081 if (ch != *s) {
3082 status = 1;
3083 *s = ch;
3084 }
3085 s++;
3086 }
3087
3088 return status;
3089}
3090
3091static
3092int fixswapcase(PyUnicodeObject *self)
3093{
3094 int len = self->length;
3095 Py_UNICODE *s = self->str;
3096 int status = 0;
3097
3098 while (len-- > 0) {
3099 if (Py_UNICODE_ISUPPER(*s)) {
3100 *s = Py_UNICODE_TOLOWER(*s);
3101 status = 1;
3102 } else if (Py_UNICODE_ISLOWER(*s)) {
3103 *s = Py_UNICODE_TOUPPER(*s);
3104 status = 1;
3105 }
3106 s++;
3107 }
3108
3109 return status;
3110}
3111
3112static
3113int fixcapitalize(PyUnicodeObject *self)
3114{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003115 int len = self->length;
3116 Py_UNICODE *s = self->str;
3117 int status = 0;
3118
3119 if (len == 0)
3120 return 0;
3121 if (Py_UNICODE_ISLOWER(*s)) {
3122 *s = Py_UNICODE_TOUPPER(*s);
3123 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003125 s++;
3126 while (--len > 0) {
3127 if (Py_UNICODE_ISUPPER(*s)) {
3128 *s = Py_UNICODE_TOLOWER(*s);
3129 status = 1;
3130 }
3131 s++;
3132 }
3133 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134}
3135
3136static
3137int fixtitle(PyUnicodeObject *self)
3138{
3139 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3140 register Py_UNICODE *e;
3141 int previous_is_cased;
3142
3143 /* Shortcut for single character strings */
3144 if (PyUnicode_GET_SIZE(self) == 1) {
3145 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3146 if (*p != ch) {
3147 *p = ch;
3148 return 1;
3149 }
3150 else
3151 return 0;
3152 }
3153
3154 e = p + PyUnicode_GET_SIZE(self);
3155 previous_is_cased = 0;
3156 for (; p < e; p++) {
3157 register const Py_UNICODE ch = *p;
3158
3159 if (previous_is_cased)
3160 *p = Py_UNICODE_TOLOWER(ch);
3161 else
3162 *p = Py_UNICODE_TOTITLE(ch);
3163
3164 if (Py_UNICODE_ISLOWER(ch) ||
3165 Py_UNICODE_ISUPPER(ch) ||
3166 Py_UNICODE_ISTITLE(ch))
3167 previous_is_cased = 1;
3168 else
3169 previous_is_cased = 0;
3170 }
3171 return 1;
3172}
3173
3174PyObject *PyUnicode_Join(PyObject *separator,
3175 PyObject *seq)
3176{
3177 Py_UNICODE *sep;
3178 int seplen;
3179 PyUnicodeObject *res = NULL;
3180 int reslen = 0;
3181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 int sz = 100;
3183 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003184 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
Tim Peters2cfe3682001-05-05 05:36:48 +00003186 it = PyObject_GetIter(seq);
3187 if (it == NULL)
3188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189
3190 if (separator == NULL) {
3191 Py_UNICODE blank = ' ';
3192 sep = &blank;
3193 seplen = 1;
3194 }
3195 else {
3196 separator = PyUnicode_FromObject(separator);
3197 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 sep = PyUnicode_AS_UNICODE(separator);
3200 seplen = PyUnicode_GET_SIZE(separator);
3201 }
3202
3203 res = _PyUnicode_New(sz);
3204 if (res == NULL)
3205 goto onError;
3206 p = PyUnicode_AS_UNICODE(res);
3207 reslen = 0;
3208
Tim Peters2cfe3682001-05-05 05:36:48 +00003209 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003211 PyObject *item = PyIter_Next(it);
3212 if (item == NULL) {
3213 if (PyErr_Occurred())
3214 goto onError;
3215 break;
3216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 if (!PyUnicode_Check(item)) {
3218 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003219 if (!PyString_Check(item)) {
3220 PyErr_Format(PyExc_TypeError,
3221 "sequence item %i: expected string or Unicode,"
3222 " %.80s found",
3223 i, item->ob_type->tp_name);
3224 Py_DECREF(item);
3225 goto onError;
3226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 v = PyUnicode_FromObject(item);
3228 Py_DECREF(item);
3229 item = v;
3230 if (item == NULL)
3231 goto onError;
3232 }
3233 itemlen = PyUnicode_GET_SIZE(item);
3234 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003235 if (_PyUnicode_Resize(&res, sz*2)) {
3236 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 sz *= 2;
3240 p = PyUnicode_AS_UNICODE(res) + reslen;
3241 }
3242 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003243 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 p += seplen;
3245 reslen += seplen;
3246 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003247 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 p += itemlen;
3249 reslen += itemlen;
3250 Py_DECREF(item);
3251 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003252 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 goto onError;
3254
3255 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003256 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 return (PyObject *)res;
3258
3259 onError:
3260 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003261 Py_XDECREF(res);
3262 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 return NULL;
3264}
3265
3266static
3267PyUnicodeObject *pad(PyUnicodeObject *self,
3268 int left,
3269 int right,
3270 Py_UNICODE fill)
3271{
3272 PyUnicodeObject *u;
3273
3274 if (left < 0)
3275 left = 0;
3276 if (right < 0)
3277 right = 0;
3278
Tim Peters7a29bd52001-09-12 03:03:31 +00003279 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 Py_INCREF(self);
3281 return self;
3282 }
3283
3284 u = _PyUnicode_New(left + self->length + right);
3285 if (u) {
3286 if (left)
3287 Py_UNICODE_FILL(u->str, fill, left);
3288 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3289 if (right)
3290 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3291 }
3292
3293 return u;
3294}
3295
/* Append the slice data[left:right] to the list being built.

   The macro expands inside the split helpers and relies on three names
   from the enclosing function: a PyObject *str scratch variable, the
   PyObject *list receiving the pieces, and an onError label that is
   jumped to if the slice cannot be created or the append fails.  The
   temporary reference held in str is released on every path.

   The body is wrapped in do { } while (0) so that an invocation followed
   by ';' is exactly one statement, and the arguments are parenthesized.
   left is evaluated twice, so it must not have side effects. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3306
3307static
3308PyObject *split_whitespace(PyUnicodeObject *self,
3309 PyObject *list,
3310 int maxcount)
3311{
3312 register int i;
3313 register int j;
3314 int len = self->length;
3315 PyObject *str;
3316
3317 for (i = j = 0; i < len; ) {
3318 /* find a token */
3319 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3320 i++;
3321 j = i;
3322 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3323 i++;
3324 if (j < i) {
3325 if (maxcount-- <= 0)
3326 break;
3327 SPLIT_APPEND(self->str, j, i);
3328 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3329 i++;
3330 j = i;
3331 }
3332 }
3333 if (j < len) {
3334 SPLIT_APPEND(self->str, j, len);
3335 }
3336 return list;
3337
3338 onError:
3339 Py_DECREF(list);
3340 return NULL;
3341}
3342
3343PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003344 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345{
3346 register int i;
3347 register int j;
3348 int len;
3349 PyObject *list;
3350 PyObject *str;
3351 Py_UNICODE *data;
3352
3353 string = PyUnicode_FromObject(string);
3354 if (string == NULL)
3355 return NULL;
3356 data = PyUnicode_AS_UNICODE(string);
3357 len = PyUnicode_GET_SIZE(string);
3358
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 list = PyList_New(0);
3360 if (!list)
3361 goto onError;
3362
3363 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003364 int eol;
3365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 /* Find a line and append it */
3367 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3368 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369
3370 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003371 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 if (i < len) {
3373 if (data[i] == '\r' && i + 1 < len &&
3374 data[i+1] == '\n')
3375 i += 2;
3376 else
3377 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003378 if (keepends)
3379 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
Guido van Rossum86662912000-04-11 15:38:46 +00003381 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 j = i;
3383 }
3384 if (j < len) {
3385 SPLIT_APPEND(data, j, len);
3386 }
3387
3388 Py_DECREF(string);
3389 return list;
3390
3391 onError:
3392 Py_DECREF(list);
3393 Py_DECREF(string);
3394 return NULL;
3395}
3396
3397static
3398PyObject *split_char(PyUnicodeObject *self,
3399 PyObject *list,
3400 Py_UNICODE ch,
3401 int maxcount)
3402{
3403 register int i;
3404 register int j;
3405 int len = self->length;
3406 PyObject *str;
3407
3408 for (i = j = 0; i < len; ) {
3409 if (self->str[i] == ch) {
3410 if (maxcount-- <= 0)
3411 break;
3412 SPLIT_APPEND(self->str, j, i);
3413 i = j = i + 1;
3414 } else
3415 i++;
3416 }
3417 if (j <= len) {
3418 SPLIT_APPEND(self->str, j, len);
3419 }
3420 return list;
3421
3422 onError:
3423 Py_DECREF(list);
3424 return NULL;
3425}
3426
3427static
3428PyObject *split_substring(PyUnicodeObject *self,
3429 PyObject *list,
3430 PyUnicodeObject *substring,
3431 int maxcount)
3432{
3433 register int i;
3434 register int j;
3435 int len = self->length;
3436 int sublen = substring->length;
3437 PyObject *str;
3438
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003439 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 if (Py_UNICODE_MATCH(self, i, substring)) {
3441 if (maxcount-- <= 0)
3442 break;
3443 SPLIT_APPEND(self->str, j, i);
3444 i = j = i + sublen;
3445 } else
3446 i++;
3447 }
3448 if (j <= len) {
3449 SPLIT_APPEND(self->str, j, len);
3450 }
3451 return list;
3452
3453 onError:
3454 Py_DECREF(list);
3455 return NULL;
3456}
3457
3458#undef SPLIT_APPEND
3459
3460static
3461PyObject *split(PyUnicodeObject *self,
3462 PyUnicodeObject *substring,
3463 int maxcount)
3464{
3465 PyObject *list;
3466
3467 if (maxcount < 0)
3468 maxcount = INT_MAX;
3469
3470 list = PyList_New(0);
3471 if (!list)
3472 return NULL;
3473
3474 if (substring == NULL)
3475 return split_whitespace(self,list,maxcount);
3476
3477 else if (substring->length == 1)
3478 return split_char(self,list,substring->str[0],maxcount);
3479
3480 else if (substring->length == 0) {
3481 Py_DECREF(list);
3482 PyErr_SetString(PyExc_ValueError, "empty separator");
3483 return NULL;
3484 }
3485 else
3486 return split_substring(self,list,substring,maxcount);
3487}
3488
3489static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490PyObject *replace(PyUnicodeObject *self,
3491 PyUnicodeObject *str1,
3492 PyUnicodeObject *str2,
3493 int maxcount)
3494{
3495 PyUnicodeObject *u;
3496
3497 if (maxcount < 0)
3498 maxcount = INT_MAX;
3499
3500 if (str1->length == 1 && str2->length == 1) {
3501 int i;
3502
3503 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003504 if (!findchar(self->str, self->length, str1->str[0]) &&
3505 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 /* nothing to replace, return original string */
3507 Py_INCREF(self);
3508 u = self;
3509 } else {
3510 Py_UNICODE u1 = str1->str[0];
3511 Py_UNICODE u2 = str2->str[0];
3512
3513 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003514 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 self->length
3516 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003517 if (u != NULL) {
3518 Py_UNICODE_COPY(u->str, self->str,
3519 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 for (i = 0; i < u->length; i++)
3521 if (u->str[i] == u1) {
3522 if (--maxcount < 0)
3523 break;
3524 u->str[i] = u2;
3525 }
3526 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528
3529 } else {
3530 int n, i;
3531 Py_UNICODE *p;
3532
3533 /* replace strings */
3534 n = count(self, 0, self->length, str1);
3535 if (n > maxcount)
3536 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00003537 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00003539 if (PyUnicode_CheckExact(self)) {
3540 Py_INCREF(self);
3541 u = self;
3542 }
3543 else {
3544 u = (PyUnicodeObject *)
3545 PyUnicode_FromUnicode(self->str, self->length);
3546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 } else {
3548 u = _PyUnicode_New(
3549 self->length + n * (str2->length - str1->length));
3550 if (u) {
3551 i = 0;
3552 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00003553 if (str1->length > 0) {
3554 while (i <= self->length - str1->length)
3555 if (Py_UNICODE_MATCH(self, i, str1)) {
3556 /* replace string segment */
3557 Py_UNICODE_COPY(p, str2->str, str2->length);
3558 p += str2->length;
3559 i += str1->length;
3560 if (--n <= 0) {
3561 /* copy remaining part */
3562 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3563 break;
3564 }
3565 } else
3566 *p++ = self->str[i++];
3567 } else {
3568 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 Py_UNICODE_COPY(p, str2->str, str2->length);
3570 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00003571 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00003574 }
3575 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 }
3578 }
3579 }
3580
3581 return (PyObject *) u;
3582}
3583
3584/* --- Unicode Object Methods --------------------------------------------- */
3585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003586PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587"S.title() -> unicode\n\
3588\n\
3589Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003590characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591
3592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003593unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 return fixup(self, fixtitle);
3596}
3597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003598PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599"S.capitalize() -> unicode\n\
3600\n\
3601Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003602have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603
3604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003605unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 return fixup(self, fixcapitalize);
3608}
3609
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, runs
   fixup(..., fixcapitalize) on every word, and joins the words again
   with the default separator of PyUnicode_Join().  Returns NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
3647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003648PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649"S.center(width) -> unicode\n\
3650\n\
3651Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003652using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653
3654static PyObject *
3655unicode_center(PyUnicodeObject *self, PyObject *args)
3656{
3657 int marg, left;
3658 int width;
3659
3660 if (!PyArg_ParseTuple(args, "i:center", &width))
3661 return NULL;
3662
Tim Peters7a29bd52001-09-12 03:03:31 +00003663 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 Py_INCREF(self);
3665 return (PyObject*) self;
3666 }
3667
3668 marg = width - self->length;
3669 left = marg / 2 + (marg & width & 1);
3670
3671 return (PyObject*) pad(self, left, marg - left, ' ');
3672}
3673
Marc-André Lemburge5034372000-08-08 08:04:29 +00003674#if 0
3675
3676/* This code should go into some future Unicode collation support
3677 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003678 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003679
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003680/* speedy UTF-16 code point order comparison */
3681/* gleaned from: */
3682/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3683
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003684static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003685{
3686 0, 0, 0, 0, 0, 0, 0, 0,
3687 0, 0, 0, 0, 0, 0, 0, 0,
3688 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003689 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003690};
3691
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692static int
3693unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3694{
3695 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 Py_UNICODE *s1 = str1->str;
3698 Py_UNICODE *s2 = str2->str;
3699
3700 len1 = str1->length;
3701 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003704 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003705
3706 c1 = *s1++;
3707 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003708
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003709 if (c1 > (1<<11) * 26)
3710 c1 += utf16Fixup[c1>>11];
3711 if (c2 > (1<<11) * 26)
3712 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003713 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003714
3715 if (c1 != c2)
3716 return (c1 < c2) ? -1 : 1;
3717
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003718 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 }
3720
3721 return (len1 < len2) ? -1 : (len1 != len2);
3722}
3723
Marc-André Lemburge5034372000-08-08 08:04:29 +00003724#else
3725
3726static int
3727unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3728{
3729 register int len1, len2;
3730
3731 Py_UNICODE *s1 = str1->str;
3732 Py_UNICODE *s2 = str2->str;
3733
3734 len1 = str1->length;
3735 len2 = str2->length;
3736
3737 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003738 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003739
Fredrik Lundh45714e92001-06-26 16:39:36 +00003740 c1 = *s1++;
3741 c2 = *s2++;
3742
3743 if (c1 != c2)
3744 return (c1 < c2) ? -1 : 1;
3745
Marc-André Lemburge5034372000-08-08 08:04:29 +00003746 len1--; len2--;
3747 }
3748
3749 return (len1 < len2) ? -1 : (len1 != len2);
3750}
3751
3752#endif
3753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754int PyUnicode_Compare(PyObject *left,
3755 PyObject *right)
3756{
3757 PyUnicodeObject *u = NULL, *v = NULL;
3758 int result;
3759
3760 /* Coerce the two arguments */
3761 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3762 if (u == NULL)
3763 goto onError;
3764 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3765 if (v == NULL)
3766 goto onError;
3767
Thomas Wouters7e474022000-07-16 12:04:32 +00003768 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 if (v == u) {
3770 Py_DECREF(u);
3771 Py_DECREF(v);
3772 return 0;
3773 }
3774
3775 result = unicode_compare(u, v);
3776
3777 Py_DECREF(u);
3778 Py_DECREF(v);
3779 return result;
3780
3781onError:
3782 Py_XDECREF(u);
3783 Py_XDECREF(v);
3784 return -1;
3785}
3786
Guido van Rossum403d68b2000-03-13 15:55:09 +00003787int PyUnicode_Contains(PyObject *container,
3788 PyObject *element)
3789{
3790 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00003791 int result, size;
3792 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00003793
3794 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003795 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003796 if (v == NULL) {
3797 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00003798 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003799 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003800 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003801 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3802 if (u == NULL) {
3803 Py_DECREF(v);
3804 goto onError;
3805 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003806
Barry Warsaw817918c2002-08-06 16:58:21 +00003807 size = PyUnicode_GET_SIZE(v);
3808 rhs = PyUnicode_AS_UNICODE(v);
3809 lhs = PyUnicode_AS_UNICODE(u);
3810
Guido van Rossum403d68b2000-03-13 15:55:09 +00003811 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00003812 if (size == 1) {
3813 end = lhs + PyUnicode_GET_SIZE(u);
3814 while (lhs < end) {
3815 if (*lhs++ == *rhs) {
3816 result = 1;
3817 break;
3818 }
3819 }
3820 }
3821 else {
3822 end = lhs + (PyUnicode_GET_SIZE(u) - size);
3823 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00003824 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00003825 result = 1;
3826 break;
3827 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003828 }
3829 }
3830
3831 Py_DECREF(u);
3832 Py_DECREF(v);
3833 return result;
3834
3835onError:
3836 Py_XDECREF(u);
3837 Py_XDECREF(v);
3838 return -1;
3839}
3840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841/* Concat to string or Unicode object giving a new Unicode object. */
3842
3843PyObject *PyUnicode_Concat(PyObject *left,
3844 PyObject *right)
3845{
3846 PyUnicodeObject *u = NULL, *v = NULL, *w;
3847
3848 /* Coerce the two arguments */
3849 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3850 if (u == NULL)
3851 goto onError;
3852 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3853 if (v == NULL)
3854 goto onError;
3855
3856 /* Shortcuts */
3857 if (v == unicode_empty) {
3858 Py_DECREF(v);
3859 return (PyObject *)u;
3860 }
3861 if (u == unicode_empty) {
3862 Py_DECREF(u);
3863 return (PyObject *)v;
3864 }
3865
3866 /* Concat the two Unicode strings */
3867 w = _PyUnicode_New(u->length + v->length);
3868 if (w == NULL)
3869 goto onError;
3870 Py_UNICODE_COPY(w->str, u->str, u->length);
3871 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3872
3873 Py_DECREF(u);
3874 Py_DECREF(v);
3875 return (PyObject *)w;
3876
3877onError:
3878 Py_XDECREF(u);
3879 Py_XDECREF(v);
3880 return NULL;
3881}
3882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003883PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884"S.count(sub[, start[, end]]) -> int\n\
3885\n\
3886Return the number of occurrences of substring sub in Unicode string\n\
3887S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003888interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889
3890static PyObject *
3891unicode_count(PyUnicodeObject *self, PyObject *args)
3892{
3893 PyUnicodeObject *substring;
3894 int start = 0;
3895 int end = INT_MAX;
3896 PyObject *result;
3897
Guido van Rossumb8872e62000-05-09 14:14:27 +00003898 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3899 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 return NULL;
3901
3902 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3903 (PyObject *)substring);
3904 if (substring == NULL)
3905 return NULL;
3906
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 if (start < 0)
3908 start += self->length;
3909 if (start < 0)
3910 start = 0;
3911 if (end > self->length)
3912 end = self->length;
3913 if (end < 0)
3914 end += self->length;
3915 if (end < 0)
3916 end = 0;
3917
3918 result = PyInt_FromLong((long) count(self, start, end, substring));
3919
3920 Py_DECREF(substring);
3921 return result;
3922}
3923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003924PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925"S.encode([encoding[,errors]]) -> string\n\
3926\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003927Return an encoded string version of S. Default encoding is the current\n\
3928default string encoding. errors may be given to set a different error\n\
3929handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003930a ValueError. Other possible values are 'ignore' and 'replace'.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931
3932static PyObject *
3933unicode_encode(PyUnicodeObject *self, PyObject *args)
3934{
3935 char *encoding = NULL;
3936 char *errors = NULL;
3937 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3938 return NULL;
3939 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3940}
3941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003942PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943"S.expandtabs([tabsize]) -> unicode\n\
3944\n\
3945Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003946If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947
3948static PyObject*
3949unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3950{
3951 Py_UNICODE *e;
3952 Py_UNICODE *p;
3953 Py_UNICODE *q;
3954 int i, j;
3955 PyUnicodeObject *u;
3956 int tabsize = 8;
3957
3958 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3959 return NULL;
3960
Thomas Wouters7e474022000-07-16 12:04:32 +00003961 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 i = j = 0;
3963 e = self->str + self->length;
3964 for (p = self->str; p < e; p++)
3965 if (*p == '\t') {
3966 if (tabsize > 0)
3967 j += tabsize - (j % tabsize);
3968 }
3969 else {
3970 j++;
3971 if (*p == '\n' || *p == '\r') {
3972 i += j;
3973 j = 0;
3974 }
3975 }
3976
3977 /* Second pass: create output string and fill it */
3978 u = _PyUnicode_New(i + j);
3979 if (!u)
3980 return NULL;
3981
3982 j = 0;
3983 q = u->str;
3984
3985 for (p = self->str; p < e; p++)
3986 if (*p == '\t') {
3987 if (tabsize > 0) {
3988 i = tabsize - (j % tabsize);
3989 j += i;
3990 while (i--)
3991 *q++ = ' ';
3992 }
3993 }
3994 else {
3995 j++;
3996 *q++ = *p;
3997 if (*p == '\n' || *p == '\r')
3998 j = 0;
3999 }
4000
4001 return (PyObject*) u;
4002}
4003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004004PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005"S.find(sub [,start [,end]]) -> int\n\
4006\n\
4007Return the lowest index in S where substring sub is found,\n\
4008such that sub is contained within s[start,end]. Optional\n\
4009arguments start and end are interpreted as in slice notation.\n\
4010\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004011Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012
4013static PyObject *
4014unicode_find(PyUnicodeObject *self, PyObject *args)
4015{
4016 PyUnicodeObject *substring;
4017 int start = 0;
4018 int end = INT_MAX;
4019 PyObject *result;
4020
Guido van Rossumb8872e62000-05-09 14:14:27 +00004021 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4022 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 return NULL;
4024 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4025 (PyObject *)substring);
4026 if (substring == NULL)
4027 return NULL;
4028
4029 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4030
4031 Py_DECREF(substring);
4032 return result;
4033}
4034
4035static PyObject *
4036unicode_getitem(PyUnicodeObject *self, int index)
4037{
4038 if (index < 0 || index >= self->length) {
4039 PyErr_SetString(PyExc_IndexError, "string index out of range");
4040 return NULL;
4041 }
4042
4043 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4044}
4045
4046static long
4047unicode_hash(PyUnicodeObject *self)
4048{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004049 /* Since Unicode objects compare equal to their ASCII string
4050 counterparts, they should use the individual character values
4051 as basis for their hash value. This is needed to assure that
4052 strings and Unicode objects behave in the same way as
4053 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054
Fredrik Lundhdde61642000-07-10 18:27:47 +00004055 register int len;
4056 register Py_UNICODE *p;
4057 register long x;
4058
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 if (self->hash != -1)
4060 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004061 len = PyUnicode_GET_SIZE(self);
4062 p = PyUnicode_AS_UNICODE(self);
4063 x = *p << 7;
4064 while (--len >= 0)
4065 x = (1000003*x) ^ *p++;
4066 x ^= PyUnicode_GET_SIZE(self);
4067 if (x == -1)
4068 x = -2;
4069 self->hash = x;
4070 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071}
4072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004073PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074"S.index(sub [,start [,end]]) -> int\n\
4075\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004076Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077
4078static PyObject *
4079unicode_index(PyUnicodeObject *self, PyObject *args)
4080{
4081 int result;
4082 PyUnicodeObject *substring;
4083 int start = 0;
4084 int end = INT_MAX;
4085
Guido van Rossumb8872e62000-05-09 14:14:27 +00004086 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4087 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 return NULL;
4089
4090 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4091 (PyObject *)substring);
4092 if (substring == NULL)
4093 return NULL;
4094
4095 result = findstring(self, substring, start, end, 1);
4096
4097 Py_DECREF(substring);
4098 if (result < 0) {
4099 PyErr_SetString(PyExc_ValueError, "substring not found");
4100 return NULL;
4101 }
4102 return PyInt_FromLong(result);
4103}
4104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004105PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004106"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004108Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004109at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110
4111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004112unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113{
4114 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4115 register const Py_UNICODE *e;
4116 int cased;
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 /* Shortcut for single character strings */
4119 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004120 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004122 /* Special case for empty strings */
4123 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004124 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004125
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 e = p + PyUnicode_GET_SIZE(self);
4127 cased = 0;
4128 for (; p < e; p++) {
4129 register const Py_UNICODE ch = *p;
4130
4131 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004132 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 else if (!cased && Py_UNICODE_ISLOWER(ch))
4134 cased = 1;
4135 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004136 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137}
4138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004139PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004140"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004142Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004143at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144
4145static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004146unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
4148 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4149 register const Py_UNICODE *e;
4150 int cased;
4151
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 /* Shortcut for single character strings */
4153 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004154 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004156 /* Special case for empty strings */
4157 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004158 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004159
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 e = p + PyUnicode_GET_SIZE(self);
4161 cased = 0;
4162 for (; p < e; p++) {
4163 register const Py_UNICODE ch = *p;
4164
4165 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004166 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 else if (!cased && Py_UNICODE_ISUPPER(ch))
4168 cased = 1;
4169 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004170 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171}
4172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004173PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004174"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004176Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4177characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004178only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179
4180static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004181unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182{
4183 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4184 register const Py_UNICODE *e;
4185 int cased, previous_is_cased;
4186
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 /* Shortcut for single character strings */
4188 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004189 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4190 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004192 /* Special case for empty strings */
4193 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004194 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004195
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 e = p + PyUnicode_GET_SIZE(self);
4197 cased = 0;
4198 previous_is_cased = 0;
4199 for (; p < e; p++) {
4200 register const Py_UNICODE ch = *p;
4201
4202 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4203 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 previous_is_cased = 1;
4206 cased = 1;
4207 }
4208 else if (Py_UNICODE_ISLOWER(ch)) {
4209 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004210 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 previous_is_cased = 1;
4212 cased = 1;
4213 }
4214 else
4215 previous_is_cased = 0;
4216 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218}
4219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004220PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004221"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004223Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004224False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225
4226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004227unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228{
4229 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4230 register const Py_UNICODE *e;
4231
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 /* Shortcut for single character strings */
4233 if (PyUnicode_GET_SIZE(self) == 1 &&
4234 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004235 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004237 /* Special case for empty strings */
4238 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004239 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004240
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 e = p + PyUnicode_GET_SIZE(self);
4242 for (; p < e; p++) {
4243 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004244 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247}
4248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004249PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004250"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004251\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004252Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004253and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004254
4255static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004256unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004257{
4258 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4259 register const Py_UNICODE *e;
4260
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004261 /* Shortcut for single character strings */
4262 if (PyUnicode_GET_SIZE(self) == 1 &&
4263 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004264 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004265
4266 /* Special case for empty strings */
4267 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004268 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004269
4270 e = p + PyUnicode_GET_SIZE(self);
4271 for (; p < e; p++) {
4272 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004273 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004276}
4277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004278PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004279"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004280\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004281Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004282and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004283
4284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004285unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004286{
4287 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4288 register const Py_UNICODE *e;
4289
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004290 /* Shortcut for single character strings */
4291 if (PyUnicode_GET_SIZE(self) == 1 &&
4292 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004293 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004294
4295 /* Special case for empty strings */
4296 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004297 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004298
4299 e = p + PyUnicode_GET_SIZE(self);
4300 for (; p < e; p++) {
4301 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004302 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004303 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004305}
4306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004307PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004308"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004310Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004311False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312
4313static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004314unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315{
4316 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4317 register const Py_UNICODE *e;
4318
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 /* Shortcut for single character strings */
4320 if (PyUnicode_GET_SIZE(self) == 1 &&
4321 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004322 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004324 /* Special case for empty strings */
4325 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004326 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004327
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 e = p + PyUnicode_GET_SIZE(self);
4329 for (; p < e; p++) {
4330 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004331 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004333 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334}
4335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004336PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004337"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004339Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004340False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341
4342static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004343unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344{
4345 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4346 register const Py_UNICODE *e;
4347
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 /* Shortcut for single character strings */
4349 if (PyUnicode_GET_SIZE(self) == 1 &&
4350 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004351 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004353 /* Special case for empty strings */
4354 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004355 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004356
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 e = p + PyUnicode_GET_SIZE(self);
4358 for (; p < e; p++) {
4359 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004360 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004362 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363}
4364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004365PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004366"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004368Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004369False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370
4371static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004372unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373{
4374 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4375 register const Py_UNICODE *e;
4376
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 /* Shortcut for single character strings */
4378 if (PyUnicode_GET_SIZE(self) == 1 &&
4379 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004380 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004382 /* Special case for empty strings */
4383 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004384 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004385
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386 e = p + PyUnicode_GET_SIZE(self);
4387 for (; p < e; p++) {
4388 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004389 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004391 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392}
4393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004394PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395"S.join(sequence) -> unicode\n\
4396\n\
4397Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004398sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399
4400static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004401unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004403 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404}
4405
4406static int
4407unicode_length(PyUnicodeObject *self)
4408{
4409 return self->length;
4410}
4411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004412PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413"S.ljust(width) -> unicode\n\
4414\n\
4415Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004416done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417
4418static PyObject *
4419unicode_ljust(PyUnicodeObject *self, PyObject *args)
4420{
4421 int width;
4422 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4423 return NULL;
4424
Tim Peters7a29bd52001-09-12 03:03:31 +00004425 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 Py_INCREF(self);
4427 return (PyObject*) self;
4428 }
4429
4430 return (PyObject*) pad(self, 0, width - self->length, ' ');
4431}
4432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004433PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434"S.lower() -> unicode\n\
4435\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004436Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437
4438static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004439unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 return fixup(self, fixlower);
4442}
4443
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
4452
4453static const Py_UNICODE *
4454unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4455{
Tim Peters030a5ce2002-04-22 19:00:10 +00004456 size_t i;
4457 for (i = 0; i < n; ++i)
4458 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004459 return s+i;
4460 return NULL;
4461}
4462
4463/* externally visible for str.strip(unicode) */
4464PyObject *
4465_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4466{
4467 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4468 int len = PyUnicode_GET_SIZE(self);
4469 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4470 int seplen = PyUnicode_GET_SIZE(sepobj);
4471 int i, j;
4472
4473 i = 0;
4474 if (striptype != RIGHTSTRIP) {
4475 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4476 i++;
4477 }
4478 }
4479
4480 j = len;
4481 if (striptype != LEFTSTRIP) {
4482 do {
4483 j--;
4484 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4485 j++;
4486 }
4487
4488 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4489 Py_INCREF(self);
4490 return (PyObject*)self;
4491 }
4492 else
4493 return PyUnicode_FromUnicode(s+i, j-i);
4494}
4495
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496
4497static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004498do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004500 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4501 int len = PyUnicode_GET_SIZE(self), i, j;
4502
4503 i = 0;
4504 if (striptype != RIGHTSTRIP) {
4505 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4506 i++;
4507 }
4508 }
4509
4510 j = len;
4511 if (striptype != LEFTSTRIP) {
4512 do {
4513 j--;
4514 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4515 j++;
4516 }
4517
4518 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4519 Py_INCREF(self);
4520 return (PyObject*)self;
4521 }
4522 else
4523 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524}
4525
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004526
4527static PyObject *
4528do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4529{
4530 PyObject *sep = NULL;
4531
4532 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4533 return NULL;
4534
4535 if (sep != NULL && sep != Py_None) {
4536 if (PyUnicode_Check(sep))
4537 return _PyUnicode_XStrip(self, striptype, sep);
4538 else if (PyString_Check(sep)) {
4539 PyObject *res;
4540 sep = PyUnicode_FromObject(sep);
4541 if (sep==NULL)
4542 return NULL;
4543 res = _PyUnicode_XStrip(self, striptype, sep);
4544 Py_DECREF(sep);
4545 return res;
4546 }
4547 else {
4548 PyErr_Format(PyExc_TypeError,
4549 "%s arg must be None, unicode or str",
4550 STRIPNAME(striptype));
4551 return NULL;
4552 }
4553 }
4554
4555 return do_strip(self, striptype);
4556}
4557
4558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004559PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004560"S.strip([sep]) -> unicode\n\
4561\n\
4562Return a copy of the string S with leading and trailing\n\
4563whitespace removed.\n\
4564If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004565If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004566
4567static PyObject *
4568unicode_strip(PyUnicodeObject *self, PyObject *args)
4569{
4570 if (PyTuple_GET_SIZE(args) == 0)
4571 return do_strip(self, BOTHSTRIP); /* Common case */
4572 else
4573 return do_argstrip(self, BOTHSTRIP, args);
4574}
4575
4576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004577PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004578"S.lstrip([sep]) -> unicode\n\
4579\n\
4580Return a copy of the string S with leading whitespace removed.\n\
4581If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004582If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004583
4584static PyObject *
4585unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4586{
4587 if (PyTuple_GET_SIZE(args) == 0)
4588 return do_strip(self, LEFTSTRIP); /* Common case */
4589 else
4590 return do_argstrip(self, LEFTSTRIP, args);
4591}
4592
4593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004594PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004595"S.rstrip([sep]) -> unicode\n\
4596\n\
4597Return a copy of the string S with trailing whitespace removed.\n\
4598If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004599If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004600
4601static PyObject *
4602unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4603{
4604 if (PyTuple_GET_SIZE(args) == 0)
4605 return do_strip(self, RIGHTSTRIP); /* Common case */
4606 else
4607 return do_argstrip(self, RIGHTSTRIP, args);
4608}
4609
4610
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611static PyObject*
4612unicode_repeat(PyUnicodeObject *str, int len)
4613{
4614 PyUnicodeObject *u;
4615 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004616 int nchars;
4617 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618
4619 if (len < 0)
4620 len = 0;
4621
Tim Peters7a29bd52001-09-12 03:03:31 +00004622 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 /* no repeat, return original string */
4624 Py_INCREF(str);
4625 return (PyObject*) str;
4626 }
Tim Peters8f422462000-09-09 06:13:41 +00004627
4628 /* ensure # of chars needed doesn't overflow int and # of bytes
4629 * needed doesn't overflow size_t
4630 */
4631 nchars = len * str->length;
4632 if (len && nchars / len != str->length) {
4633 PyErr_SetString(PyExc_OverflowError,
4634 "repeated string is too long");
4635 return NULL;
4636 }
4637 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4638 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4639 PyErr_SetString(PyExc_OverflowError,
4640 "repeated string is too long");
4641 return NULL;
4642 }
4643 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644 if (!u)
4645 return NULL;
4646
4647 p = u->str;
4648
4649 while (len-- > 0) {
4650 Py_UNICODE_COPY(p, str->str, str->length);
4651 p += str->length;
4652 }
4653
4654 return (PyObject*) u;
4655}
4656
4657PyObject *PyUnicode_Replace(PyObject *obj,
4658 PyObject *subobj,
4659 PyObject *replobj,
4660 int maxcount)
4661{
4662 PyObject *self;
4663 PyObject *str1;
4664 PyObject *str2;
4665 PyObject *result;
4666
4667 self = PyUnicode_FromObject(obj);
4668 if (self == NULL)
4669 return NULL;
4670 str1 = PyUnicode_FromObject(subobj);
4671 if (str1 == NULL) {
4672 Py_DECREF(self);
4673 return NULL;
4674 }
4675 str2 = PyUnicode_FromObject(replobj);
4676 if (str2 == NULL) {
4677 Py_DECREF(self);
4678 Py_DECREF(str1);
4679 return NULL;
4680 }
4681 result = replace((PyUnicodeObject *)self,
4682 (PyUnicodeObject *)str1,
4683 (PyUnicodeObject *)str2,
4684 maxcount);
4685 Py_DECREF(self);
4686 Py_DECREF(str1);
4687 Py_DECREF(str2);
4688 return result;
4689}
4690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004691PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692"S.replace (old, new[, maxsplit]) -> unicode\n\
4693\n\
4694Return a copy of S with all occurrences of substring\n\
4695old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004696given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
4698static PyObject*
4699unicode_replace(PyUnicodeObject *self, PyObject *args)
4700{
4701 PyUnicodeObject *str1;
4702 PyUnicodeObject *str2;
4703 int maxcount = -1;
4704 PyObject *result;
4705
4706 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4707 return NULL;
4708 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4709 if (str1 == NULL)
4710 return NULL;
4711 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4712 if (str2 == NULL)
4713 return NULL;
4714
4715 result = replace(self, str1, str2, maxcount);
4716
4717 Py_DECREF(str1);
4718 Py_DECREF(str2);
4719 return result;
4720}
4721
4722static
4723PyObject *unicode_repr(PyObject *unicode)
4724{
4725 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4726 PyUnicode_GET_SIZE(unicode),
4727 1);
4728}
4729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004730PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731"S.rfind(sub [,start [,end]]) -> int\n\
4732\n\
4733Return the highest index in S where substring sub is found,\n\
4734such that sub is contained within s[start,end]. Optional\n\
4735arguments start and end are interpreted as in slice notation.\n\
4736\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004737Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738
4739static PyObject *
4740unicode_rfind(PyUnicodeObject *self, PyObject *args)
4741{
4742 PyUnicodeObject *substring;
4743 int start = 0;
4744 int end = INT_MAX;
4745 PyObject *result;
4746
Guido van Rossumb8872e62000-05-09 14:14:27 +00004747 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4748 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 return NULL;
4750 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4751 (PyObject *)substring);
4752 if (substring == NULL)
4753 return NULL;
4754
4755 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4756
4757 Py_DECREF(substring);
4758 return result;
4759}
4760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004761PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762"S.rindex(sub [,start [,end]]) -> int\n\
4763\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004764Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765
4766static PyObject *
4767unicode_rindex(PyUnicodeObject *self, PyObject *args)
4768{
4769 int result;
4770 PyUnicodeObject *substring;
4771 int start = 0;
4772 int end = INT_MAX;
4773
Guido van Rossumb8872e62000-05-09 14:14:27 +00004774 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4775 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 return NULL;
4777 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4778 (PyObject *)substring);
4779 if (substring == NULL)
4780 return NULL;
4781
4782 result = findstring(self, substring, start, end, -1);
4783
4784 Py_DECREF(substring);
4785 if (result < 0) {
4786 PyErr_SetString(PyExc_ValueError, "substring not found");
4787 return NULL;
4788 }
4789 return PyInt_FromLong(result);
4790}
4791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004792PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793"S.rjust(width) -> unicode\n\
4794\n\
4795Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004796done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
4798static PyObject *
4799unicode_rjust(PyUnicodeObject *self, PyObject *args)
4800{
4801 int width;
4802 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4803 return NULL;
4804
Tim Peters7a29bd52001-09-12 03:03:31 +00004805 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 Py_INCREF(self);
4807 return (PyObject*) self;
4808 }
4809
4810 return (PyObject*) pad(self, width - self->length, 0, ' ');
4811}
4812
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813static PyObject*
4814unicode_slice(PyUnicodeObject *self, int start, int end)
4815{
4816 /* standard clamping */
4817 if (start < 0)
4818 start = 0;
4819 if (end < 0)
4820 end = 0;
4821 if (end > self->length)
4822 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004823 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 /* full slice, return original string */
4825 Py_INCREF(self);
4826 return (PyObject*) self;
4827 }
4828 if (start > end)
4829 start = end;
4830 /* copy slice */
4831 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4832 end - start);
4833}
4834
4835PyObject *PyUnicode_Split(PyObject *s,
4836 PyObject *sep,
4837 int maxsplit)
4838{
4839 PyObject *result;
4840
4841 s = PyUnicode_FromObject(s);
4842 if (s == NULL)
4843 return NULL;
4844 if (sep != NULL) {
4845 sep = PyUnicode_FromObject(sep);
4846 if (sep == NULL) {
4847 Py_DECREF(s);
4848 return NULL;
4849 }
4850 }
4851
4852 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4853
4854 Py_DECREF(s);
4855 Py_XDECREF(sep);
4856 return result;
4857}
4858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004859PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860"S.split([sep [,maxsplit]]) -> list of strings\n\
4861\n\
4862Return a list of the words in S, using sep as the\n\
4863delimiter string. If maxsplit is given, at most maxsplit\n\
4864splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004865is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
4867static PyObject*
4868unicode_split(PyUnicodeObject *self, PyObject *args)
4869{
4870 PyObject *substring = Py_None;
4871 int maxcount = -1;
4872
4873 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4874 return NULL;
4875
4876 if (substring == Py_None)
4877 return split(self, NULL, maxcount);
4878 else if (PyUnicode_Check(substring))
4879 return split(self, (PyUnicodeObject *)substring, maxcount);
4880 else
4881 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4882}
4883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004884PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00004885"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886\n\
4887Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004888Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004889is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890
4891static PyObject*
4892unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4893{
Guido van Rossum86662912000-04-11 15:38:46 +00004894 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895
Guido van Rossum86662912000-04-11 15:38:46 +00004896 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 return NULL;
4898
Guido van Rossum86662912000-04-11 15:38:46 +00004899 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900}
4901
4902static
4903PyObject *unicode_str(PyUnicodeObject *self)
4904{
Fred Drakee4315f52000-05-09 19:53:39 +00004905 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906}
4907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004908PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909"S.swapcase() -> unicode\n\
4910\n\
4911Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004912and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
4914static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004915unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 return fixup(self, fixswapcase);
4918}
4919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004920PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921"S.translate(table) -> unicode\n\
4922\n\
4923Return a copy of the string S, where all characters have been mapped\n\
4924through the given translation table, which must be a mapping of\n\
4925Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004926are left untouched. Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927
4928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004929unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 return PyUnicode_TranslateCharmap(self->str,
4932 self->length,
4933 table,
4934 "ignore");
4935}
4936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004937PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938"S.upper() -> unicode\n\
4939\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004940Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941
4942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004943unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 return fixup(self, fixupper);
4946}
4947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004948PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949"S.zfill(width) -> unicode\n\
4950\n\
4951Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004952of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953
4954static PyObject *
4955unicode_zfill(PyUnicodeObject *self, PyObject *args)
4956{
4957 int fill;
4958 PyUnicodeObject *u;
4959
4960 int width;
4961 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4962 return NULL;
4963
4964 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004965 if (PyUnicode_CheckExact(self)) {
4966 Py_INCREF(self);
4967 return (PyObject*) self;
4968 }
4969 else
4970 return PyUnicode_FromUnicode(
4971 PyUnicode_AS_UNICODE(self),
4972 PyUnicode_GET_SIZE(self)
4973 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 }
4975
4976 fill = width - self->length;
4977
4978 u = pad(self, fill, 0, '0');
4979
Walter Dörwald068325e2002-04-15 13:36:47 +00004980 if (u == NULL)
4981 return NULL;
4982
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 if (u->str[fill] == '+' || u->str[fill] == '-') {
4984 /* move sign to beginning of string */
4985 u->str[0] = u->str[fill];
4986 u->str[fill] = '0';
4987 }
4988
4989 return (PyObject*) u;
4990}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991
#if 0
/* Disabled debugging helper: reports unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005000PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005001"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005003Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005005comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006
5007static PyObject *
5008unicode_startswith(PyUnicodeObject *self,
5009 PyObject *args)
5010{
5011 PyUnicodeObject *substring;
5012 int start = 0;
5013 int end = INT_MAX;
5014 PyObject *result;
5015
Guido van Rossumb8872e62000-05-09 14:14:27 +00005016 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5017 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 return NULL;
5019 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5020 (PyObject *)substring);
5021 if (substring == NULL)
5022 return NULL;
5023
Guido van Rossum77f6a652002-04-03 22:41:51 +00005024 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025
5026 Py_DECREF(substring);
5027 return result;
5028}
5029
5030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005031PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005034Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005036comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037
5038static PyObject *
5039unicode_endswith(PyUnicodeObject *self,
5040 PyObject *args)
5041{
5042 PyUnicodeObject *substring;
5043 int start = 0;
5044 int end = INT_MAX;
5045 PyObject *result;
5046
Guido van Rossumb8872e62000-05-09 14:14:27 +00005047 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5048 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 return NULL;
5050 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5051 (PyObject *)substring);
5052 if (substring == NULL)
5053 return NULL;
5054
Guido van Rossum77f6a652002-04-03 22:41:51 +00005055 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056
5057 Py_DECREF(substring);
5058 return result;
5059}
5060
5061
5062static PyMethodDef unicode_methods[] = {
5063
5064 /* Order is according to common usage: often used methods should
5065 appear first, since lookup is done sequentially. */
5066
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005067 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5068 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5069 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5070 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5071 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5072 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5073 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5074 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5075 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5076 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5077 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5078 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5079 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005080 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005081/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5082 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5083 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5084 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005085 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005086 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005087 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005088 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5089 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5090 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5091 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5092 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5093 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5094 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5095 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5096 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5097 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5098 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5099 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5100 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5101 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005102 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005103#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005104 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105#endif
5106
5107#if 0
5108 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005109 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110#endif
5111
5112 {NULL, NULL}
5113};
5114
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115static PySequenceMethods unicode_as_sequence = {
5116 (inquiry) unicode_length, /* sq_length */
5117 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5118 (intargfunc) unicode_repeat, /* sq_repeat */
5119 (intargfunc) unicode_getitem, /* sq_item */
5120 (intintargfunc) unicode_slice, /* sq_slice */
5121 0, /* sq_ass_item */
5122 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005123 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124};
5125
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005126static PyObject*
5127unicode_subscript(PyUnicodeObject* self, PyObject* item)
5128{
5129 if (PyInt_Check(item)) {
5130 long i = PyInt_AS_LONG(item);
5131 if (i < 0)
5132 i += PyString_GET_SIZE(self);
5133 return unicode_getitem(self, i);
5134 } else if (PyLong_Check(item)) {
5135 long i = PyLong_AsLong(item);
5136 if (i == -1 && PyErr_Occurred())
5137 return NULL;
5138 if (i < 0)
5139 i += PyString_GET_SIZE(self);
5140 return unicode_getitem(self, i);
5141 } else if (PySlice_Check(item)) {
5142 int start, stop, step, slicelength, cur, i;
5143 Py_UNICODE* source_buf;
5144 Py_UNICODE* result_buf;
5145 PyObject* result;
5146
5147 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5148 &start, &stop, &step, &slicelength) < 0) {
5149 return NULL;
5150 }
5151
5152 if (slicelength <= 0) {
5153 return PyUnicode_FromUnicode(NULL, 0);
5154 } else {
5155 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5156 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5157
5158 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5159 result_buf[i] = source_buf[cur];
5160 }
5161
5162 result = PyUnicode_FromUnicode(result_buf, slicelength);
5163 PyMem_FREE(result_buf);
5164 return result;
5165 }
5166 } else {
5167 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5168 return NULL;
5169 }
5170}
5171
5172static PyMappingMethods unicode_as_mapping = {
5173 (inquiry)unicode_length, /* mp_length */
5174 (binaryfunc)unicode_subscript, /* mp_subscript */
5175 (objobjargproc)0, /* mp_ass_subscript */
5176};
5177
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178static int
5179unicode_buffer_getreadbuf(PyUnicodeObject *self,
5180 int index,
5181 const void **ptr)
5182{
5183 if (index != 0) {
5184 PyErr_SetString(PyExc_SystemError,
5185 "accessing non-existent unicode segment");
5186 return -1;
5187 }
5188 *ptr = (void *) self->str;
5189 return PyUnicode_GET_DATA_SIZE(self);
5190}
5191
5192static int
5193unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5194 const void **ptr)
5195{
5196 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005197 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 return -1;
5199}
5200
5201static int
5202unicode_buffer_getsegcount(PyUnicodeObject *self,
5203 int *lenp)
5204{
5205 if (lenp)
5206 *lenp = PyUnicode_GET_DATA_SIZE(self);
5207 return 1;
5208}
5209
5210static int
5211unicode_buffer_getcharbuf(PyUnicodeObject *self,
5212 int index,
5213 const void **ptr)
5214{
5215 PyObject *str;
5216
5217 if (index != 0) {
5218 PyErr_SetString(PyExc_SystemError,
5219 "accessing non-existent unicode segment");
5220 return -1;
5221 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005222 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 if (str == NULL)
5224 return -1;
5225 *ptr = (void *) PyString_AS_STRING(str);
5226 return PyString_GET_SIZE(str);
5227}
5228
5229/* Helpers for PyUnicode_Format() */
5230
5231static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005232getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233{
5234 int argidx = *p_argidx;
5235 if (argidx < arglen) {
5236 (*p_argidx)++;
5237 if (arglen < 0)
5238 return args;
5239 else
5240 return PyTuple_GetItem(args, argidx);
5241 }
5242 PyErr_SetString(PyExc_TypeError,
5243 "not enough arguments for format string");
5244 return NULL;
5245}
5246
/* Bit flags for format conversions; each flag is a distinct bit. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5252
5253static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255{
5256 register int i;
5257 int len;
5258 va_list va;
5259 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
5262 /* First, format the string as char array, then expand to Py_UNICODE
5263 array. */
5264 charbuffer = (char *)buffer;
5265 len = vsprintf(charbuffer, format, va);
5266 for (i = len - 1; i >= 0; i--)
5267 buffer[i] = (Py_UNICODE) charbuffer[i];
5268
5269 va_end(va);
5270 return len;
5271}
5272
Guido van Rossum078151d2002-08-11 04:24:12 +00005273/* XXX To save some code duplication, formatfloat/long/int could have been
5274 shared with stringobject.c, converting from 8-bit to Unicode after the
5275 formatting is done. */
5276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277static int
5278formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005279 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 int flags,
5281 int prec,
5282 int type,
5283 PyObject *v)
5284{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005285 /* fmt = '%#.' + `prec` + `type`
5286 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 char fmt[20];
5288 double x;
5289
5290 x = PyFloat_AsDouble(v);
5291 if (x == -1.0 && PyErr_Occurred())
5292 return -1;
5293 if (prec < 0)
5294 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5296 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005297 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5298 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005299 /* worst case length calc to ensure no buffer overrun:
5300 fmt = %#.<prec>g
5301 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5302 for any double rep.)
5303 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5304 If prec=0 the effective precision is 1 (the leading digit is
5305 always given), therefore increase by one to 10+prec. */
5306 if (buflen <= (size_t)10 + (size_t)prec) {
5307 PyErr_SetString(PyExc_OverflowError,
5308 "formatted float is too long (precision too long?)");
5309 return -1;
5310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 return usprintf(buf, fmt, x);
5312}
5313
Tim Peters38fd5b62000-09-21 05:43:11 +00005314static PyObject*
5315formatlong(PyObject *val, int flags, int prec, int type)
5316{
5317 char *buf;
5318 int i, len;
5319 PyObject *str; /* temporary string object. */
5320 PyUnicodeObject *result;
5321
5322 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5323 if (!str)
5324 return NULL;
5325 result = _PyUnicode_New(len);
5326 for (i = 0; i < len; i++)
5327 result->str[i] = buf[i];
5328 result->str[len] = 0;
5329 Py_DECREF(str);
5330 return (PyObject*)result;
5331}
5332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333static int
5334formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005335 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 int flags,
5337 int prec,
5338 int type,
5339 PyObject *v)
5340{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005341 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005342 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5343 * + 1 + 1
5344 * = 24
5345 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005346 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 long x;
5348
5349 x = PyInt_AsLong(v);
5350 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005351 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00005352 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00005353 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00005354 "%u/%o/%x/%X of negative int will return "
5355 "a signed string in Python 2.4 and up") < 0)
5356 return -1;
5357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005359 prec = 1;
5360
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005361 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005362 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5363 */
5364 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005365 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005366 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005367 return -1;
5368 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005369
5370 if ((flags & F_ALT) &&
5371 (type == 'x' || type == 'X')) {
5372 /* When converting under %#x or %#X, there are a number
5373 * of issues that cause pain:
5374 * - when 0 is being converted, the C standard leaves off
5375 * the '0x' or '0X', which is inconsistent with other
5376 * %#x/%#X conversions and inconsistent with Python's
5377 * hex() function
5378 * - there are platforms that violate the standard and
5379 * convert 0 with the '0x' or '0X'
5380 * (Metrowerks, Compaq Tru64)
5381 * - there are platforms that give '0x' when converting
5382 * under %#X, but convert 0 in accordance with the
5383 * standard (OS/2 EMX)
5384 *
5385 * We can achieve the desired consistency by inserting our
5386 * own '0x' or '0X' prefix, and substituting %x/%X in place
5387 * of %#x/%#X.
5388 *
5389 * Note that this is the same approach as used in
5390 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005391 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005392 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5393 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005394 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005395 else {
5396 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5397 (flags&F_ALT) ? "#" : "",
5398 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 return usprintf(buf, fmt, x);
5401}
5402
5403static int
5404formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005405 size_t buflen,
5406 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005408 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005409 if (PyUnicode_Check(v)) {
5410 if (PyUnicode_GET_SIZE(v) != 1)
5411 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005415 else if (PyString_Check(v)) {
5416 if (PyString_GET_SIZE(v) != 1)
5417 goto onError;
5418 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
5421 else {
5422 /* Integer input truncated to a character */
5423 long x;
5424 x = PyInt_AsLong(v);
5425 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005426 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00005427#ifdef Py_UNICODE_WIDE
5428 if (x < 0 || x > 0x10ffff) {
5429 PyErr_SetString(PyExc_ValueError,
5430 "%c arg not in range(0x110000) "
5431 "(wide Python build)");
5432 return -1;
5433 }
5434#else
5435 if (x < 0 || x > 0xffff) {
5436 PyErr_SetString(PyExc_ValueError,
5437 "%c arg not in range(0x10000) "
5438 "(narrow Python build)");
5439 return -1;
5440 }
5441#endif
5442 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 }
5444 buf[1] = '\0';
5445 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005446
5447 onError:
5448 PyErr_SetString(PyExc_TypeError,
5449 "%c requires int or char");
5450 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451}
5452
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005453/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5454
5455 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5456 chars are formatted. XXX This is a magic number. Each formatting
5457 routine does bounds checking to ensure no overflow, but a better
5458 solution may be to malloc a buffer of appropriate size for each
5459 format. For now, the current solution is sufficient.
5460*/
5461#define FORMATBUFLEN (size_t)120
5462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463PyObject *PyUnicode_Format(PyObject *format,
5464 PyObject *args)
5465{
5466 Py_UNICODE *fmt, *res;
5467 int fmtcnt, rescnt, reslen, arglen, argidx;
5468 int args_owned = 0;
5469 PyUnicodeObject *result = NULL;
5470 PyObject *dict = NULL;
5471 PyObject *uformat;
5472
5473 if (format == NULL || args == NULL) {
5474 PyErr_BadInternalCall();
5475 return NULL;
5476 }
5477 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005478 if (uformat == NULL)
5479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 fmt = PyUnicode_AS_UNICODE(uformat);
5481 fmtcnt = PyUnicode_GET_SIZE(uformat);
5482
5483 reslen = rescnt = fmtcnt + 100;
5484 result = _PyUnicode_New(reslen);
5485 if (result == NULL)
5486 goto onError;
5487 res = PyUnicode_AS_UNICODE(result);
5488
5489 if (PyTuple_Check(args)) {
5490 arglen = PyTuple_Size(args);
5491 argidx = 0;
5492 }
5493 else {
5494 arglen = -1;
5495 argidx = -2;
5496 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005497 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 dict = args;
5499
5500 while (--fmtcnt >= 0) {
5501 if (*fmt != '%') {
5502 if (--rescnt < 0) {
5503 rescnt = fmtcnt + 100;
5504 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005505 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 return NULL;
5507 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5508 --rescnt;
5509 }
5510 *res++ = *fmt++;
5511 }
5512 else {
5513 /* Got a format specifier */
5514 int flags = 0;
5515 int width = -1;
5516 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 Py_UNICODE c = '\0';
5518 Py_UNICODE fill;
5519 PyObject *v = NULL;
5520 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005521 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 Py_UNICODE sign;
5523 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005524 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526 fmt++;
5527 if (*fmt == '(') {
5528 Py_UNICODE *keystart;
5529 int keylen;
5530 PyObject *key;
5531 int pcount = 1;
5532
5533 if (dict == NULL) {
5534 PyErr_SetString(PyExc_TypeError,
5535 "format requires a mapping");
5536 goto onError;
5537 }
5538 ++fmt;
5539 --fmtcnt;
5540 keystart = fmt;
5541 /* Skip over balanced parentheses */
5542 while (pcount > 0 && --fmtcnt >= 0) {
5543 if (*fmt == ')')
5544 --pcount;
5545 else if (*fmt == '(')
5546 ++pcount;
5547 fmt++;
5548 }
5549 keylen = fmt - keystart - 1;
5550 if (fmtcnt < 0 || pcount > 0) {
5551 PyErr_SetString(PyExc_ValueError,
5552 "incomplete format key");
5553 goto onError;
5554 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005555#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005556 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 then looked up since Python uses strings to hold
5558 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005559 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 key = PyUnicode_EncodeUTF8(keystart,
5561 keylen,
5562 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005563#else
5564 key = PyUnicode_FromUnicode(keystart, keylen);
5565#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 if (key == NULL)
5567 goto onError;
5568 if (args_owned) {
5569 Py_DECREF(args);
5570 args_owned = 0;
5571 }
5572 args = PyObject_GetItem(dict, key);
5573 Py_DECREF(key);
5574 if (args == NULL) {
5575 goto onError;
5576 }
5577 args_owned = 1;
5578 arglen = -1;
5579 argidx = -2;
5580 }
5581 while (--fmtcnt >= 0) {
5582 switch (c = *fmt++) {
5583 case '-': flags |= F_LJUST; continue;
5584 case '+': flags |= F_SIGN; continue;
5585 case ' ': flags |= F_BLANK; continue;
5586 case '#': flags |= F_ALT; continue;
5587 case '0': flags |= F_ZERO; continue;
5588 }
5589 break;
5590 }
5591 if (c == '*') {
5592 v = getnextarg(args, arglen, &argidx);
5593 if (v == NULL)
5594 goto onError;
5595 if (!PyInt_Check(v)) {
5596 PyErr_SetString(PyExc_TypeError,
5597 "* wants int");
5598 goto onError;
5599 }
5600 width = PyInt_AsLong(v);
5601 if (width < 0) {
5602 flags |= F_LJUST;
5603 width = -width;
5604 }
5605 if (--fmtcnt >= 0)
5606 c = *fmt++;
5607 }
5608 else if (c >= '0' && c <= '9') {
5609 width = c - '0';
5610 while (--fmtcnt >= 0) {
5611 c = *fmt++;
5612 if (c < '0' || c > '9')
5613 break;
5614 if ((width*10) / 10 != width) {
5615 PyErr_SetString(PyExc_ValueError,
5616 "width too big");
5617 goto onError;
5618 }
5619 width = width*10 + (c - '0');
5620 }
5621 }
5622 if (c == '.') {
5623 prec = 0;
5624 if (--fmtcnt >= 0)
5625 c = *fmt++;
5626 if (c == '*') {
5627 v = getnextarg(args, arglen, &argidx);
5628 if (v == NULL)
5629 goto onError;
5630 if (!PyInt_Check(v)) {
5631 PyErr_SetString(PyExc_TypeError,
5632 "* wants int");
5633 goto onError;
5634 }
5635 prec = PyInt_AsLong(v);
5636 if (prec < 0)
5637 prec = 0;
5638 if (--fmtcnt >= 0)
5639 c = *fmt++;
5640 }
5641 else if (c >= '0' && c <= '9') {
5642 prec = c - '0';
5643 while (--fmtcnt >= 0) {
5644 c = Py_CHARMASK(*fmt++);
5645 if (c < '0' || c > '9')
5646 break;
5647 if ((prec*10) / 10 != prec) {
5648 PyErr_SetString(PyExc_ValueError,
5649 "prec too big");
5650 goto onError;
5651 }
5652 prec = prec*10 + (c - '0');
5653 }
5654 }
5655 } /* prec */
5656 if (fmtcnt >= 0) {
5657 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 if (--fmtcnt >= 0)
5659 c = *fmt++;
5660 }
5661 }
5662 if (fmtcnt < 0) {
5663 PyErr_SetString(PyExc_ValueError,
5664 "incomplete format");
5665 goto onError;
5666 }
5667 if (c != '%') {
5668 v = getnextarg(args, arglen, &argidx);
5669 if (v == NULL)
5670 goto onError;
5671 }
5672 sign = 0;
5673 fill = ' ';
5674 switch (c) {
5675
5676 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005677 pbuf = formatbuf;
5678 /* presume that buffer length is at least 1 */
5679 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 len = 1;
5681 break;
5682
5683 case 's':
5684 case 'r':
5685 if (PyUnicode_Check(v) && c == 's') {
5686 temp = v;
5687 Py_INCREF(temp);
5688 }
5689 else {
5690 PyObject *unicode;
5691 if (c == 's')
5692 temp = PyObject_Str(v);
5693 else
5694 temp = PyObject_Repr(v);
5695 if (temp == NULL)
5696 goto onError;
5697 if (!PyString_Check(temp)) {
5698 /* XXX Note: this should never happen, since
5699 PyObject_Repr() and PyObject_Str() assure
5700 this */
5701 Py_DECREF(temp);
5702 PyErr_SetString(PyExc_TypeError,
5703 "%s argument has non-string str()");
5704 goto onError;
5705 }
Fred Drakee4315f52000-05-09 19:53:39 +00005706 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005708 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 "strict");
5710 Py_DECREF(temp);
5711 temp = unicode;
5712 if (temp == NULL)
5713 goto onError;
5714 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005715 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 len = PyUnicode_GET_SIZE(temp);
5717 if (prec >= 0 && len > prec)
5718 len = prec;
5719 break;
5720
5721 case 'i':
5722 case 'd':
5723 case 'u':
5724 case 'o':
5725 case 'x':
5726 case 'X':
5727 if (c == 'i')
5728 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005729 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005730 temp = formatlong(v, flags, prec, c);
5731 if (!temp)
5732 goto onError;
5733 pbuf = PyUnicode_AS_UNICODE(temp);
5734 len = PyUnicode_GET_SIZE(temp);
5735 /* unbounded ints can always produce
5736 a sign character! */
5737 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005739 else {
5740 pbuf = formatbuf;
5741 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5742 flags, prec, c, v);
5743 if (len < 0)
5744 goto onError;
5745 /* only d conversion is signed */
5746 sign = c == 'd';
5747 }
5748 if (flags & F_ZERO)
5749 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 break;
5751
5752 case 'e':
5753 case 'E':
5754 case 'f':
5755 case 'g':
5756 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005757 pbuf = formatbuf;
5758 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5759 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 if (len < 0)
5761 goto onError;
5762 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005763 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 fill = '0';
5765 break;
5766
5767 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005768 pbuf = formatbuf;
5769 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 if (len < 0)
5771 goto onError;
5772 break;
5773
5774 default:
5775 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005776 "unsupported format character '%c' (0x%x) "
5777 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005778 (31<=c && c<=126) ? c : '?',
5779 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 goto onError;
5781 }
5782 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005783 if (*pbuf == '-' || *pbuf == '+') {
5784 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 len--;
5786 }
5787 else if (flags & F_SIGN)
5788 sign = '+';
5789 else if (flags & F_BLANK)
5790 sign = ' ';
5791 else
5792 sign = 0;
5793 }
5794 if (width < len)
5795 width = len;
5796 if (rescnt < width + (sign != 0)) {
5797 reslen -= rescnt;
5798 rescnt = width + fmtcnt + 100;
5799 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005800 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 return NULL;
5802 res = PyUnicode_AS_UNICODE(result)
5803 + reslen - rescnt;
5804 }
5805 if (sign) {
5806 if (fill != ' ')
5807 *res++ = sign;
5808 rescnt--;
5809 if (width > len)
5810 width--;
5811 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005812 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5813 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005814 assert(pbuf[1] == c);
5815 if (fill != ' ') {
5816 *res++ = *pbuf++;
5817 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005818 }
Tim Petersfff53252001-04-12 18:38:48 +00005819 rescnt -= 2;
5820 width -= 2;
5821 if (width < 0)
5822 width = 0;
5823 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 if (width > len && !(flags & F_LJUST)) {
5826 do {
5827 --rescnt;
5828 *res++ = fill;
5829 } while (--width > len);
5830 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005831 if (fill == ' ') {
5832 if (sign)
5833 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005834 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005835 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005836 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005837 *res++ = *pbuf++;
5838 *res++ = *pbuf++;
5839 }
5840 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005841 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 res += len;
5843 rescnt -= len;
5844 while (--width >= len) {
5845 --rescnt;
5846 *res++ = ' ';
5847 }
5848 if (dict && (argidx < arglen) && c != '%') {
5849 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005850 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 goto onError;
5852 }
5853 Py_XDECREF(temp);
5854 } /* '%' */
5855 } /* until end */
5856 if (argidx < arglen && !dict) {
5857 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005858 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 goto onError;
5860 }
5861
5862 if (args_owned) {
5863 Py_DECREF(args);
5864 }
5865 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005866 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 return (PyObject *)result;
5869
5870 onError:
5871 Py_XDECREF(result);
5872 Py_DECREF(uformat);
5873 if (args_owned) {
5874 Py_DECREF(args);
5875 }
5876 return NULL;
5877}
5878
5879static PyBufferProcs unicode_as_buffer = {
5880 (getreadbufferproc) unicode_buffer_getreadbuf,
5881 (getwritebufferproc) unicode_buffer_getwritebuf,
5882 (getsegcountproc) unicode_buffer_getsegcount,
5883 (getcharbufferproc) unicode_buffer_getcharbuf,
5884};
5885
Jeremy Hylton938ace62002-07-17 16:30:39 +00005886static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00005887unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5888
Tim Peters6d6c1a32001-08-02 04:15:00 +00005889static PyObject *
5890unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5891{
5892 PyObject *x = NULL;
5893 static char *kwlist[] = {"string", "encoding", "errors", 0};
5894 char *encoding = NULL;
5895 char *errors = NULL;
5896
Guido van Rossume023fe02001-08-30 03:12:59 +00005897 if (type != &PyUnicode_Type)
5898 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005899 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5900 kwlist, &x, &encoding, &errors))
5901 return NULL;
5902 if (x == NULL)
5903 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005904 if (encoding == NULL && errors == NULL)
5905 return PyObject_Unicode(x);
5906 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005907 return PyUnicode_FromEncodedObject(x, encoding, errors);
5908}
5909
Guido van Rossume023fe02001-08-30 03:12:59 +00005910static PyObject *
5911unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5912{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005913 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005914 int n;
5915
5916 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5917 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5918 if (tmp == NULL)
5919 return NULL;
5920 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005921 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5922 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005923 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005924 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5925 if (pnew->str == NULL) {
5926 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005927 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005928 return NULL;
5929 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005930 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5931 pnew->length = n;
5932 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005933 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005934 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005935}
5936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005937PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00005938"unicode(string [, encoding[, errors]]) -> object\n\
5939\n\
5940Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00005941encoding defaults to the current default string encoding.\n\
5942errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00005943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944PyTypeObject PyUnicode_Type = {
5945 PyObject_HEAD_INIT(&PyType_Type)
5946 0, /* ob_size */
5947 "unicode", /* tp_name */
5948 sizeof(PyUnicodeObject), /* tp_size */
5949 0, /* tp_itemsize */
5950 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005951 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005953 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 0, /* tp_setattr */
5955 (cmpfunc) unicode_compare, /* tp_compare */
5956 (reprfunc) unicode_repr, /* tp_repr */
5957 0, /* tp_as_number */
5958 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005959 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 (hashfunc) unicode_hash, /* tp_hash*/
5961 0, /* tp_call*/
5962 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005963 PyObject_GenericGetAttr, /* tp_getattro */
5964 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005966 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005967 unicode_doc, /* tp_doc */
5968 0, /* tp_traverse */
5969 0, /* tp_clear */
5970 0, /* tp_richcompare */
5971 0, /* tp_weaklistoffset */
5972 0, /* tp_iter */
5973 0, /* tp_iternext */
5974 unicode_methods, /* tp_methods */
5975 0, /* tp_members */
5976 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00005977 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005978 0, /* tp_dict */
5979 0, /* tp_descr_get */
5980 0, /* tp_descr_set */
5981 0, /* tp_dictoffset */
5982 0, /* tp_init */
5983 0, /* tp_alloc */
5984 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005985 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986};
5987
5988/* Initialize the Unicode implementation */
5989
Thomas Wouters78890102000-07-22 19:25:51 +00005990void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005992 int i;
5993
Fred Drakee4315f52000-05-09 19:53:39 +00005994 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005995 unicode_freelist = NULL;
5996 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005998 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005999 for (i = 0; i < 256; i++)
6000 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006001 if (PyType_Ready(&PyUnicode_Type) < 0)
6002 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
6005/* Finalize the Unicode implementation */
6006
6007void
Thomas Wouters78890102000-07-22 19:25:51 +00006008_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006010 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006011 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006013 Py_XDECREF(unicode_empty);
6014 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006015
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006016 for (i = 0; i < 256; i++) {
6017 if (unicode_latin1[i]) {
6018 Py_DECREF(unicode_latin1[i]);
6019 unicode_latin1[i] = NULL;
6020 }
6021 }
6022
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006023 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 PyUnicodeObject *v = u;
6025 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006026 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006027 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006028 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006029 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006031 unicode_freelist = NULL;
6032 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033}