/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of dead Unicode objects parked on the
   free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for Unicode objects on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list, so that reusing it for
   another small string saves a malloc() round trip.

   At worst this pins MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory.

   A limit of 0 effectively switches the optimization off.

   Note: the feature is experimental. If Unicode objects cause core
   dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte-order switches; little endian unless WORDS_BIGENDIAN is set */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only ! Casts the address of
   the caller's variable to PyObject ** before handing it to
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 int owned = 0;
456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Greg Steinaf36a3a2000-07-17 09:04:43 +0000513 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000515 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000519 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523}
524
525PyObject *PyUnicode_Decode(const char *s,
526 int size,
527 const char *encoding,
528 const char *errors)
529{
530 PyObject *buffer = NULL, *unicode;
531
Fred Drakee4315f52000-05-09 19:53:39 +0000532 if (encoding == NULL)
533 encoding = PyUnicode_GetDefaultEncoding();
534
535 /* Shortcuts for common default encodings */
536 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "latin-1") == 0)
539 return PyUnicode_DecodeLatin1(s, size, errors);
540 else if (strcmp(encoding, "ascii") == 0)
541 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000542
543 /* Decode via the codec registry */
544 buffer = PyBuffer_FromMemory((void *)s, size);
545 if (buffer == NULL)
546 goto onError;
547 unicode = PyCodec_Decode(buffer, encoding, errors);
548 if (unicode == NULL)
549 goto onError;
550 if (!PyUnicode_Check(unicode)) {
551 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000552 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 unicode->ob_type->tp_name);
554 Py_DECREF(unicode);
555 goto onError;
556 }
557 Py_DECREF(buffer);
558 return unicode;
559
560 onError:
561 Py_XDECREF(buffer);
562 return NULL;
563}
564
565PyObject *PyUnicode_Encode(const Py_UNICODE *s,
566 int size,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v, *unicode;
571
572 unicode = PyUnicode_FromUnicode(s, size);
573 if (unicode == NULL)
574 return NULL;
575 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
576 Py_DECREF(unicode);
577 return v;
578}
579
580PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
581 const char *encoding,
582 const char *errors)
583{
584 PyObject *v;
585
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
Fred Drakee4315f52000-05-09 19:53:39 +0000590
591 if (encoding == NULL)
592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (errors == NULL) {
596 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000597 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "latin-1") == 0)
599 return PyUnicode_AsLatin1String(unicode);
600 else if (strcmp(encoding, "ascii") == 0)
601 return PyUnicode_AsASCIIString(unicode);
602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603
604 /* Encode via the codec registry */
605 v = PyCodec_Encode(unicode, encoding, errors);
606 if (v == NULL)
607 goto onError;
608 /* XXX Should we really enforce this ? */
609 if (!PyString_Check(v)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 v->ob_type->tp_name);
613 Py_DECREF(v);
614 goto onError;
615 }
616 return v;
617
618 onError:
619 return NULL;
620}
621
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
623 const char *errors)
624{
625 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
626
627 if (v)
628 return v;
629 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
630 if (v && errors == NULL)
631 ((PyUnicodeObject *)unicode)->defenc = v;
632 return v;
633}
634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
636{
637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
641 return PyUnicode_AS_UNICODE(unicode);
642
643 onError:
644 return NULL;
645}
646
647int PyUnicode_GetSize(PyObject *unicode)
648{
649 if (!PyUnicode_Check(unicode)) {
650 PyErr_BadArgument();
651 goto onError;
652 }
653 return PyUnicode_GET_SIZE(unicode);
654
655 onError:
656 return -1;
657}
658
Thomas Wouters78890102000-07-22 19:25:51 +0000659const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000660{
661 return unicode_default_encoding;
662}
663
664int PyUnicode_SetDefaultEncoding(const char *encoding)
665{
666 PyObject *v;
667
668 /* Make sure the encoding is valid. As side effect, this also
669 loads the encoding into the codec registry cache. */
670 v = _PyCodec_Lookup(encoding);
671 if (v == NULL)
672 goto onError;
673 Py_DECREF(v);
674 strncpy(unicode_default_encoding,
675 encoding,
676 sizeof(unicode_default_encoding));
677 return 0;
678
679 onError:
680 return -1;
681}
682
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification of the 128 ASCII characters for UTF-7, telling
   whether a character is special, i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive character codes. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
705
/* True if character c must be base64 encoded: it is above 127 or
   marked special in utf7_special[], or it is whitespace / Set O and
   the matching flag asks for those to be encoded as well. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c) > 127 || utf7_special[(c)] == 1) || \
         ((encodeWS) && (utf7_special[(c)] == 2)) || \
         ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
        ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c belongs to the base64 alphabet. */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')

/* Value of the base64 digit c; c is not validated. */
#define UB64(c) \
        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least six of the bits
   pending in ch remain; bits is updated to the number left over. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits) - 6)); \
        (bits) -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least sixteen of
   the bits pending in ch remain. Expects an `errmsg` variable and a
   `utf7Error` label in the enclosing function. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high \
               surrogate so let's not bother seeing if the low \
               surrogate is correct or not */ \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't \
               represent it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }
740
741static
742int utf7_decoding_error(Py_UNICODE **dest,
743 const char *errors,
744 const char *details)
745{
746 if ((errors == NULL) ||
747 (strcmp(errors,"strict") == 0)) {
748 PyErr_Format(PyExc_UnicodeError,
749 "UTF-7 decoding error: %.400s",
750 details);
751 return -1;
752 }
753 else if (strcmp(errors,"ignore") == 0) {
754 return 0;
755 }
756 else if (strcmp(errors,"replace") == 0) {
757 if (dest != NULL) {
758 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
759 (*dest)++;
760 }
761 return 0;
762 }
763 else {
764 PyErr_Format(PyExc_ValueError,
765 "UTF-7 decoding error; unknown error handling code: %.400s",
766 errors);
767 return -1;
768 }
769}
770
771PyObject *PyUnicode_DecodeUTF7(const char *s,
772 int size,
773 const char *errors)
774{
775 const char *e;
776 PyUnicodeObject *unicode;
777 Py_UNICODE *p;
778 const char *errmsg = "";
779 int inShift = 0;
780 unsigned int bitsleft = 0;
781 unsigned long charsleft = 0;
782 int surrogate = 0;
783
784 unicode = _PyUnicode_New(size);
785 if (!unicode)
786 return NULL;
787 if (size == 0)
788 return (PyObject *)unicode;
789
790 p = unicode->str;
791 e = s + size;
792
793 while (s < e) {
794 Py_UNICODE ch = *s;
795
796 if (inShift) {
797 if ((ch == '-') || !B64CHAR(ch)) {
798 inShift = 0;
799 s++;
800
801 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
802 if (bitsleft >= 6) {
803 /* The shift sequence has a partial character in it. If
804 bitsleft < 6 then we could just classify it as padding
805 but that is not the case here */
806
807 errmsg = "partial character in shift sequence";
808 goto utf7Error;
809 }
810 /* According to RFC2152 the remaining bits should be zero. We
811 choose to signal an error/insert a replacement character
812 here so indicate the potential of a misencoded character. */
813
814 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
815 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
816 errmsg = "non-zero padding bits in shift sequence";
817 goto utf7Error;
818 }
819
820 if (ch == '-') {
821 if ((s < e) && (*(s) == '-')) {
822 *p++ = '-';
823 inShift = 1;
824 }
825 } else if (SPECIAL(ch,0,0)) {
826 errmsg = "unexpected special character";
827 goto utf7Error;
828 } else {
829 *p++ = ch;
830 }
831 } else {
832 charsleft = (charsleft << 6) | UB64(ch);
833 bitsleft += 6;
834 s++;
835 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
836 }
837 }
838 else if ( ch == '+' ) {
839 s++;
840 if (s < e && *s == '-') {
841 s++;
842 *p++ = '+';
843 } else
844 {
845 inShift = 1;
846 bitsleft = 0;
847 }
848 }
849 else if (SPECIAL(ch,0,0)) {
850 errmsg = "unexpected special character";
851 s++;
852 goto utf7Error;
853 }
854 else {
855 *p++ = ch;
856 s++;
857 }
858 continue;
859 utf7Error:
860 if (utf7_decoding_error(&p, errors, errmsg))
861 goto onError;
862 }
863
864 if (inShift) {
865 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
866 goto onError;
867 }
868
869 if (_PyUnicode_Resize(&unicode, p - unicode->str))
870 goto onError;
871
872 return (PyObject *)unicode;
873
874onError:
875 Py_DECREF(unicode);
876 return NULL;
877}
878
879
880PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
881 int size,
882 int encodeSetO,
883 int encodeWhiteSpace,
884 const char *errors)
885{
886 PyObject *v;
887 /* It might be possible to tighten this worst case */
888 unsigned int cbAllocated = 5 * size;
889 int inShift = 0;
890 int i = 0;
891 unsigned int bitsleft = 0;
892 unsigned long charsleft = 0;
893 char * out;
894 char * start;
895
896 if (size == 0)
897 return PyString_FromStringAndSize(NULL, 0);
898
899 v = PyString_FromStringAndSize(NULL, cbAllocated);
900 if (v == NULL)
901 return NULL;
902
903 start = out = PyString_AS_STRING(v);
904 for (;i < size; ++i) {
905 Py_UNICODE ch = s[i];
906
907 if (!inShift) {
908 if (ch == '+') {
909 *out++ = '+';
910 *out++ = '-';
911 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
912 charsleft = ch;
913 bitsleft = 16;
914 *out++ = '+';
915 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
916 inShift = bitsleft > 0;
917 } else {
918 *out++ = (char) ch;
919 }
920 } else {
921 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
922 *out++ = B64(charsleft << (6-bitsleft));
923 charsleft = 0;
924 bitsleft = 0;
925 /* Characters not in the BASE64 set implicitly unshift the sequence
926 so no '-' is required, except if the character is itself a '-' */
927 if (B64CHAR(ch) || ch == '-') {
928 *out++ = '-';
929 }
930 inShift = 0;
931 *out++ = (char) ch;
932 } else {
933 bitsleft += 16;
934 charsleft = (charsleft << 16) | ch;
935 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
936
937 /* If the next character is special then we dont' need to terminate
938 the shift sequence. If the next character is not a BASE64 character
939 or '-' then the shift sequence will be terminated implicitly and we
940 don't have to insert a '-'. */
941
942 if (bitsleft == 0) {
943 if (i + 1 < size) {
944 Py_UNICODE ch2 = s[i+1];
945
946 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
947
948 } else if (B64CHAR(ch2) || ch2 == '-') {
949 *out++ = '-';
950 inShift = 0;
951 } else {
952 inShift = 0;
953 }
954
955 }
956 else {
957 *out++ = '-';
958 inShift = 0;
959 }
960 }
961 }
962 }
963 }
964 if (bitsleft) {
965 *out++= B64(charsleft << (6-bitsleft) );
966 *out++ = '-';
967 }
968
Tim Peters5de98422002-04-27 18:44:32 +0000969 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000970 return v;
971}
972
973#undef SPECIAL
974#undef B64
975#undef B64CHAR
976#undef UB64
977#undef ENCODE
978#undef DECODE
979
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980/* --- UTF-8 Codec -------------------------------------------------------- */
981
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero means the byte cannot start a sequence (see RFC 2279 for details).
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1003
1004static
1005int utf8_decoding_error(const char **source,
1006 Py_UNICODE **dest,
1007 const char *errors,
1008 const char *details)
1009{
1010 if ((errors == NULL) ||
1011 (strcmp(errors,"strict") == 0)) {
1012 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001013 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 details);
1015 return -1;
1016 }
1017 else if (strcmp(errors,"ignore") == 0) {
1018 (*source)++;
1019 return 0;
1020 }
1021 else if (strcmp(errors,"replace") == 0) {
1022 (*source)++;
1023 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1024 (*dest)++;
1025 return 0;
1026 }
1027 else {
1028 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001029 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 errors);
1031 return -1;
1032 }
1033}
1034
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035PyObject *PyUnicode_DecodeUTF8(const char *s,
1036 int size,
1037 const char *errors)
1038{
1039 int n;
1040 const char *e;
1041 PyUnicodeObject *unicode;
1042 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001043 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044
1045 /* Note: size will always be longer than the resulting Unicode
1046 character count */
1047 unicode = _PyUnicode_New(size);
1048 if (!unicode)
1049 return NULL;
1050 if (size == 0)
1051 return (PyObject *)unicode;
1052
1053 /* Unpack UTF-8 encoded data */
1054 p = unicode->str;
1055 e = s + size;
1056
1057 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001058 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001061 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062 s++;
1063 continue;
1064 }
1065
1066 n = utf8_code_length[ch];
1067
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001068 if (s + n > e) {
1069 errmsg = "unexpected end of data";
1070 goto utf8Error;
1071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072
1073 switch (n) {
1074
1075 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "unexpected code byte";
1077 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001080 errmsg = "internal error";
1081 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082
1083 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001084 if ((s[1] & 0xc0) != 0x80) {
1085 errmsg = "invalid data";
1086 goto utf8Error;
1087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001089 if (ch < 0x80) {
1090 errmsg = "illegal encoding";
1091 goto utf8Error;
1092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 break;
1096
1097 case 3:
1098 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001099 (s[2] & 0xc0) != 0x80) {
1100 errmsg = "invalid data";
1101 goto utf8Error;
1102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001104 if (ch < 0x0800) {
1105 /* Note: UTF-8 encodings of surrogates are considered
1106 legal UTF-8 sequences;
1107
1108 XXX For wide builds (UCS-4) we should probably try
1109 to recombine the surrogates into a single code
1110 unit.
1111 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001112 errmsg = "illegal encoding";
1113 goto utf8Error;
1114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001116 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001117 break;
1118
1119 case 4:
1120 if ((s[1] & 0xc0) != 0x80 ||
1121 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001122 (s[3] & 0xc0) != 0x80) {
1123 errmsg = "invalid data";
1124 goto utf8Error;
1125 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001126 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1127 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1128 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001129 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001130 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001131 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001132 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001133 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 errmsg = "illegal encoding";
1135 goto utf8Error;
1136 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001137#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001138 *p++ = (Py_UNICODE)ch;
1139#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001140 /* compute and append the two surrogates: */
1141
1142 /* translate from 10000..10FFFF to 0..FFFF */
1143 ch -= 0x10000;
1144
1145 /* high surrogate = top 10 bits added to D800 */
1146 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1147
1148 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001149 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001150#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 break;
1152
1153 default:
1154 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 errmsg = "unsupported Unicode code range";
1156 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 }
1158 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 continue;
1160
1161 utf8Error:
1162 if (utf8_decoding_error(&s, &p, errors, errmsg))
1163 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
1165
1166 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001167 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 goto onError;
1169
1170 return (PyObject *)unicode;
1171
1172onError:
1173 Py_DECREF(unicode);
1174 return NULL;
1175}
1176
Tim Peters602f7402002-04-27 18:03:26 +00001177/* Allocation strategy: if the string is short, convert into a stack buffer
1178 and allocate exactly as much space needed at the end. Else allocate the
1179 maximum possible needed (4 result bytes per Unicode character), and return
1180 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001181*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001182PyObject *
1183PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1184 int size,
1185 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186{
Tim Peters602f7402002-04-27 18:03:26 +00001187#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001188
Tim Peters602f7402002-04-27 18:03:26 +00001189 int i; /* index into s of next input byte */
1190 PyObject *v; /* result string object */
1191 char *p; /* next free byte in output buffer */
1192 int nallocated; /* number of result bytes allocated */
1193 int nneeded; /* number of result bytes needed */
1194 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001195
Tim Peters602f7402002-04-27 18:03:26 +00001196 assert(s != NULL);
1197 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
Tim Peters602f7402002-04-27 18:03:26 +00001199 if (size <= MAX_SHORT_UNICHARS) {
1200 /* Write into the stack buffer; nallocated can't overflow.
1201 * At the end, we'll allocate exactly as much heap space as it
1202 * turns out we need.
1203 */
1204 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1205 v = NULL; /* will allocate after we're done */
1206 p = stackbuf;
1207 }
1208 else {
1209 /* Overallocate on the heap, and give the excess back at the end. */
1210 nallocated = size * 4;
1211 if (nallocated / 4 != size) /* overflow! */
1212 return PyErr_NoMemory();
1213 v = PyString_FromStringAndSize(NULL, nallocated);
1214 if (v == NULL)
1215 return NULL;
1216 p = PyString_AS_STRING(v);
1217 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001218
Tim Peters602f7402002-04-27 18:03:26 +00001219 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001221
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001222 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001223 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001227 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001228 *p++ = (char)(0xc0 | (ch >> 6));
1229 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001230 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001231 else {
Tim Peters602f7402002-04-27 18:03:26 +00001232 /* Encode UCS2 Unicode ordinals */
1233 if (ch < 0x10000) {
1234 /* Special case: check for high surrogate */
1235 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1236 Py_UCS4 ch2 = s[i];
1237 /* Check for low surrogate and combine the two to
1238 form a UCS4 value */
1239 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001240 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001241 i++;
1242 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 }
Tim Peters602f7402002-04-27 18:03:26 +00001244 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001247 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1248 *p++ = (char)(0x80 | (ch & 0x3f));
1249 continue;
1250 }
1251encodeUCS4:
1252 /* Encode UCS4 Unicode ordinals */
1253 *p++ = (char)(0xf0 | (ch >> 18));
1254 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1255 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1256 *p++ = (char)(0x80 | (ch & 0x3f));
1257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001259
Tim Peters602f7402002-04-27 18:03:26 +00001260 if (v == NULL) {
1261 /* This was stack allocated. */
1262 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1263 assert(nneeded <= nallocated);
1264 v = PyString_FromStringAndSize(stackbuf, nneeded);
1265 }
1266 else {
1267 /* Cut back to size actually needed. */
1268 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1269 assert(nneeded <= nallocated);
1270 _PyString_Resize(&v, nneeded);
1271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001273
Tim Peters602f7402002-04-27 18:03:26 +00001274#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1278{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 if (!PyUnicode_Check(unicode)) {
1280 PyErr_BadArgument();
1281 return NULL;
1282 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001283 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1284 PyUnicode_GET_SIZE(unicode),
1285 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286}
1287
1288/* --- UTF-16 Codec ------------------------------------------------------- */
1289
1290static
Tim Peters772747b2001-08-09 22:21:55 +00001291int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 const char *errors,
1293 const char *details)
1294{
1295 if ((errors == NULL) ||
1296 (strcmp(errors,"strict") == 0)) {
1297 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001298 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 details);
1300 return -1;
1301 }
1302 else if (strcmp(errors,"ignore") == 0) {
1303 return 0;
1304 }
1305 else if (strcmp(errors,"replace") == 0) {
1306 if (dest) {
1307 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1308 (*dest)++;
1309 }
1310 return 0;
1311 }
1312 else {
1313 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001314 "UTF-16 decoding error; "
1315 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 errors);
1317 return -1;
1318 }
1319}
1320
Tim Peters772747b2001-08-09 22:21:55 +00001321PyObject *
1322PyUnicode_DecodeUTF16(const char *s,
1323 int size,
1324 const char *errors,
1325 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326{
1327 PyUnicodeObject *unicode;
1328 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001329 const unsigned char *q, *e;
1330 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001331 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001332 /* Offsets from q for retrieving byte pairs in the right order. */
1333#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1334 int ihi = 1, ilo = 0;
1335#else
1336 int ihi = 0, ilo = 1;
1337#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
1339 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001340 if (size & 1) {
1341 if (utf16_decoding_error(NULL, errors, "truncated data"))
1342 return NULL;
1343 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 }
1345
1346 /* Note: size will always be longer than the resulting Unicode
1347 character count */
1348 unicode = _PyUnicode_New(size);
1349 if (!unicode)
1350 return NULL;
1351 if (size == 0)
1352 return (PyObject *)unicode;
1353
1354 /* Unpack UTF-16 encoded data */
1355 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001356 q = (unsigned char *)s;
1357 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
1359 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001360 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001362 /* Check for BOM marks (U+FEFF) in the input and adjust current
1363 byte order setting accordingly. In native mode, the leading BOM
1364 mark is skipped, in all other modes, it is copied to the output
1365 stream as-is (giving a ZWNBSP character). */
1366 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001367 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001368#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001369 if (bom == 0xFEFF) {
1370 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001371 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001372 }
1373 else if (bom == 0xFFFE) {
1374 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001375 bo = 1;
1376 }
1377#else
Tim Peters772747b2001-08-09 22:21:55 +00001378 if (bom == 0xFEFF) {
1379 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001380 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001381 }
1382 else if (bom == 0xFFFE) {
1383 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001384 bo = -1;
1385 }
1386#endif
1387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388
Tim Peters772747b2001-08-09 22:21:55 +00001389 if (bo == -1) {
1390 /* force LE */
1391 ihi = 1;
1392 ilo = 0;
1393 }
1394 else if (bo == 1) {
1395 /* force BE */
1396 ihi = 0;
1397 ilo = 1;
1398 }
1399
1400 while (q < e) {
1401 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1402 q += 2;
1403
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 if (ch < 0xD800 || ch > 0xDFFF) {
1405 *p++ = ch;
1406 continue;
1407 }
1408
1409 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001410 if (q >= e) {
1411 errmsg = "unexpected end of data";
1412 goto utf16Error;
1413 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001414 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001415 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1416 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001417 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001418#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001419 *p++ = ch;
1420 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001421#else
1422 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001423#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001424 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001425 }
1426 else {
1427 errmsg = "illegal UTF-16 surrogate";
1428 goto utf16Error;
1429 }
1430
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001432 errmsg = "illegal encoding";
1433 /* Fall through to report the error */
1434
1435 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001436 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 }
1439
1440 if (byteorder)
1441 *byteorder = bo;
1442
1443 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001444 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 goto onError;
1446
1447 return (PyObject *)unicode;
1448
1449onError:
1450 Py_DECREF(unicode);
1451 return NULL;
1452}
1453
Tim Peters772747b2001-08-09 22:21:55 +00001454PyObject *
1455PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1456 int size,
1457 const char *errors,
1458 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459{
1460 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001461 unsigned char *p;
1462 int i, pairs;
1463 /* Offsets from p for storing byte pairs in the right order. */
1464#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1465 int ihi = 1, ilo = 0;
1466#else
1467 int ihi = 0, ilo = 1;
1468#endif
1469
1470#define STORECHAR(CH) \
1471 do { \
1472 p[ihi] = ((CH) >> 8) & 0xff; \
1473 p[ilo] = (CH) & 0xff; \
1474 p += 2; \
1475 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001477 for (i = pairs = 0; i < size; i++)
1478 if (s[i] >= 0x10000)
1479 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001481 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 if (v == NULL)
1483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Tim Peters772747b2001-08-09 22:21:55 +00001485 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001487 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001488 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001489 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001490
1491 if (byteorder == -1) {
1492 /* force LE */
1493 ihi = 1;
1494 ilo = 0;
1495 }
1496 else if (byteorder == 1) {
1497 /* force BE */
1498 ihi = 0;
1499 ilo = 1;
1500 }
1501
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001502 while (size-- > 0) {
1503 Py_UNICODE ch = *s++;
1504 Py_UNICODE ch2 = 0;
1505 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001506 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1507 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508 }
Tim Peters772747b2001-08-09 22:21:55 +00001509 STORECHAR(ch);
1510 if (ch2)
1511 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001514#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515}
1516
1517PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1518{
1519 if (!PyUnicode_Check(unicode)) {
1520 PyErr_BadArgument();
1521 return NULL;
1522 }
1523 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1524 PyUnicode_GET_SIZE(unicode),
1525 NULL,
1526 0);
1527}
1528
1529/* --- Unicode Escape Codec ----------------------------------------------- */
1530
1531static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001532int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 const char *errors,
1534 const char *details)
1535{
1536 if ((errors == NULL) ||
1537 (strcmp(errors,"strict") == 0)) {
1538 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001539 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 details);
1541 return -1;
1542 }
1543 else if (strcmp(errors,"ignore") == 0) {
1544 return 0;
1545 }
1546 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001547 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1548 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 return 0;
1550 }
1551 else {
1552 PyErr_Format(PyExc_ValueError,
1553 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001554 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 errors);
1556 return -1;
1557 }
1558}
1559
Fredrik Lundh06d12682001-01-24 07:59:11 +00001560static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001561
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1563 int size,
1564 const char *errors)
1565{
1566 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001567 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001569 char* message;
1570 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1571
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572 /* Escaped strings will always be longer than the resulting
1573 Unicode string, so we start with size here and then reduce the
1574 length after conversion to the true value. */
1575 v = _PyUnicode_New(size);
1576 if (v == NULL)
1577 goto onError;
1578 if (size == 0)
1579 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001580
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 p = buf = PyUnicode_AS_UNICODE(v);
1582 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001583
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 while (s < end) {
1585 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001586 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001587 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588
1589 /* Non-escape characters are interpreted as Unicode ordinals */
1590 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001591 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 continue;
1593 }
1594
1595 /* \ - Escapes */
1596 s++;
1597 switch (*s++) {
1598
1599 /* \x escapes */
1600 case '\n': break;
1601 case '\\': *p++ = '\\'; break;
1602 case '\'': *p++ = '\''; break;
1603 case '\"': *p++ = '\"'; break;
1604 case 'b': *p++ = '\b'; break;
1605 case 'f': *p++ = '\014'; break; /* FF */
1606 case 't': *p++ = '\t'; break;
1607 case 'n': *p++ = '\n'; break;
1608 case 'r': *p++ = '\r'; break;
1609 case 'v': *p++ = '\013'; break; /* VT */
1610 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1611
1612 /* \OOO (octal) escapes */
1613 case '0': case '1': case '2': case '3':
1614 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001615 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001617 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001619 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001621 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 break;
1623
Fredrik Lundhccc74732001-02-18 22:13:49 +00001624 /* hex escapes */
1625 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001627 digits = 2;
1628 message = "truncated \\xXX escape";
1629 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 digits = 4;
1634 message = "truncated \\uXXXX escape";
1635 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636
Fredrik Lundhccc74732001-02-18 22:13:49 +00001637 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001638 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001639 digits = 8;
1640 message = "truncated \\UXXXXXXXX escape";
1641 hexescape:
1642 chr = 0;
1643 for (i = 0; i < digits; i++) {
1644 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001646 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001648 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 i++;
1650 break;
1651 }
1652 chr = (chr<<4) & ~0xF;
1653 if (c >= '0' && c <= '9')
1654 chr += c - '0';
1655 else if (c >= 'a' && c <= 'f')
1656 chr += 10 + c - 'a';
1657 else
1658 chr += 10 + c - 'A';
1659 }
1660 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001661 if (chr == 0xffffffff)
1662 /* _decoding_error will have already written into the
1663 target buffer. */
1664 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001665 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001666 /* when we get here, chr is a 32-bit unicode character */
1667 if (chr <= 0xffff)
1668 /* UCS-2 character */
1669 *p++ = (Py_UNICODE) chr;
1670 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001671 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001672 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001673#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001674 *p++ = chr;
1675#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001676 chr -= 0x10000L;
1677 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001678 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001679#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001680 } else {
1681 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001682 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001683 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001684 )
1685 goto onError;
1686 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001687 break;
1688
1689 /* \N{name} */
1690 case 'N':
1691 message = "malformed \\N character escape";
1692 if (ucnhash_CAPI == NULL) {
1693 /* load the unicode data module */
1694 PyObject *m, *v;
1695 m = PyImport_ImportModule("unicodedata");
1696 if (m == NULL)
1697 goto ucnhashError;
1698 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1699 Py_DECREF(m);
1700 if (v == NULL)
1701 goto ucnhashError;
1702 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1703 Py_DECREF(v);
1704 if (ucnhash_CAPI == NULL)
1705 goto ucnhashError;
1706 }
1707 if (*s == '{') {
1708 const char *start = s+1;
1709 /* look for the closing brace */
1710 while (*s != '}' && s < end)
1711 s++;
1712 if (s > start && s < end && *s == '}') {
1713 /* found a name. look it up in the unicode database */
1714 message = "unknown Unicode character name";
1715 s++;
1716 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1717 goto store;
1718 }
1719 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001720 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722 break;
1723
1724 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001725 if (s > end) {
1726 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1727 goto onError;
1728 }
1729 else {
1730 *p++ = '\\';
1731 *p++ = (unsigned char)s[-1];
1732 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 }
1735 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001736 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001737 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001739
Fredrik Lundhccc74732001-02-18 22:13:49 +00001740ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001741 PyErr_SetString(
1742 PyExc_UnicodeError,
1743 "\\N escapes not supported (can't load unicodedata module)"
1744 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001745 return NULL;
1746
Fredrik Lundhccc74732001-02-18 22:13:49 +00001747onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 Py_XDECREF(v);
1749 return NULL;
1750}
1751
1752/* Return a Unicode-Escape string version of the Unicode object.
1753
1754 If quotes is true, the string is enclosed in u"" or u'' quotes as
1755 appropriate.
1756
1757*/
1758
Barry Warsaw51ac5802000-03-20 16:36:48 +00001759static const Py_UNICODE *findchar(const Py_UNICODE *s,
1760 int size,
1761 Py_UNICODE ch);
1762
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763static
1764PyObject *unicodeescape_string(const Py_UNICODE *s,
1765 int size,
1766 int quotes)
1767{
1768 PyObject *repr;
1769 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001771 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772
1773 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1774 if (repr == NULL)
1775 return NULL;
1776
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001777 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778
1779 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 *p++ = 'u';
1781 *p++ = (findchar(s, size, '\'') &&
1782 !findchar(s, size, '"')) ? '"' : '\'';
1783 }
1784 while (size-- > 0) {
1785 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001786
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001788 if (quotes &&
1789 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 *p++ = '\\';
1791 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001792 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001794
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001795#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001796 /* Map 21-bit characters to '\U00xxxxxx' */
1797 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001798 int offset = p - PyString_AS_STRING(repr);
1799
1800 /* Resize the string if necessary */
1801 if (offset + 12 > PyString_GET_SIZE(repr)) {
1802 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001803 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001804 p = PyString_AS_STRING(repr) + offset;
1805 }
1806
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001807 *p++ = '\\';
1808 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001809 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1810 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1811 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1812 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1813 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1814 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1815 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001816 *p++ = hexdigit[ch & 0x0000000F];
1817 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001818 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001819#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001820 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1821 else if (ch >= 0xD800 && ch < 0xDC00) {
1822 Py_UNICODE ch2;
1823 Py_UCS4 ucs;
1824
1825 ch2 = *s++;
1826 size--;
1827 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1828 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1829 *p++ = '\\';
1830 *p++ = 'U';
1831 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1832 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1833 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1834 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1835 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1836 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1837 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1838 *p++ = hexdigit[ucs & 0x0000000F];
1839 continue;
1840 }
1841 /* Fall through: isolated surrogates are copied as-is */
1842 s--;
1843 size++;
1844 }
1845
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001847 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 *p++ = '\\';
1849 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001850 *p++ = hexdigit[(ch >> 12) & 0x000F];
1851 *p++ = hexdigit[(ch >> 8) & 0x000F];
1852 *p++ = hexdigit[(ch >> 4) & 0x000F];
1853 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001855
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001856 /* Map special whitespace to '\t', \n', '\r' */
1857 else if (ch == '\t') {
1858 *p++ = '\\';
1859 *p++ = 't';
1860 }
1861 else if (ch == '\n') {
1862 *p++ = '\\';
1863 *p++ = 'n';
1864 }
1865 else if (ch == '\r') {
1866 *p++ = '\\';
1867 *p++ = 'r';
1868 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001870 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001871 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001873 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001874 *p++ = hexdigit[(ch >> 4) & 0x000F];
1875 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001877
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* Copy everything else as-is */
1879 else
1880 *p++ = (char) ch;
1881 }
1882 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001883 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884
1885 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001886 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 return repr;
1888}
1889
1890PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1891 int size)
1892{
1893 return unicodeescape_string(s, size, 0);
1894}
1895
1896PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1897{
1898 if (!PyUnicode_Check(unicode)) {
1899 PyErr_BadArgument();
1900 return NULL;
1901 }
1902 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1903 PyUnicode_GET_SIZE(unicode));
1904}
1905
1906/* --- Raw Unicode Escape Codec ------------------------------------------- */
1907
1908PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1909 int size,
1910 const char *errors)
1911{
1912 PyUnicodeObject *v;
1913 Py_UNICODE *p, *buf;
1914 const char *end;
1915 const char *bs;
1916
1917 /* Escaped strings will always be longer than the resulting
1918 Unicode string, so we start with size here and then reduce the
1919 length after conversion to the true value. */
1920 v = _PyUnicode_New(size);
1921 if (v == NULL)
1922 goto onError;
1923 if (size == 0)
1924 return (PyObject *)v;
1925 p = buf = PyUnicode_AS_UNICODE(v);
1926 end = s + size;
1927 while (s < end) {
1928 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001929 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 int i;
1931
1932 /* Non-escape characters are interpreted as Unicode ordinals */
1933 if (*s != '\\') {
1934 *p++ = (unsigned char)*s++;
1935 continue;
1936 }
1937
1938 /* \u-escapes are only interpreted iff the number of leading
1939 backslashes if odd */
1940 bs = s;
1941 for (;s < end;) {
1942 if (*s != '\\')
1943 break;
1944 *p++ = (unsigned char)*s++;
1945 }
1946 if (((s - bs) & 1) == 0 ||
1947 s >= end ||
1948 *s != 'u') {
1949 continue;
1950 }
1951 p--;
1952 s++;
1953
1954 /* \uXXXX with 4 hex digits */
1955 for (x = 0, i = 0; i < 4; i++) {
1956 c = (unsigned char)s[i];
1957 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001958 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 "truncated \\uXXXX"))
1960 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001961 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 i++;
1963 break;
1964 }
1965 x = (x<<4) & ~0xF;
1966 if (c >= '0' && c <= '9')
1967 x += c - '0';
1968 else if (c >= 'a' && c <= 'f')
1969 x += 10 + c - 'a';
1970 else
1971 x += 10 + c - 'A';
1972 }
1973 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001974 if (x != 0xffffffff)
1975 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001977 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return (PyObject *)v;
1980
1981 onError:
1982 Py_XDECREF(v);
1983 return NULL;
1984}
1985
1986PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1987 int size)
1988{
1989 PyObject *repr;
1990 char *p;
1991 char *q;
1992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001993 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 repr = PyString_FromStringAndSize(NULL, 6 * size);
1996 if (repr == NULL)
1997 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001998 if (size == 0)
1999 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
2001 p = q = PyString_AS_STRING(repr);
2002 while (size-- > 0) {
2003 Py_UNICODE ch = *s++;
2004 /* Map 16-bit characters to '\uxxxx' */
2005 if (ch >= 256) {
2006 *p++ = '\\';
2007 *p++ = 'u';
2008 *p++ = hexdigit[(ch >> 12) & 0xf];
2009 *p++ = hexdigit[(ch >> 8) & 0xf];
2010 *p++ = hexdigit[(ch >> 4) & 0xf];
2011 *p++ = hexdigit[ch & 15];
2012 }
2013 /* Copy everything else as-is */
2014 else
2015 *p++ = (char) ch;
2016 }
2017 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002018 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 return repr;
2020}
2021
2022PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2023{
2024 if (!PyUnicode_Check(unicode)) {
2025 PyErr_BadArgument();
2026 return NULL;
2027 }
2028 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2029 PyUnicode_GET_SIZE(unicode));
2030}
2031
2032/* --- Latin-1 Codec ------------------------------------------------------ */
2033
2034PyObject *PyUnicode_DecodeLatin1(const char *s,
2035 int size,
2036 const char *errors)
2037{
2038 PyUnicodeObject *v;
2039 Py_UNICODE *p;
2040
2041 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002042 if (size == 1 && *(unsigned char*)s < 256) {
2043 Py_UNICODE r = *(unsigned char*)s;
2044 return PyUnicode_FromUnicode(&r, 1);
2045 }
2046
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 v = _PyUnicode_New(size);
2048 if (v == NULL)
2049 goto onError;
2050 if (size == 0)
2051 return (PyObject *)v;
2052 p = PyUnicode_AS_UNICODE(v);
2053 while (size-- > 0)
2054 *p++ = (unsigned char)*s++;
2055 return (PyObject *)v;
2056
2057 onError:
2058 Py_XDECREF(v);
2059 return NULL;
2060}
2061
2062static
2063int latin1_encoding_error(const Py_UNICODE **source,
2064 char **dest,
2065 const char *errors,
2066 const char *details)
2067{
2068 if ((errors == NULL) ||
2069 (strcmp(errors,"strict") == 0)) {
2070 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002071 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 details);
2073 return -1;
2074 }
2075 else if (strcmp(errors,"ignore") == 0) {
2076 return 0;
2077 }
2078 else if (strcmp(errors,"replace") == 0) {
2079 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002080 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return 0;
2082 }
2083 else {
2084 PyErr_Format(PyExc_ValueError,
2085 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002086 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 errors);
2088 return -1;
2089 }
2090}
2091
2092PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2093 int size,
2094 const char *errors)
2095{
2096 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002097 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 repr = PyString_FromStringAndSize(NULL, size);
2100 if (repr == NULL)
2101 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002102 if (size == 0)
2103 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
2105 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002106 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 while (size-- > 0) {
2108 Py_UNICODE ch = *p++;
2109 if (ch >= 256) {
2110 if (latin1_encoding_error(&p, &s, errors,
2111 "ordinal not in range(256)"))
2112 goto onError;
2113 }
2114 else
2115 *s++ = (char)ch;
2116 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002117 /* Resize if error handling skipped some characters */
2118 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002119 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 return repr;
2121
2122 onError:
2123 Py_DECREF(repr);
2124 return NULL;
2125}
2126
2127PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2128{
2129 if (!PyUnicode_Check(unicode)) {
2130 PyErr_BadArgument();
2131 return NULL;
2132 }
2133 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2134 PyUnicode_GET_SIZE(unicode),
2135 NULL);
2136}
2137
2138/* --- 7-bit ASCII Codec -------------------------------------------------- */
2139
2140static
2141int ascii_decoding_error(const char **source,
2142 Py_UNICODE **dest,
2143 const char *errors,
2144 const char *details)
2145{
2146 if ((errors == NULL) ||
2147 (strcmp(errors,"strict") == 0)) {
2148 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002149 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 details);
2151 return -1;
2152 }
2153 else if (strcmp(errors,"ignore") == 0) {
2154 return 0;
2155 }
2156 else if (strcmp(errors,"replace") == 0) {
2157 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2158 (*dest)++;
2159 return 0;
2160 }
2161 else {
2162 PyErr_Format(PyExc_ValueError,
2163 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 errors);
2166 return -1;
2167 }
2168}
2169
2170PyObject *PyUnicode_DecodeASCII(const char *s,
2171 int size,
2172 const char *errors)
2173{
2174 PyUnicodeObject *v;
2175 Py_UNICODE *p;
2176
2177 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002178 if (size == 1 && *(unsigned char*)s < 128) {
2179 Py_UNICODE r = *(unsigned char*)s;
2180 return PyUnicode_FromUnicode(&r, 1);
2181 }
2182
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 v = _PyUnicode_New(size);
2184 if (v == NULL)
2185 goto onError;
2186 if (size == 0)
2187 return (PyObject *)v;
2188 p = PyUnicode_AS_UNICODE(v);
2189 while (size-- > 0) {
2190 register unsigned char c;
2191
2192 c = (unsigned char)*s++;
2193 if (c < 128)
2194 *p++ = c;
2195 else if (ascii_decoding_error(&s, &p, errors,
2196 "ordinal not in range(128)"))
2197 goto onError;
2198 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002199 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 return (PyObject *)v;
2203
2204 onError:
2205 Py_XDECREF(v);
2206 return NULL;
2207}
2208
2209static
2210int ascii_encoding_error(const Py_UNICODE **source,
2211 char **dest,
2212 const char *errors,
2213 const char *details)
2214{
2215 if ((errors == NULL) ||
2216 (strcmp(errors,"strict") == 0)) {
2217 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002218 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 details);
2220 return -1;
2221 }
2222 else if (strcmp(errors,"ignore") == 0) {
2223 return 0;
2224 }
2225 else if (strcmp(errors,"replace") == 0) {
2226 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002227 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return 0;
2229 }
2230 else {
2231 PyErr_Format(PyExc_ValueError,
2232 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002233 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 errors);
2235 return -1;
2236 }
2237}
2238
2239PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2240 int size,
2241 const char *errors)
2242{
2243 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002244 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002245
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 repr = PyString_FromStringAndSize(NULL, size);
2247 if (repr == NULL)
2248 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002249 if (size == 0)
2250 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251
2252 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002253 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 while (size-- > 0) {
2255 Py_UNICODE ch = *p++;
2256 if (ch >= 128) {
2257 if (ascii_encoding_error(&p, &s, errors,
2258 "ordinal not in range(128)"))
2259 goto onError;
2260 }
2261 else
2262 *s++ = (char)ch;
2263 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002264 /* Resize if error handling skipped some characters */
2265 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002266 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 return repr;
2268
2269 onError:
2270 Py_DECREF(repr);
2271 return NULL;
2272}
2273
2274PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode),
2282 NULL);
2283}
2284
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002285#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002286
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002287/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002288
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002289PyObject *PyUnicode_DecodeMBCS(const char *s,
2290 int size,
2291 const char *errors)
2292{
2293 PyUnicodeObject *v;
2294 Py_UNICODE *p;
2295
2296 /* First get the size of the result */
2297 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002298 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002299 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2300
2301 v = _PyUnicode_New(usize);
2302 if (v == NULL)
2303 return NULL;
2304 if (usize == 0)
2305 return (PyObject *)v;
2306 p = PyUnicode_AS_UNICODE(v);
2307 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2308 Py_DECREF(v);
2309 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2310 }
2311
2312 return (PyObject *)v;
2313}
2314
2315PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2316 int size,
2317 const char *errors)
2318{
2319 PyObject *repr;
2320 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002321 DWORD mbcssize;
2322
2323 /* If there are no characters, bail now! */
2324 if (size==0)
2325 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002326
2327 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002328 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002329 if (mbcssize==0)
2330 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2331
2332 repr = PyString_FromStringAndSize(NULL, mbcssize);
2333 if (repr == NULL)
2334 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002335 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002336 return repr;
2337
2338 /* Do the conversion */
2339 s = PyString_AS_STRING(repr);
2340 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2341 Py_DECREF(repr);
2342 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2343 }
2344 return repr;
2345}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002346
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002347#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349/* --- Character Mapping Codec -------------------------------------------- */
2350
2351static
2352int charmap_decoding_error(const char **source,
2353 Py_UNICODE **dest,
2354 const char *errors,
2355 const char *details)
2356{
2357 if ((errors == NULL) ||
2358 (strcmp(errors,"strict") == 0)) {
2359 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002360 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361 details);
2362 return -1;
2363 }
2364 else if (strcmp(errors,"ignore") == 0) {
2365 return 0;
2366 }
2367 else if (strcmp(errors,"replace") == 0) {
2368 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2369 (*dest)++;
2370 return 0;
2371 }
2372 else {
2373 PyErr_Format(PyExc_ValueError,
2374 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002375 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 errors);
2377 return -1;
2378 }
2379}
2380
2381PyObject *PyUnicode_DecodeCharmap(const char *s,
2382 int size,
2383 PyObject *mapping,
2384 const char *errors)
2385{
2386 PyUnicodeObject *v;
2387 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002388 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
2390 /* Default to Latin-1 */
2391 if (mapping == NULL)
2392 return PyUnicode_DecodeLatin1(s, size, errors);
2393
2394 v = _PyUnicode_New(size);
2395 if (v == NULL)
2396 goto onError;
2397 if (size == 0)
2398 return (PyObject *)v;
2399 p = PyUnicode_AS_UNICODE(v);
2400 while (size-- > 0) {
2401 unsigned char ch = *s++;
2402 PyObject *w, *x;
2403
2404 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2405 w = PyInt_FromLong((long)ch);
2406 if (w == NULL)
2407 goto onError;
2408 x = PyObject_GetItem(mapping, w);
2409 Py_DECREF(w);
2410 if (x == NULL) {
2411 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002412 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002414 x = Py_None;
2415 Py_INCREF(x);
2416 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
2419
2420 /* Apply mapping */
2421 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002422 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 if (value < 0 || value > 65535) {
2424 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002425 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 Py_DECREF(x);
2427 goto onError;
2428 }
2429 *p++ = (Py_UNICODE)value;
2430 }
2431 else if (x == Py_None) {
2432 /* undefined mapping */
2433 if (charmap_decoding_error(&s, &p, errors,
2434 "character maps to <undefined>")) {
2435 Py_DECREF(x);
2436 goto onError;
2437 }
2438 }
2439 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002440 int targetsize = PyUnicode_GET_SIZE(x);
2441
2442 if (targetsize == 1)
2443 /* 1-1 mapping */
2444 *p++ = *PyUnicode_AS_UNICODE(x);
2445
2446 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002448 if (targetsize > extrachars) {
2449 /* resize first */
2450 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2451 int needed = (targetsize - extrachars) + \
2452 (targetsize << 2);
2453 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002454 if (_PyUnicode_Resize(&v,
2455 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002456 Py_DECREF(x);
2457 goto onError;
2458 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002459 p = PyUnicode_AS_UNICODE(v) + oldpos;
2460 }
2461 Py_UNICODE_COPY(p,
2462 PyUnicode_AS_UNICODE(x),
2463 targetsize);
2464 p += targetsize;
2465 extrachars -= targetsize;
2466 }
2467 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
2469 else {
2470 /* wrong return value */
2471 PyErr_SetString(PyExc_TypeError,
2472 "character mapping must return integer, None or unicode");
2473 Py_DECREF(x);
2474 goto onError;
2475 }
2476 Py_DECREF(x);
2477 }
2478 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002479 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 goto onError;
2481 return (PyObject *)v;
2482
2483 onError:
2484 Py_XDECREF(v);
2485 return NULL;
2486}
2487
2488static
2489int charmap_encoding_error(const Py_UNICODE **source,
2490 char **dest,
2491 const char *errors,
2492 const char *details)
2493{
2494 if ((errors == NULL) ||
2495 (strcmp(errors,"strict") == 0)) {
2496 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002497 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 details);
2499 return -1;
2500 }
2501 else if (strcmp(errors,"ignore") == 0) {
2502 return 0;
2503 }
2504 else if (strcmp(errors,"replace") == 0) {
2505 **dest = '?';
2506 (*dest)++;
2507 return 0;
2508 }
2509 else {
2510 PyErr_Format(PyExc_ValueError,
2511 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002512 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 errors);
2514 return -1;
2515 }
2516}
2517
2518PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2519 int size,
2520 PyObject *mapping,
2521 const char *errors)
2522{
2523 PyObject *v;
2524 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002525 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526
2527 /* Default to Latin-1 */
2528 if (mapping == NULL)
2529 return PyUnicode_EncodeLatin1(p, size, errors);
2530
2531 v = PyString_FromStringAndSize(NULL, size);
2532 if (v == NULL)
2533 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002534 if (size == 0)
2535 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 s = PyString_AS_STRING(v);
2537 while (size-- > 0) {
2538 Py_UNICODE ch = *p++;
2539 PyObject *w, *x;
2540
2541 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2542 w = PyInt_FromLong((long)ch);
2543 if (w == NULL)
2544 goto onError;
2545 x = PyObject_GetItem(mapping, w);
2546 Py_DECREF(w);
2547 if (x == NULL) {
2548 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002549 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002551 x = Py_None;
2552 Py_INCREF(x);
2553 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002554 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 }
2556
2557 /* Apply mapping */
2558 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002559 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 if (value < 0 || value > 255) {
2561 PyErr_SetString(PyExc_TypeError,
2562 "character mapping must be in range(256)");
2563 Py_DECREF(x);
2564 goto onError;
2565 }
2566 *s++ = (char)value;
2567 }
2568 else if (x == Py_None) {
2569 /* undefined mapping */
2570 if (charmap_encoding_error(&p, &s, errors,
2571 "character maps to <undefined>")) {
2572 Py_DECREF(x);
2573 goto onError;
2574 }
2575 }
2576 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002577 int targetsize = PyString_GET_SIZE(x);
2578
2579 if (targetsize == 1)
2580 /* 1-1 mapping */
2581 *s++ = *PyString_AS_STRING(x);
2582
2583 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002585 if (targetsize > extrachars) {
2586 /* resize first */
2587 int oldpos = (int)(s - PyString_AS_STRING(v));
2588 int needed = (targetsize - extrachars) + \
2589 (targetsize << 2);
2590 extrachars += needed;
2591 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002592 Py_DECREF(x);
2593 goto onError;
2594 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002595 s = PyString_AS_STRING(v) + oldpos;
2596 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002597 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002598 s += targetsize;
2599 extrachars -= targetsize;
2600 }
2601 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
2603 else {
2604 /* wrong return value */
2605 PyErr_SetString(PyExc_TypeError,
2606 "character mapping must return integer, None or unicode");
2607 Py_DECREF(x);
2608 goto onError;
2609 }
2610 Py_DECREF(x);
2611 }
2612 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
Tim Peters5de98422002-04-27 18:44:32 +00002613 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 return v;
2615
2616 onError:
Tim Peters5de98422002-04-27 18:44:32 +00002617 Py_XDECREF(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 return NULL;
2619}
2620
2621PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2622 PyObject *mapping)
2623{
2624 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2625 PyErr_BadArgument();
2626 return NULL;
2627 }
2628 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2629 PyUnicode_GET_SIZE(unicode),
2630 mapping,
2631 NULL);
2632}
2633
2634static
2635int translate_error(const Py_UNICODE **source,
2636 Py_UNICODE **dest,
2637 const char *errors,
2638 const char *details)
2639{
2640 if ((errors == NULL) ||
2641 (strcmp(errors,"strict") == 0)) {
2642 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002643 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 details);
2645 return -1;
2646 }
2647 else if (strcmp(errors,"ignore") == 0) {
2648 return 0;
2649 }
2650 else if (strcmp(errors,"replace") == 0) {
2651 **dest = '?';
2652 (*dest)++;
2653 return 0;
2654 }
2655 else {
2656 PyErr_Format(PyExc_ValueError,
2657 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002658 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 errors);
2660 return -1;
2661 }
2662}
2663
2664PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2665 int size,
2666 PyObject *mapping,
2667 const char *errors)
2668{
2669 PyUnicodeObject *v;
2670 Py_UNICODE *p;
2671
2672 if (mapping == NULL) {
2673 PyErr_BadArgument();
2674 return NULL;
2675 }
2676
2677 /* Output will never be longer than input */
2678 v = _PyUnicode_New(size);
2679 if (v == NULL)
2680 goto onError;
2681 if (size == 0)
2682 goto done;
2683 p = PyUnicode_AS_UNICODE(v);
2684 while (size-- > 0) {
2685 Py_UNICODE ch = *s++;
2686 PyObject *w, *x;
2687
2688 /* Get mapping */
2689 w = PyInt_FromLong(ch);
2690 if (w == NULL)
2691 goto onError;
2692 x = PyObject_GetItem(mapping, w);
2693 Py_DECREF(w);
2694 if (x == NULL) {
2695 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2696 /* No mapping found: default to 1-1 mapping */
2697 PyErr_Clear();
2698 *p++ = ch;
2699 continue;
2700 }
2701 goto onError;
2702 }
2703
2704 /* Apply mapping */
2705 if (PyInt_Check(x))
2706 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2707 else if (x == Py_None) {
2708 /* undefined mapping */
2709 if (translate_error(&s, &p, errors,
2710 "character maps to <undefined>")) {
2711 Py_DECREF(x);
2712 goto onError;
2713 }
2714 }
2715 else if (PyUnicode_Check(x)) {
2716 if (PyUnicode_GET_SIZE(x) != 1) {
2717 /* 1-n mapping */
2718 PyErr_SetString(PyExc_NotImplementedError,
2719 "1-n mappings are currently not implemented");
2720 Py_DECREF(x);
2721 goto onError;
2722 }
2723 *p++ = *PyUnicode_AS_UNICODE(x);
2724 }
2725 else {
2726 /* wrong return value */
2727 PyErr_SetString(PyExc_TypeError,
2728 "translate mapping must return integer, None or unicode");
2729 Py_DECREF(x);
2730 goto onError;
2731 }
2732 Py_DECREF(x);
2733 }
2734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002735 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
2738 done:
2739 return (PyObject *)v;
2740
2741 onError:
2742 Py_XDECREF(v);
2743 return NULL;
2744}
2745
2746PyObject *PyUnicode_Translate(PyObject *str,
2747 PyObject *mapping,
2748 const char *errors)
2749{
2750 PyObject *result;
2751
2752 str = PyUnicode_FromObject(str);
2753 if (str == NULL)
2754 goto onError;
2755 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2756 PyUnicode_GET_SIZE(str),
2757 mapping,
2758 errors);
2759 Py_DECREF(str);
2760 return result;
2761
2762 onError:
2763 Py_XDECREF(str);
2764 return NULL;
2765}
2766
Guido van Rossum9e896b32000-04-05 20:11:21 +00002767/* --- Decimal Encoder ---------------------------------------------------- */
2768
2769int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2770 int length,
2771 char *output,
2772 const char *errors)
2773{
2774 Py_UNICODE *p, *end;
2775
2776 if (output == NULL) {
2777 PyErr_BadArgument();
2778 return -1;
2779 }
2780
2781 p = s;
2782 end = s + length;
2783 while (p < end) {
2784 register Py_UNICODE ch = *p++;
2785 int decimal;
2786
2787 if (Py_UNICODE_ISSPACE(ch)) {
2788 *output++ = ' ';
2789 continue;
2790 }
2791 decimal = Py_UNICODE_TODECIMAL(ch);
2792 if (decimal >= 0) {
2793 *output++ = '0' + decimal;
2794 continue;
2795 }
Guido van Rossumba477042000-04-06 18:18:10 +00002796 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002797 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002798 continue;
2799 }
2800 /* All other characters are considered invalid */
2801 if (errors == NULL || strcmp(errors, "strict") == 0) {
2802 PyErr_SetString(PyExc_ValueError,
2803 "invalid decimal Unicode string");
2804 goto onError;
2805 }
2806 else if (strcmp(errors, "ignore") == 0)
2807 continue;
2808 else if (strcmp(errors, "replace") == 0) {
2809 *output++ = '?';
2810 continue;
2811 }
2812 }
2813 /* 0-terminate the output string */
2814 *output++ = '\0';
2815 return 0;
2816
2817 onError:
2818 return -1;
2819}
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821/* --- Helpers ------------------------------------------------------------ */
2822
2823static
2824int count(PyUnicodeObject *self,
2825 int start,
2826 int end,
2827 PyUnicodeObject *substring)
2828{
2829 int count = 0;
2830
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002831 if (start < 0)
2832 start += self->length;
2833 if (start < 0)
2834 start = 0;
2835 if (end > self->length)
2836 end = self->length;
2837 if (end < 0)
2838 end += self->length;
2839 if (end < 0)
2840 end = 0;
2841
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002842 if (substring->length == 0)
2843 return (end - start + 1);
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 end -= substring->length;
2846
2847 while (start <= end)
2848 if (Py_UNICODE_MATCH(self, start, substring)) {
2849 count++;
2850 start += substring->length;
2851 } else
2852 start++;
2853
2854 return count;
2855}
2856
2857int PyUnicode_Count(PyObject *str,
2858 PyObject *substr,
2859 int start,
2860 int end)
2861{
2862 int result;
2863
2864 str = PyUnicode_FromObject(str);
2865 if (str == NULL)
2866 return -1;
2867 substr = PyUnicode_FromObject(substr);
2868 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002869 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 return -1;
2871 }
2872
2873 result = count((PyUnicodeObject *)str,
2874 start, end,
2875 (PyUnicodeObject *)substr);
2876
2877 Py_DECREF(str);
2878 Py_DECREF(substr);
2879 return result;
2880}
2881
2882static
2883int findstring(PyUnicodeObject *self,
2884 PyUnicodeObject *substring,
2885 int start,
2886 int end,
2887 int direction)
2888{
2889 if (start < 0)
2890 start += self->length;
2891 if (start < 0)
2892 start = 0;
2893
2894 if (substring->length == 0)
2895 return start;
2896
2897 if (end > self->length)
2898 end = self->length;
2899 if (end < 0)
2900 end += self->length;
2901 if (end < 0)
2902 end = 0;
2903
2904 end -= substring->length;
2905
2906 if (direction < 0) {
2907 for (; end >= start; end--)
2908 if (Py_UNICODE_MATCH(self, end, substring))
2909 return end;
2910 } else {
2911 for (; start <= end; start++)
2912 if (Py_UNICODE_MATCH(self, start, substring))
2913 return start;
2914 }
2915
2916 return -1;
2917}
2918
2919int PyUnicode_Find(PyObject *str,
2920 PyObject *substr,
2921 int start,
2922 int end,
2923 int direction)
2924{
2925 int result;
2926
2927 str = PyUnicode_FromObject(str);
2928 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002929 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 substr = PyUnicode_FromObject(substr);
2931 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00002932 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002933 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 }
2935
2936 result = findstring((PyUnicodeObject *)str,
2937 (PyUnicodeObject *)substr,
2938 start, end, direction);
2939 Py_DECREF(str);
2940 Py_DECREF(substr);
2941 return result;
2942}
2943
2944static
2945int tailmatch(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int start,
2948 int end,
2949 int direction)
2950{
2951 if (start < 0)
2952 start += self->length;
2953 if (start < 0)
2954 start = 0;
2955
2956 if (substring->length == 0)
2957 return 1;
2958
2959 if (end > self->length)
2960 end = self->length;
2961 if (end < 0)
2962 end += self->length;
2963 if (end < 0)
2964 end = 0;
2965
2966 end -= substring->length;
2967 if (end < start)
2968 return 0;
2969
2970 if (direction > 0) {
2971 if (Py_UNICODE_MATCH(self, end, substring))
2972 return 1;
2973 } else {
2974 if (Py_UNICODE_MATCH(self, start, substring))
2975 return 1;
2976 }
2977
2978 return 0;
2979}
2980
2981int PyUnicode_Tailmatch(PyObject *str,
2982 PyObject *substr,
2983 int start,
2984 int end,
2985 int direction)
2986{
2987 int result;
2988
2989 str = PyUnicode_FromObject(str);
2990 if (str == NULL)
2991 return -1;
2992 substr = PyUnicode_FromObject(substr);
2993 if (substr == NULL) {
2994 Py_DECREF(substr);
2995 return -1;
2996 }
2997
2998 result = tailmatch((PyUnicodeObject *)str,
2999 (PyUnicodeObject *)substr,
3000 start, end, direction);
3001 Py_DECREF(str);
3002 Py_DECREF(substr);
3003 return result;
3004}
3005
3006static
3007const Py_UNICODE *findchar(const Py_UNICODE *s,
3008 int size,
3009 Py_UNICODE ch)
3010{
3011 /* like wcschr, but doesn't stop at NULL characters */
3012
3013 while (size-- > 0) {
3014 if (*s == ch)
3015 return s;
3016 s++;
3017 }
3018
3019 return NULL;
3020}
3021
3022/* Apply fixfct filter to the Unicode object self and return a
3023 reference to the modified object */
3024
3025static
3026PyObject *fixup(PyUnicodeObject *self,
3027 int (*fixfct)(PyUnicodeObject *s))
3028{
3029
3030 PyUnicodeObject *u;
3031
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003032 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 if (u == NULL)
3034 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003035
3036 Py_UNICODE_COPY(u->str, self->str, self->length);
3037
Tim Peters7a29bd52001-09-12 03:03:31 +00003038 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 /* fixfct should return TRUE if it modified the buffer. If
3040 FALSE, return a reference to the original buffer instead
3041 (to save space, not time) */
3042 Py_INCREF(self);
3043 Py_DECREF(u);
3044 return (PyObject*) self;
3045 }
3046 return (PyObject*) u;
3047}
3048
3049static
3050int fixupper(PyUnicodeObject *self)
3051{
3052 int len = self->length;
3053 Py_UNICODE *s = self->str;
3054 int status = 0;
3055
3056 while (len-- > 0) {
3057 register Py_UNICODE ch;
3058
3059 ch = Py_UNICODE_TOUPPER(*s);
3060 if (ch != *s) {
3061 status = 1;
3062 *s = ch;
3063 }
3064 s++;
3065 }
3066
3067 return status;
3068}
3069
3070static
3071int fixlower(PyUnicodeObject *self)
3072{
3073 int len = self->length;
3074 Py_UNICODE *s = self->str;
3075 int status = 0;
3076
3077 while (len-- > 0) {
3078 register Py_UNICODE ch;
3079
3080 ch = Py_UNICODE_TOLOWER(*s);
3081 if (ch != *s) {
3082 status = 1;
3083 *s = ch;
3084 }
3085 s++;
3086 }
3087
3088 return status;
3089}
3090
3091static
3092int fixswapcase(PyUnicodeObject *self)
3093{
3094 int len = self->length;
3095 Py_UNICODE *s = self->str;
3096 int status = 0;
3097
3098 while (len-- > 0) {
3099 if (Py_UNICODE_ISUPPER(*s)) {
3100 *s = Py_UNICODE_TOLOWER(*s);
3101 status = 1;
3102 } else if (Py_UNICODE_ISLOWER(*s)) {
3103 *s = Py_UNICODE_TOUPPER(*s);
3104 status = 1;
3105 }
3106 s++;
3107 }
3108
3109 return status;
3110}
3111
3112static
3113int fixcapitalize(PyUnicodeObject *self)
3114{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003115 int len = self->length;
3116 Py_UNICODE *s = self->str;
3117 int status = 0;
3118
3119 if (len == 0)
3120 return 0;
3121 if (Py_UNICODE_ISLOWER(*s)) {
3122 *s = Py_UNICODE_TOUPPER(*s);
3123 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003125 s++;
3126 while (--len > 0) {
3127 if (Py_UNICODE_ISUPPER(*s)) {
3128 *s = Py_UNICODE_TOLOWER(*s);
3129 status = 1;
3130 }
3131 s++;
3132 }
3133 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134}
3135
3136static
3137int fixtitle(PyUnicodeObject *self)
3138{
3139 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3140 register Py_UNICODE *e;
3141 int previous_is_cased;
3142
3143 /* Shortcut for single character strings */
3144 if (PyUnicode_GET_SIZE(self) == 1) {
3145 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3146 if (*p != ch) {
3147 *p = ch;
3148 return 1;
3149 }
3150 else
3151 return 0;
3152 }
3153
3154 e = p + PyUnicode_GET_SIZE(self);
3155 previous_is_cased = 0;
3156 for (; p < e; p++) {
3157 register const Py_UNICODE ch = *p;
3158
3159 if (previous_is_cased)
3160 *p = Py_UNICODE_TOLOWER(ch);
3161 else
3162 *p = Py_UNICODE_TOTITLE(ch);
3163
3164 if (Py_UNICODE_ISLOWER(ch) ||
3165 Py_UNICODE_ISUPPER(ch) ||
3166 Py_UNICODE_ISTITLE(ch))
3167 previous_is_cased = 1;
3168 else
3169 previous_is_cased = 0;
3170 }
3171 return 1;
3172}
3173
3174PyObject *PyUnicode_Join(PyObject *separator,
3175 PyObject *seq)
3176{
3177 Py_UNICODE *sep;
3178 int seplen;
3179 PyUnicodeObject *res = NULL;
3180 int reslen = 0;
3181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 int sz = 100;
3183 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003184 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
Tim Peters2cfe3682001-05-05 05:36:48 +00003186 it = PyObject_GetIter(seq);
3187 if (it == NULL)
3188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189
3190 if (separator == NULL) {
3191 Py_UNICODE blank = ' ';
3192 sep = &blank;
3193 seplen = 1;
3194 }
3195 else {
3196 separator = PyUnicode_FromObject(separator);
3197 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 sep = PyUnicode_AS_UNICODE(separator);
3200 seplen = PyUnicode_GET_SIZE(separator);
3201 }
3202
3203 res = _PyUnicode_New(sz);
3204 if (res == NULL)
3205 goto onError;
3206 p = PyUnicode_AS_UNICODE(res);
3207 reslen = 0;
3208
Tim Peters2cfe3682001-05-05 05:36:48 +00003209 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003211 PyObject *item = PyIter_Next(it);
3212 if (item == NULL) {
3213 if (PyErr_Occurred())
3214 goto onError;
3215 break;
3216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 if (!PyUnicode_Check(item)) {
3218 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003219 if (!PyString_Check(item)) {
3220 PyErr_Format(PyExc_TypeError,
3221 "sequence item %i: expected string or Unicode,"
3222 " %.80s found",
3223 i, item->ob_type->tp_name);
3224 Py_DECREF(item);
3225 goto onError;
3226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 v = PyUnicode_FromObject(item);
3228 Py_DECREF(item);
3229 item = v;
3230 if (item == NULL)
3231 goto onError;
3232 }
3233 itemlen = PyUnicode_GET_SIZE(item);
3234 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003235 if (_PyUnicode_Resize(&res, sz*2)) {
3236 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 sz *= 2;
3240 p = PyUnicode_AS_UNICODE(res) + reslen;
3241 }
3242 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003243 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 p += seplen;
3245 reslen += seplen;
3246 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003247 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 p += itemlen;
3249 reslen += itemlen;
3250 Py_DECREF(item);
3251 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003252 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 goto onError;
3254
3255 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003256 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 return (PyObject *)res;
3258
3259 onError:
3260 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003261 Py_XDECREF(res);
3262 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 return NULL;
3264}
3265
3266static
3267PyUnicodeObject *pad(PyUnicodeObject *self,
3268 int left,
3269 int right,
3270 Py_UNICODE fill)
3271{
3272 PyUnicodeObject *u;
3273
3274 if (left < 0)
3275 left = 0;
3276 if (right < 0)
3277 right = 0;
3278
Tim Peters7a29bd52001-09-12 03:03:31 +00003279 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 Py_INCREF(self);
3281 return self;
3282 }
3283
3284 u = _PyUnicode_New(left + self->length + right);
3285 if (u) {
3286 if (left)
3287 Py_UNICODE_FILL(u->str, fill, left);
3288 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3289 if (right)
3290 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3291 }
3292
3293 return u;
3294}
3295
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on three names from the calling function: a
   PyObject pointer `str` used as scratch, the target `list`, and an
   `onError` label that is jumped to if the slice cannot be created or
   appended.

   The body is wrapped in do { } while (0) so the macro behaves as a
   single statement, and the arguments are parenthesized so that
   expression arguments expand safely.  Invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3306
3307static
3308PyObject *split_whitespace(PyUnicodeObject *self,
3309 PyObject *list,
3310 int maxcount)
3311{
3312 register int i;
3313 register int j;
3314 int len = self->length;
3315 PyObject *str;
3316
3317 for (i = j = 0; i < len; ) {
3318 /* find a token */
3319 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3320 i++;
3321 j = i;
3322 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3323 i++;
3324 if (j < i) {
3325 if (maxcount-- <= 0)
3326 break;
3327 SPLIT_APPEND(self->str, j, i);
3328 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3329 i++;
3330 j = i;
3331 }
3332 }
3333 if (j < len) {
3334 SPLIT_APPEND(self->str, j, len);
3335 }
3336 return list;
3337
3338 onError:
3339 Py_DECREF(list);
3340 return NULL;
3341}
3342
3343PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003344 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345{
3346 register int i;
3347 register int j;
3348 int len;
3349 PyObject *list;
3350 PyObject *str;
3351 Py_UNICODE *data;
3352
3353 string = PyUnicode_FromObject(string);
3354 if (string == NULL)
3355 return NULL;
3356 data = PyUnicode_AS_UNICODE(string);
3357 len = PyUnicode_GET_SIZE(string);
3358
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 list = PyList_New(0);
3360 if (!list)
3361 goto onError;
3362
3363 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003364 int eol;
3365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 /* Find a line and append it */
3367 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3368 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369
3370 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003371 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 if (i < len) {
3373 if (data[i] == '\r' && i + 1 < len &&
3374 data[i+1] == '\n')
3375 i += 2;
3376 else
3377 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003378 if (keepends)
3379 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
Guido van Rossum86662912000-04-11 15:38:46 +00003381 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 j = i;
3383 }
3384 if (j < len) {
3385 SPLIT_APPEND(data, j, len);
3386 }
3387
3388 Py_DECREF(string);
3389 return list;
3390
3391 onError:
3392 Py_DECREF(list);
3393 Py_DECREF(string);
3394 return NULL;
3395}
3396
3397static
3398PyObject *split_char(PyUnicodeObject *self,
3399 PyObject *list,
3400 Py_UNICODE ch,
3401 int maxcount)
3402{
3403 register int i;
3404 register int j;
3405 int len = self->length;
3406 PyObject *str;
3407
3408 for (i = j = 0; i < len; ) {
3409 if (self->str[i] == ch) {
3410 if (maxcount-- <= 0)
3411 break;
3412 SPLIT_APPEND(self->str, j, i);
3413 i = j = i + 1;
3414 } else
3415 i++;
3416 }
3417 if (j <= len) {
3418 SPLIT_APPEND(self->str, j, len);
3419 }
3420 return list;
3421
3422 onError:
3423 Py_DECREF(list);
3424 return NULL;
3425}
3426
3427static
3428PyObject *split_substring(PyUnicodeObject *self,
3429 PyObject *list,
3430 PyUnicodeObject *substring,
3431 int maxcount)
3432{
3433 register int i;
3434 register int j;
3435 int len = self->length;
3436 int sublen = substring->length;
3437 PyObject *str;
3438
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003439 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 if (Py_UNICODE_MATCH(self, i, substring)) {
3441 if (maxcount-- <= 0)
3442 break;
3443 SPLIT_APPEND(self->str, j, i);
3444 i = j = i + sublen;
3445 } else
3446 i++;
3447 }
3448 if (j <= len) {
3449 SPLIT_APPEND(self->str, j, len);
3450 }
3451 return list;
3452
3453 onError:
3454 Py_DECREF(list);
3455 return NULL;
3456}
3457
3458#undef SPLIT_APPEND
3459
3460static
3461PyObject *split(PyUnicodeObject *self,
3462 PyUnicodeObject *substring,
3463 int maxcount)
3464{
3465 PyObject *list;
3466
3467 if (maxcount < 0)
3468 maxcount = INT_MAX;
3469
3470 list = PyList_New(0);
3471 if (!list)
3472 return NULL;
3473
3474 if (substring == NULL)
3475 return split_whitespace(self,list,maxcount);
3476
3477 else if (substring->length == 1)
3478 return split_char(self,list,substring->str[0],maxcount);
3479
3480 else if (substring->length == 0) {
3481 Py_DECREF(list);
3482 PyErr_SetString(PyExc_ValueError, "empty separator");
3483 return NULL;
3484 }
3485 else
3486 return split_substring(self,list,substring,maxcount);
3487}
3488
3489static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490PyObject *replace(PyUnicodeObject *self,
3491 PyUnicodeObject *str1,
3492 PyUnicodeObject *str2,
3493 int maxcount)
3494{
3495 PyUnicodeObject *u;
3496
Guido van Rossumf36921c2002-08-09 15:36:48 +00003497 if (str1->length == 0) {
3498 PyErr_SetString(PyExc_ValueError, "empty pattern string");
3499 return NULL;
3500 }
3501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 if (maxcount < 0)
3503 maxcount = INT_MAX;
3504
3505 if (str1->length == 1 && str2->length == 1) {
3506 int i;
3507
3508 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003509 if (!findchar(self->str, self->length, str1->str[0]) &&
3510 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 /* nothing to replace, return original string */
3512 Py_INCREF(self);
3513 u = self;
3514 } else {
3515 Py_UNICODE u1 = str1->str[0];
3516 Py_UNICODE u2 = str2->str[0];
3517
3518 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003519 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 self->length
3521 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003522 if (u != NULL) {
3523 Py_UNICODE_COPY(u->str, self->str,
3524 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 for (i = 0; i < u->length; i++)
3526 if (u->str[i] == u1) {
3527 if (--maxcount < 0)
3528 break;
3529 u->str[i] = u2;
3530 }
3531 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
3534 } else {
3535 int n, i;
3536 Py_UNICODE *p;
3537
3538 /* replace strings */
3539 n = count(self, 0, self->length, str1);
3540 if (n > maxcount)
3541 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003542 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 /* nothing to replace, return original string */
3544 Py_INCREF(self);
3545 u = self;
3546 } else {
3547 u = _PyUnicode_New(
3548 self->length + n * (str2->length - str1->length));
3549 if (u) {
3550 i = 0;
3551 p = u->str;
3552 while (i <= self->length - str1->length)
3553 if (Py_UNICODE_MATCH(self, i, str1)) {
3554 /* replace string segment */
3555 Py_UNICODE_COPY(p, str2->str, str2->length);
3556 p += str2->length;
3557 i += str1->length;
3558 if (--n <= 0) {
3559 /* copy remaining part */
3560 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3561 break;
3562 }
3563 } else
3564 *p++ = self->str[i++];
3565 }
3566 }
3567 }
3568
3569 return (PyObject *) u;
3570}
3571
3572/* --- Unicode Object Methods --------------------------------------------- */
3573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003574PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575"S.title() -> unicode\n\
3576\n\
3577Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003578characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579
3580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003581unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 return fixup(self, fixtitle);
3584}
3585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003586PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587"S.capitalize() -> unicode\n\
3588\n\
3589Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003590have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591
3592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003593unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 return fixup(self, fixcapitalize);
3596}
3597
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0): split `self` at
   whitespace, capitalize every word and join the words again with
   the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
3635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003636PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637"S.center(width) -> unicode\n\
3638\n\
3639Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003640using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641
3642static PyObject *
3643unicode_center(PyUnicodeObject *self, PyObject *args)
3644{
3645 int marg, left;
3646 int width;
3647
3648 if (!PyArg_ParseTuple(args, "i:center", &width))
3649 return NULL;
3650
Tim Peters7a29bd52001-09-12 03:03:31 +00003651 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 Py_INCREF(self);
3653 return (PyObject*) self;
3654 }
3655
3656 marg = width - self->length;
3657 left = marg / 2 + (marg & width & 1);
3658
3659 return (PyObject*) pad(self, left, marg - left, ' ');
3660}
3661
Marc-André Lemburge5034372000-08-08 08:04:29 +00003662#if 0
3663
3664/* This code should go into some future Unicode collation support
3665 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003666 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003667
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003668/* speedy UTF-16 code point order comparison */
3669/* gleaned from: */
3670/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3671
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003672static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003673{
3674 0, 0, 0, 0, 0, 0, 0, 0,
3675 0, 0, 0, 0, 0, 0, 0, 0,
3676 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003677 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003678};
3679
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680static int
3681unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3682{
3683 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 Py_UNICODE *s1 = str1->str;
3686 Py_UNICODE *s2 = str2->str;
3687
3688 len1 = str1->length;
3689 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003690
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003692 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003693
3694 c1 = *s1++;
3695 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003696
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003697 if (c1 > (1<<11) * 26)
3698 c1 += utf16Fixup[c1>>11];
3699 if (c2 > (1<<11) * 26)
3700 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003701 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003702
3703 if (c1 != c2)
3704 return (c1 < c2) ? -1 : 1;
3705
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003706 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 }
3708
3709 return (len1 < len2) ? -1 : (len1 != len2);
3710}
3711
Marc-André Lemburge5034372000-08-08 08:04:29 +00003712#else
3713
3714static int
3715unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3716{
3717 register int len1, len2;
3718
3719 Py_UNICODE *s1 = str1->str;
3720 Py_UNICODE *s2 = str2->str;
3721
3722 len1 = str1->length;
3723 len2 = str2->length;
3724
3725 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003726 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003727
Fredrik Lundh45714e92001-06-26 16:39:36 +00003728 c1 = *s1++;
3729 c2 = *s2++;
3730
3731 if (c1 != c2)
3732 return (c1 < c2) ? -1 : 1;
3733
Marc-André Lemburge5034372000-08-08 08:04:29 +00003734 len1--; len2--;
3735 }
3736
3737 return (len1 < len2) ? -1 : (len1 != len2);
3738}
3739
3740#endif
3741
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742int PyUnicode_Compare(PyObject *left,
3743 PyObject *right)
3744{
3745 PyUnicodeObject *u = NULL, *v = NULL;
3746 int result;
3747
3748 /* Coerce the two arguments */
3749 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3750 if (u == NULL)
3751 goto onError;
3752 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3753 if (v == NULL)
3754 goto onError;
3755
Thomas Wouters7e474022000-07-16 12:04:32 +00003756 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 if (v == u) {
3758 Py_DECREF(u);
3759 Py_DECREF(v);
3760 return 0;
3761 }
3762
3763 result = unicode_compare(u, v);
3764
3765 Py_DECREF(u);
3766 Py_DECREF(v);
3767 return result;
3768
3769onError:
3770 Py_XDECREF(u);
3771 Py_XDECREF(v);
3772 return -1;
3773}
3774
Guido van Rossum403d68b2000-03-13 15:55:09 +00003775int PyUnicode_Contains(PyObject *container,
3776 PyObject *element)
3777{
3778 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00003779 int result, size;
3780 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00003781
3782 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003783 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003784 if (v == NULL) {
3785 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00003786 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003787 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003788 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003789 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3790 if (u == NULL) {
3791 Py_DECREF(v);
3792 goto onError;
3793 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003794
Barry Warsaw817918c2002-08-06 16:58:21 +00003795 size = PyUnicode_GET_SIZE(v);
3796 rhs = PyUnicode_AS_UNICODE(v);
3797 lhs = PyUnicode_AS_UNICODE(u);
3798
Guido van Rossum403d68b2000-03-13 15:55:09 +00003799 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00003800 if (size == 1) {
3801 end = lhs + PyUnicode_GET_SIZE(u);
3802 while (lhs < end) {
3803 if (*lhs++ == *rhs) {
3804 result = 1;
3805 break;
3806 }
3807 }
3808 }
3809 else {
3810 end = lhs + (PyUnicode_GET_SIZE(u) - size);
3811 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00003812 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00003813 result = 1;
3814 break;
3815 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003816 }
3817 }
3818
3819 Py_DECREF(u);
3820 Py_DECREF(v);
3821 return result;
3822
3823onError:
3824 Py_XDECREF(u);
3825 Py_XDECREF(v);
3826 return -1;
3827}
3828
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829/* Concat to string or Unicode object giving a new Unicode object. */
3830
3831PyObject *PyUnicode_Concat(PyObject *left,
3832 PyObject *right)
3833{
3834 PyUnicodeObject *u = NULL, *v = NULL, *w;
3835
3836 /* Coerce the two arguments */
3837 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3838 if (u == NULL)
3839 goto onError;
3840 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3841 if (v == NULL)
3842 goto onError;
3843
3844 /* Shortcuts */
3845 if (v == unicode_empty) {
3846 Py_DECREF(v);
3847 return (PyObject *)u;
3848 }
3849 if (u == unicode_empty) {
3850 Py_DECREF(u);
3851 return (PyObject *)v;
3852 }
3853
3854 /* Concat the two Unicode strings */
3855 w = _PyUnicode_New(u->length + v->length);
3856 if (w == NULL)
3857 goto onError;
3858 Py_UNICODE_COPY(w->str, u->str, u->length);
3859 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3860
3861 Py_DECREF(u);
3862 Py_DECREF(v);
3863 return (PyObject *)w;
3864
3865onError:
3866 Py_XDECREF(u);
3867 Py_XDECREF(v);
3868 return NULL;
3869}
3870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003871PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872"S.count(sub[, start[, end]]) -> int\n\
3873\n\
3874Return the number of occurrences of substring sub in Unicode string\n\
3875S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003876interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877
3878static PyObject *
3879unicode_count(PyUnicodeObject *self, PyObject *args)
3880{
3881 PyUnicodeObject *substring;
3882 int start = 0;
3883 int end = INT_MAX;
3884 PyObject *result;
3885
Guido van Rossumb8872e62000-05-09 14:14:27 +00003886 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3887 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 return NULL;
3889
3890 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3891 (PyObject *)substring);
3892 if (substring == NULL)
3893 return NULL;
3894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 if (start < 0)
3896 start += self->length;
3897 if (start < 0)
3898 start = 0;
3899 if (end > self->length)
3900 end = self->length;
3901 if (end < 0)
3902 end += self->length;
3903 if (end < 0)
3904 end = 0;
3905
3906 result = PyInt_FromLong((long) count(self, start, end, substring));
3907
3908 Py_DECREF(substring);
3909 return result;
3910}
3911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003912PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913"S.encode([encoding[,errors]]) -> string\n\
3914\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003915Return an encoded string version of S. Default encoding is the current\n\
3916default string encoding. errors may be given to set a different error\n\
3917handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003918a ValueError. Other possible values are 'ignore' and 'replace'.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919
3920static PyObject *
3921unicode_encode(PyUnicodeObject *self, PyObject *args)
3922{
3923 char *encoding = NULL;
3924 char *errors = NULL;
3925 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3926 return NULL;
3927 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3928}
3929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003930PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931"S.expandtabs([tabsize]) -> unicode\n\
3932\n\
3933Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003934If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935
3936static PyObject*
3937unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3938{
3939 Py_UNICODE *e;
3940 Py_UNICODE *p;
3941 Py_UNICODE *q;
3942 int i, j;
3943 PyUnicodeObject *u;
3944 int tabsize = 8;
3945
3946 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3947 return NULL;
3948
Thomas Wouters7e474022000-07-16 12:04:32 +00003949 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 i = j = 0;
3951 e = self->str + self->length;
3952 for (p = self->str; p < e; p++)
3953 if (*p == '\t') {
3954 if (tabsize > 0)
3955 j += tabsize - (j % tabsize);
3956 }
3957 else {
3958 j++;
3959 if (*p == '\n' || *p == '\r') {
3960 i += j;
3961 j = 0;
3962 }
3963 }
3964
3965 /* Second pass: create output string and fill it */
3966 u = _PyUnicode_New(i + j);
3967 if (!u)
3968 return NULL;
3969
3970 j = 0;
3971 q = u->str;
3972
3973 for (p = self->str; p < e; p++)
3974 if (*p == '\t') {
3975 if (tabsize > 0) {
3976 i = tabsize - (j % tabsize);
3977 j += i;
3978 while (i--)
3979 *q++ = ' ';
3980 }
3981 }
3982 else {
3983 j++;
3984 *q++ = *p;
3985 if (*p == '\n' || *p == '\r')
3986 j = 0;
3987 }
3988
3989 return (PyObject*) u;
3990}
3991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003992PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993"S.find(sub [,start [,end]]) -> int\n\
3994\n\
3995Return the lowest index in S where substring sub is found,\n\
3996such that sub is contained within s[start,end]. Optional\n\
3997arguments start and end are interpreted as in slice notation.\n\
3998\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003999Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000
4001static PyObject *
4002unicode_find(PyUnicodeObject *self, PyObject *args)
4003{
4004 PyUnicodeObject *substring;
4005 int start = 0;
4006 int end = INT_MAX;
4007 PyObject *result;
4008
Guido van Rossumb8872e62000-05-09 14:14:27 +00004009 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4010 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 return NULL;
4012 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4013 (PyObject *)substring);
4014 if (substring == NULL)
4015 return NULL;
4016
4017 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4018
4019 Py_DECREF(substring);
4020 return result;
4021}
4022
4023static PyObject *
4024unicode_getitem(PyUnicodeObject *self, int index)
4025{
4026 if (index < 0 || index >= self->length) {
4027 PyErr_SetString(PyExc_IndexError, "string index out of range");
4028 return NULL;
4029 }
4030
4031 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4032}
4033
4034static long
4035unicode_hash(PyUnicodeObject *self)
4036{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004037 /* Since Unicode objects compare equal to their ASCII string
4038 counterparts, they should use the individual character values
4039 as basis for their hash value. This is needed to assure that
4040 strings and Unicode objects behave in the same way as
4041 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042
Fredrik Lundhdde61642000-07-10 18:27:47 +00004043 register int len;
4044 register Py_UNICODE *p;
4045 register long x;
4046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 if (self->hash != -1)
4048 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004049 len = PyUnicode_GET_SIZE(self);
4050 p = PyUnicode_AS_UNICODE(self);
4051 x = *p << 7;
4052 while (--len >= 0)
4053 x = (1000003*x) ^ *p++;
4054 x ^= PyUnicode_GET_SIZE(self);
4055 if (x == -1)
4056 x = -2;
4057 self->hash = x;
4058 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059}
4060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004061PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062"S.index(sub [,start [,end]]) -> int\n\
4063\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004064Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065
4066static PyObject *
4067unicode_index(PyUnicodeObject *self, PyObject *args)
4068{
4069 int result;
4070 PyUnicodeObject *substring;
4071 int start = 0;
4072 int end = INT_MAX;
4073
Guido van Rossumb8872e62000-05-09 14:14:27 +00004074 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4075 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 return NULL;
4077
4078 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4079 (PyObject *)substring);
4080 if (substring == NULL)
4081 return NULL;
4082
4083 result = findstring(self, substring, start, end, 1);
4084
4085 Py_DECREF(substring);
4086 if (result < 0) {
4087 PyErr_SetString(PyExc_ValueError, "substring not found");
4088 return NULL;
4089 }
4090 return PyInt_FromLong(result);
4091}
4092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004093PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004094"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004096Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004097at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098
4099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004100unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101{
4102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4103 register const Py_UNICODE *e;
4104 int cased;
4105
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 /* Shortcut for single character strings */
4107 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004108 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004110 /* Special case for empty strings */
4111 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004112 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004113
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 e = p + PyUnicode_GET_SIZE(self);
4115 cased = 0;
4116 for (; p < e; p++) {
4117 register const Py_UNICODE ch = *p;
4118
4119 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004120 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 else if (!cased && Py_UNICODE_ISLOWER(ch))
4122 cased = 1;
4123 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004124 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125}
4126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004127PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004128"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004130Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004131at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132
4133static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004134unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
4136 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4137 register const Py_UNICODE *e;
4138 int cased;
4139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 /* Shortcut for single character strings */
4141 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004142 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004144 /* Special case for empty strings */
4145 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004146 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004147
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 e = p + PyUnicode_GET_SIZE(self);
4149 cased = 0;
4150 for (; p < e; p++) {
4151 register const Py_UNICODE ch = *p;
4152
4153 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004154 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 else if (!cased && Py_UNICODE_ISUPPER(ch))
4156 cased = 1;
4157 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004158 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159}
4160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004161PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004162"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004164Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4165characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004166only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167
4168static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004169unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170{
4171 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4172 register const Py_UNICODE *e;
4173 int cased, previous_is_cased;
4174
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 /* Shortcut for single character strings */
4176 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004177 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4178 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004180 /* Special case for empty strings */
4181 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004182 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004183
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 e = p + PyUnicode_GET_SIZE(self);
4185 cased = 0;
4186 previous_is_cased = 0;
4187 for (; p < e; p++) {
4188 register const Py_UNICODE ch = *p;
4189
4190 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4191 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004192 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 previous_is_cased = 1;
4194 cased = 1;
4195 }
4196 else if (Py_UNICODE_ISLOWER(ch)) {
4197 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004198 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 previous_is_cased = 1;
4200 cased = 1;
4201 }
4202 else
4203 previous_is_cased = 0;
4204 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004205 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206}
4207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004208PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004209"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004211Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004212False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213
4214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004215unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216{
4217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4218 register const Py_UNICODE *e;
4219
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220 /* Shortcut for single character strings */
4221 if (PyUnicode_GET_SIZE(self) == 1 &&
4222 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004223 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004225 /* Special case for empty strings */
4226 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004228
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 e = p + PyUnicode_GET_SIZE(self);
4230 for (; p < e; p++) {
4231 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004232 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004234 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235}
4236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004237PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004238"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004239\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004240Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004241and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004242
4243static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004244unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004245{
4246 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4247 register const Py_UNICODE *e;
4248
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004249 /* Shortcut for single character strings */
4250 if (PyUnicode_GET_SIZE(self) == 1 &&
4251 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004252 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004253
4254 /* Special case for empty strings */
4255 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004256 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004257
4258 e = p + PyUnicode_GET_SIZE(self);
4259 for (; p < e; p++) {
4260 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004261 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004262 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004263 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004264}
4265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004266PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004267"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004268\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004269Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004270and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004271
4272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004273unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274{
4275 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4276 register const Py_UNICODE *e;
4277
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004278 /* Shortcut for single character strings */
4279 if (PyUnicode_GET_SIZE(self) == 1 &&
4280 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004281 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004282
4283 /* Special case for empty strings */
4284 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004285 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004286
4287 e = p + PyUnicode_GET_SIZE(self);
4288 for (; p < e; p++) {
4289 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004290 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004291 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004292 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004293}
4294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004295PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004296"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004298Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004299False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300
4301static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004302unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303{
4304 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4305 register const Py_UNICODE *e;
4306
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 /* Shortcut for single character strings */
4308 if (PyUnicode_GET_SIZE(self) == 1 &&
4309 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004310 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004312 /* Special case for empty strings */
4313 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004314 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004315
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 e = p + PyUnicode_GET_SIZE(self);
4317 for (; p < e; p++) {
4318 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004319 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004321 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004324PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004325"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004327Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004328False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329
4330static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004331unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332{
4333 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4334 register const Py_UNICODE *e;
4335
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 /* Shortcut for single character strings */
4337 if (PyUnicode_GET_SIZE(self) == 1 &&
4338 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004339 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004341 /* Special case for empty strings */
4342 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004343 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004344
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 e = p + PyUnicode_GET_SIZE(self);
4346 for (; p < e; p++) {
4347 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004348 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351}
4352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004353PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004354"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004356Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004357False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358
4359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004360unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361{
4362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4363 register const Py_UNICODE *e;
4364
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 /* Shortcut for single character strings */
4366 if (PyUnicode_GET_SIZE(self) == 1 &&
4367 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004368 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004370 /* Special case for empty strings */
4371 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004372 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004373
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 e = p + PyUnicode_GET_SIZE(self);
4375 for (; p < e; p++) {
4376 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004377 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004379 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380}
4381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004382PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383"S.join(sequence) -> unicode\n\
4384\n\
4385Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004386sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387
4388static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004389unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004391 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392}
4393
4394static int
4395unicode_length(PyUnicodeObject *self)
4396{
4397 return self->length;
4398}
4399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004400PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401"S.ljust(width) -> unicode\n\
4402\n\
4403Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004404done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405
4406static PyObject *
4407unicode_ljust(PyUnicodeObject *self, PyObject *args)
4408{
4409 int width;
4410 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4411 return NULL;
4412
Tim Peters7a29bd52001-09-12 03:03:31 +00004413 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 Py_INCREF(self);
4415 return (PyObject*) self;
4416 }
4417
4418 return (PyObject*) pad(self, 0, width - self->length, ' ');
4419}
4420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004421PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422"S.lower() -> unicode\n\
4423\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004424Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425
4426static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004427unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 return fixup(self, fixlower);
4430}
4431
/* Selectors for which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple formats, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: its format string minus the leading "|O:" */
#define STRIPNAME(i) (stripformat[i]+3)
4440
4441static const Py_UNICODE *
4442unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4443{
Tim Peters030a5ce2002-04-22 19:00:10 +00004444 size_t i;
4445 for (i = 0; i < n; ++i)
4446 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004447 return s+i;
4448 return NULL;
4449}
4450
4451/* externally visible for str.strip(unicode) */
4452PyObject *
4453_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4454{
4455 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4456 int len = PyUnicode_GET_SIZE(self);
4457 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4458 int seplen = PyUnicode_GET_SIZE(sepobj);
4459 int i, j;
4460
4461 i = 0;
4462 if (striptype != RIGHTSTRIP) {
4463 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4464 i++;
4465 }
4466 }
4467
4468 j = len;
4469 if (striptype != LEFTSTRIP) {
4470 do {
4471 j--;
4472 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4473 j++;
4474 }
4475
4476 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4477 Py_INCREF(self);
4478 return (PyObject*)self;
4479 }
4480 else
4481 return PyUnicode_FromUnicode(s+i, j-i);
4482}
4483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484
4485static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004486do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004488 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4489 int len = PyUnicode_GET_SIZE(self), i, j;
4490
4491 i = 0;
4492 if (striptype != RIGHTSTRIP) {
4493 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4494 i++;
4495 }
4496 }
4497
4498 j = len;
4499 if (striptype != LEFTSTRIP) {
4500 do {
4501 j--;
4502 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4503 j++;
4504 }
4505
4506 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4507 Py_INCREF(self);
4508 return (PyObject*)self;
4509 }
4510 else
4511 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512}
4513
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004514
4515static PyObject *
4516do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4517{
4518 PyObject *sep = NULL;
4519
4520 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4521 return NULL;
4522
4523 if (sep != NULL && sep != Py_None) {
4524 if (PyUnicode_Check(sep))
4525 return _PyUnicode_XStrip(self, striptype, sep);
4526 else if (PyString_Check(sep)) {
4527 PyObject *res;
4528 sep = PyUnicode_FromObject(sep);
4529 if (sep==NULL)
4530 return NULL;
4531 res = _PyUnicode_XStrip(self, striptype, sep);
4532 Py_DECREF(sep);
4533 return res;
4534 }
4535 else {
4536 PyErr_Format(PyExc_TypeError,
4537 "%s arg must be None, unicode or str",
4538 STRIPNAME(striptype));
4539 return NULL;
4540 }
4541 }
4542
4543 return do_strip(self, striptype);
4544}
4545
4546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004547PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004548"S.strip([sep]) -> unicode\n\
4549\n\
4550Return a copy of the string S with leading and trailing\n\
4551whitespace removed.\n\
4552If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004553If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004554
4555static PyObject *
4556unicode_strip(PyUnicodeObject *self, PyObject *args)
4557{
4558 if (PyTuple_GET_SIZE(args) == 0)
4559 return do_strip(self, BOTHSTRIP); /* Common case */
4560 else
4561 return do_argstrip(self, BOTHSTRIP, args);
4562}
4563
4564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004565PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004566"S.lstrip([sep]) -> unicode\n\
4567\n\
4568Return a copy of the string S with leading whitespace removed.\n\
4569If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004570If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004571
4572static PyObject *
4573unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4574{
4575 if (PyTuple_GET_SIZE(args) == 0)
4576 return do_strip(self, LEFTSTRIP); /* Common case */
4577 else
4578 return do_argstrip(self, LEFTSTRIP, args);
4579}
4580
4581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004582PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004583"S.rstrip([sep]) -> unicode\n\
4584\n\
4585Return a copy of the string S with trailing whitespace removed.\n\
4586If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004587If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004588
4589static PyObject *
4590unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4591{
4592 if (PyTuple_GET_SIZE(args) == 0)
4593 return do_strip(self, RIGHTSTRIP); /* Common case */
4594 else
4595 return do_argstrip(self, RIGHTSTRIP, args);
4596}
4597
4598
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599static PyObject*
4600unicode_repeat(PyUnicodeObject *str, int len)
4601{
4602 PyUnicodeObject *u;
4603 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004604 int nchars;
4605 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606
4607 if (len < 0)
4608 len = 0;
4609
Tim Peters7a29bd52001-09-12 03:03:31 +00004610 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 /* no repeat, return original string */
4612 Py_INCREF(str);
4613 return (PyObject*) str;
4614 }
Tim Peters8f422462000-09-09 06:13:41 +00004615
4616 /* ensure # of chars needed doesn't overflow int and # of bytes
4617 * needed doesn't overflow size_t
4618 */
4619 nchars = len * str->length;
4620 if (len && nchars / len != str->length) {
4621 PyErr_SetString(PyExc_OverflowError,
4622 "repeated string is too long");
4623 return NULL;
4624 }
4625 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4626 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4627 PyErr_SetString(PyExc_OverflowError,
4628 "repeated string is too long");
4629 return NULL;
4630 }
4631 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 if (!u)
4633 return NULL;
4634
4635 p = u->str;
4636
4637 while (len-- > 0) {
4638 Py_UNICODE_COPY(p, str->str, str->length);
4639 p += str->length;
4640 }
4641
4642 return (PyObject*) u;
4643}
4644
4645PyObject *PyUnicode_Replace(PyObject *obj,
4646 PyObject *subobj,
4647 PyObject *replobj,
4648 int maxcount)
4649{
4650 PyObject *self;
4651 PyObject *str1;
4652 PyObject *str2;
4653 PyObject *result;
4654
4655 self = PyUnicode_FromObject(obj);
4656 if (self == NULL)
4657 return NULL;
4658 str1 = PyUnicode_FromObject(subobj);
4659 if (str1 == NULL) {
4660 Py_DECREF(self);
4661 return NULL;
4662 }
4663 str2 = PyUnicode_FromObject(replobj);
4664 if (str2 == NULL) {
4665 Py_DECREF(self);
4666 Py_DECREF(str1);
4667 return NULL;
4668 }
4669 result = replace((PyUnicodeObject *)self,
4670 (PyUnicodeObject *)str1,
4671 (PyUnicodeObject *)str2,
4672 maxcount);
4673 Py_DECREF(self);
4674 Py_DECREF(str1);
4675 Py_DECREF(str2);
4676 return result;
4677}
4678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004679PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680"S.replace (old, new[, maxsplit]) -> unicode\n\
4681\n\
4682Return a copy of S with all occurrences of substring\n\
4683old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004684given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686static PyObject*
4687unicode_replace(PyUnicodeObject *self, PyObject *args)
4688{
4689 PyUnicodeObject *str1;
4690 PyUnicodeObject *str2;
4691 int maxcount = -1;
4692 PyObject *result;
4693
4694 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4695 return NULL;
4696 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4697 if (str1 == NULL)
4698 return NULL;
4699 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4700 if (str2 == NULL)
4701 return NULL;
4702
4703 result = replace(self, str1, str2, maxcount);
4704
4705 Py_DECREF(str1);
4706 Py_DECREF(str2);
4707 return result;
4708}
4709
4710static
4711PyObject *unicode_repr(PyObject *unicode)
4712{
4713 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4714 PyUnicode_GET_SIZE(unicode),
4715 1);
4716}
4717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004718PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719"S.rfind(sub [,start [,end]]) -> int\n\
4720\n\
4721Return the highest index in S where substring sub is found,\n\
4722such that sub is contained within s[start,end]. Optional\n\
4723arguments start and end are interpreted as in slice notation.\n\
4724\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004725Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
4727static PyObject *
4728unicode_rfind(PyUnicodeObject *self, PyObject *args)
4729{
4730 PyUnicodeObject *substring;
4731 int start = 0;
4732 int end = INT_MAX;
4733 PyObject *result;
4734
Guido van Rossumb8872e62000-05-09 14:14:27 +00004735 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4736 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 return NULL;
4738 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4739 (PyObject *)substring);
4740 if (substring == NULL)
4741 return NULL;
4742
4743 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4744
4745 Py_DECREF(substring);
4746 return result;
4747}
4748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004749PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750"S.rindex(sub [,start [,end]]) -> int\n\
4751\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004752Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
4754static PyObject *
4755unicode_rindex(PyUnicodeObject *self, PyObject *args)
4756{
4757 int result;
4758 PyUnicodeObject *substring;
4759 int start = 0;
4760 int end = INT_MAX;
4761
Guido van Rossumb8872e62000-05-09 14:14:27 +00004762 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4763 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 return NULL;
4765 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4766 (PyObject *)substring);
4767 if (substring == NULL)
4768 return NULL;
4769
4770 result = findstring(self, substring, start, end, -1);
4771
4772 Py_DECREF(substring);
4773 if (result < 0) {
4774 PyErr_SetString(PyExc_ValueError, "substring not found");
4775 return NULL;
4776 }
4777 return PyInt_FromLong(result);
4778}
4779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004780PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781"S.rjust(width) -> unicode\n\
4782\n\
4783Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004784done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785
4786static PyObject *
4787unicode_rjust(PyUnicodeObject *self, PyObject *args)
4788{
4789 int width;
4790 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4791 return NULL;
4792
Tim Peters7a29bd52001-09-12 03:03:31 +00004793 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 Py_INCREF(self);
4795 return (PyObject*) self;
4796 }
4797
4798 return (PyObject*) pad(self, width - self->length, 0, ' ');
4799}
4800
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801static PyObject*
4802unicode_slice(PyUnicodeObject *self, int start, int end)
4803{
4804 /* standard clamping */
4805 if (start < 0)
4806 start = 0;
4807 if (end < 0)
4808 end = 0;
4809 if (end > self->length)
4810 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004811 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 /* full slice, return original string */
4813 Py_INCREF(self);
4814 return (PyObject*) self;
4815 }
4816 if (start > end)
4817 start = end;
4818 /* copy slice */
4819 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4820 end - start);
4821}
4822
4823PyObject *PyUnicode_Split(PyObject *s,
4824 PyObject *sep,
4825 int maxsplit)
4826{
4827 PyObject *result;
4828
4829 s = PyUnicode_FromObject(s);
4830 if (s == NULL)
4831 return NULL;
4832 if (sep != NULL) {
4833 sep = PyUnicode_FromObject(sep);
4834 if (sep == NULL) {
4835 Py_DECREF(s);
4836 return NULL;
4837 }
4838 }
4839
4840 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4841
4842 Py_DECREF(s);
4843 Py_XDECREF(sep);
4844 return result;
4845}
4846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004847PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848"S.split([sep [,maxsplit]]) -> list of strings\n\
4849\n\
4850Return a list of the words in S, using sep as the\n\
4851delimiter string. If maxsplit is given, at most maxsplit\n\
4852splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004853is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
4855static PyObject*
4856unicode_split(PyUnicodeObject *self, PyObject *args)
4857{
4858 PyObject *substring = Py_None;
4859 int maxcount = -1;
4860
4861 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4862 return NULL;
4863
4864 if (substring == Py_None)
4865 return split(self, NULL, maxcount);
4866 else if (PyUnicode_Check(substring))
4867 return split(self, (PyUnicodeObject *)substring, maxcount);
4868 else
4869 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4870}
4871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004872PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00004873"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874\n\
4875Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004876Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004877is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878
4879static PyObject*
4880unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4881{
Guido van Rossum86662912000-04-11 15:38:46 +00004882 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883
Guido van Rossum86662912000-04-11 15:38:46 +00004884 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 return NULL;
4886
Guido van Rossum86662912000-04-11 15:38:46 +00004887 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888}
4889
4890static
4891PyObject *unicode_str(PyUnicodeObject *self)
4892{
Fred Drakee4315f52000-05-09 19:53:39 +00004893 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894}
4895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004896PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897"S.swapcase() -> unicode\n\
4898\n\
4899Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004900and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901
4902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004903unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 return fixup(self, fixswapcase);
4906}
4907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004908PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909"S.translate(table) -> unicode\n\
4910\n\
4911Return a copy of the string S, where all characters have been mapped\n\
4912through the given translation table, which must be a mapping of\n\
4913Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914are left untouched. Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915
4916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004917unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 return PyUnicode_TranslateCharmap(self->str,
4920 self->length,
4921 table,
4922 "ignore");
4923}
4924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004925PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926"S.upper() -> unicode\n\
4927\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004928Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929
4930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004931unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 return fixup(self, fixupper);
4934}
4935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004936PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937"S.zfill(width) -> unicode\n\
4938\n\
4939Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004940of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941
4942static PyObject *
4943unicode_zfill(PyUnicodeObject *self, PyObject *args)
4944{
4945 int fill;
4946 PyUnicodeObject *u;
4947
4948 int width;
4949 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4950 return NULL;
4951
4952 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004953 if (PyUnicode_CheckExact(self)) {
4954 Py_INCREF(self);
4955 return (PyObject*) self;
4956 }
4957 else
4958 return PyUnicode_FromUnicode(
4959 PyUnicode_AS_UNICODE(self),
4960 PyUnicode_GET_SIZE(self)
4961 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 }
4963
4964 fill = width - self->length;
4965
4966 u = pad(self, fill, 0, '0');
4967
Walter Dörwald068325e2002-04-15 13:36:47 +00004968 if (u == NULL)
4969 return NULL;
4970
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 if (u->str[fill] == '+' || u->str[fill] == '-') {
4972 /* move sign to beginning of string */
4973 u->str[0] = u->str[fill];
4974 u->str[fill] = '0';
4975 }
4976
4977 return (PyObject*) u;
4978}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python int; the matching (also disabled) method-table entry describes
   it as a debugging aid for the implementation. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004988PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004989"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004991Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004993comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
4995static PyObject *
4996unicode_startswith(PyUnicodeObject *self,
4997 PyObject *args)
4998{
4999 PyUnicodeObject *substring;
5000 int start = 0;
5001 int end = INT_MAX;
5002 PyObject *result;
5003
Guido van Rossumb8872e62000-05-09 14:14:27 +00005004 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5005 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 return NULL;
5007 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5008 (PyObject *)substring);
5009 if (substring == NULL)
5010 return NULL;
5011
Guido van Rossum77f6a652002-04-03 22:41:51 +00005012 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013
5014 Py_DECREF(substring);
5015 return result;
5016}
5017
5018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005019PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005020"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005022Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005024comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025
5026static PyObject *
5027unicode_endswith(PyUnicodeObject *self,
5028 PyObject *args)
5029{
5030 PyUnicodeObject *substring;
5031 int start = 0;
5032 int end = INT_MAX;
5033 PyObject *result;
5034
Guido van Rossumb8872e62000-05-09 14:14:27 +00005035 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5036 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 return NULL;
5038 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5039 (PyObject *)substring);
5040 if (substring == NULL)
5041 return NULL;
5042
Guido van Rossum77f6a652002-04-03 22:41:51 +00005043 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044
5045 Py_DECREF(substring);
5046 return result;
5047}
5048
5049
/* PyMethodDef table of the unicode methods.  Each entry gives the
   Python-level method name, the C function implementing it, the calling
   convention flag (METH_VARARGS, METH_O or METH_NOARGS) and the
   docstring.  The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    /* Disabled entry; kept compiled out. */
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {NULL, NULL}
};
5102
/* Sequence-protocol slot table for unicode objects.  Item and slice
   assignment are left as 0 (not provided). */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
5113
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005114static PyObject*
5115unicode_subscript(PyUnicodeObject* self, PyObject* item)
5116{
5117 if (PyInt_Check(item)) {
5118 long i = PyInt_AS_LONG(item);
5119 if (i < 0)
5120 i += PyString_GET_SIZE(self);
5121 return unicode_getitem(self, i);
5122 } else if (PyLong_Check(item)) {
5123 long i = PyLong_AsLong(item);
5124 if (i == -1 && PyErr_Occurred())
5125 return NULL;
5126 if (i < 0)
5127 i += PyString_GET_SIZE(self);
5128 return unicode_getitem(self, i);
5129 } else if (PySlice_Check(item)) {
5130 int start, stop, step, slicelength, cur, i;
5131 Py_UNICODE* source_buf;
5132 Py_UNICODE* result_buf;
5133 PyObject* result;
5134
5135 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5136 &start, &stop, &step, &slicelength) < 0) {
5137 return NULL;
5138 }
5139
5140 if (slicelength <= 0) {
5141 return PyUnicode_FromUnicode(NULL, 0);
5142 } else {
5143 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5144 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5145
5146 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5147 result_buf[i] = source_buf[cur];
5148 }
5149
5150 result = PyUnicode_FromUnicode(result_buf, slicelength);
5151 PyMem_FREE(result_buf);
5152 return result;
5153 }
5154 } else {
5155 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5156 return NULL;
5157 }
5158}
5159
/* Mapping-protocol slot table for unicode objects: length and subscript
   are provided, subscript assignment is left as 0 (not provided). */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length, /* mp_length */
    (binaryfunc)unicode_subscript, /* mp_subscript */
    (objobjargproc)0, /* mp_ass_subscript */
};
5165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166static int
5167unicode_buffer_getreadbuf(PyUnicodeObject *self,
5168 int index,
5169 const void **ptr)
5170{
5171 if (index != 0) {
5172 PyErr_SetString(PyExc_SystemError,
5173 "accessing non-existent unicode segment");
5174 return -1;
5175 }
5176 *ptr = (void *) self->str;
5177 return PyUnicode_GET_DATA_SIZE(self);
5178}
5179
5180static int
5181unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5182 const void **ptr)
5183{
5184 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005185 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return -1;
5187}
5188
5189static int
5190unicode_buffer_getsegcount(PyUnicodeObject *self,
5191 int *lenp)
5192{
5193 if (lenp)
5194 *lenp = PyUnicode_GET_DATA_SIZE(self);
5195 return 1;
5196}
5197
5198static int
5199unicode_buffer_getcharbuf(PyUnicodeObject *self,
5200 int index,
5201 const void **ptr)
5202{
5203 PyObject *str;
5204
5205 if (index != 0) {
5206 PyErr_SetString(PyExc_SystemError,
5207 "accessing non-existent unicode segment");
5208 return -1;
5209 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005210 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 if (str == NULL)
5212 return -1;
5213 *ptr = (void *) PyString_AS_STRING(str);
5214 return PyString_GET_SIZE(str);
5215}
5216
5217/* Helpers for PyUnicode_Format() */
5218
5219static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005220getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
5222 int argidx = *p_argidx;
5223 if (argidx < arglen) {
5224 (*p_argidx)++;
5225 if (arglen < 0)
5226 return args;
5227 else
5228 return PyTuple_GetItem(args, argidx);
5229 }
5230 PyErr_SetString(PyExc_TypeError,
5231 "not enough arguments for format string");
5232 return NULL;
5233}
5234
/* Bit flags for the formatting helpers; each occupies its own bit so
   they can be OR-ed together.  F_ALT selects the '#' form in
   formatfloat()/formatint().
   NOTE(review): the other names presumably mirror the printf flags
   '-', '+', ' ' and '0' -- confirm against PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5240
5241static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243{
5244 register int i;
5245 int len;
5246 va_list va;
5247 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
5250 /* First, format the string as char array, then expand to Py_UNICODE
5251 array. */
5252 charbuffer = (char *)buffer;
5253 len = vsprintf(charbuffer, format, va);
5254 for (i = len - 1; i >= 0; i--)
5255 buffer[i] = (Py_UNICODE) charbuffer[i];
5256
5257 va_end(va);
5258 return len;
5259}
5260
Guido van Rossum078151d2002-08-11 04:24:12 +00005261/* XXX To save some code duplication, formatfloat/long/int could have been
5262 shared with stringobject.c, converting from 8-bit to Unicode after the
5263 formatting is done. */
5264
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265static int
5266formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005267 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 int flags,
5269 int prec,
5270 int type,
5271 PyObject *v)
5272{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005273 /* fmt = '%#.' + `prec` + `type`
5274 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 char fmt[20];
5276 double x;
5277
5278 x = PyFloat_AsDouble(v);
5279 if (x == -1.0 && PyErr_Occurred())
5280 return -1;
5281 if (prec < 0)
5282 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5284 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005285 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5286 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005287 /* worst case length calc to ensure no buffer overrun:
5288 fmt = %#.<prec>g
5289 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5290 for any double rep.)
5291 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5292 If prec=0 the effective precision is 1 (the leading digit is
5293 always given), therefore increase by one to 10+prec. */
5294 if (buflen <= (size_t)10 + (size_t)prec) {
5295 PyErr_SetString(PyExc_OverflowError,
5296 "formatted float is too long (precision too long?)");
5297 return -1;
5298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 return usprintf(buf, fmt, x);
5300}
5301
Tim Peters38fd5b62000-09-21 05:43:11 +00005302static PyObject*
5303formatlong(PyObject *val, int flags, int prec, int type)
5304{
5305 char *buf;
5306 int i, len;
5307 PyObject *str; /* temporary string object. */
5308 PyUnicodeObject *result;
5309
5310 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5311 if (!str)
5312 return NULL;
5313 result = _PyUnicode_New(len);
5314 for (i = 0; i < len; i++)
5315 result->str[i] = buf[i];
5316 result->str[len] = 0;
5317 Py_DECREF(str);
5318 return (PyObject*)result;
5319}
5320
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321static int
5322formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005323 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 int flags,
5325 int prec,
5326 int type,
5327 PyObject *v)
5328{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005329 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005330 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5331 * + 1 + 1
5332 * = 24
5333 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005334 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 long x;
5336
5337 x = PyInt_AsLong(v);
5338 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005339 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00005340 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00005341 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00005342 "%u/%o/%x/%X of negative int will return "
5343 "a signed string in Python 2.4 and up") < 0)
5344 return -1;
5345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005347 prec = 1;
5348
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005349 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005350 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5351 */
5352 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005353 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005354 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005355 return -1;
5356 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005357
5358 if ((flags & F_ALT) &&
5359 (type == 'x' || type == 'X')) {
5360 /* When converting under %#x or %#X, there are a number
5361 * of issues that cause pain:
5362 * - when 0 is being converted, the C standard leaves off
5363 * the '0x' or '0X', which is inconsistent with other
5364 * %#x/%#X conversions and inconsistent with Python's
5365 * hex() function
5366 * - there are platforms that violate the standard and
5367 * convert 0 with the '0x' or '0X'
5368 * (Metrowerks, Compaq Tru64)
5369 * - there are platforms that give '0x' when converting
5370 * under %#X, but convert 0 in accordance with the
5371 * standard (OS/2 EMX)
5372 *
5373 * We can achieve the desired consistency by inserting our
5374 * own '0x' or '0X' prefix, and substituting %x/%X in place
5375 * of %#x/%#X.
5376 *
5377 * Note that this is the same approach as used in
5378 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005379 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005380 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5381 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005382 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005383 else {
5384 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5385 (flags&F_ALT) ? "#" : "",
5386 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 return usprintf(buf, fmt, x);
5389}
5390
5391static int
5392formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005393 size_t buflen,
5394 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005396 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005397 if (PyUnicode_Check(v)) {
5398 if (PyUnicode_GET_SIZE(v) != 1)
5399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005403 else if (PyString_Check(v)) {
5404 if (PyString_GET_SIZE(v) != 1)
5405 goto onError;
5406 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
5409 else {
5410 /* Integer input truncated to a character */
5411 long x;
5412 x = PyInt_AsLong(v);
5413 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005414 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00005415#ifdef Py_UNICODE_WIDE
5416 if (x < 0 || x > 0x10ffff) {
5417 PyErr_SetString(PyExc_ValueError,
5418 "%c arg not in range(0x110000) "
5419 "(wide Python build)");
5420 return -1;
5421 }
5422#else
5423 if (x < 0 || x > 0xffff) {
5424 PyErr_SetString(PyExc_ValueError,
5425 "%c arg not in range(0x10000) "
5426 "(narrow Python build)");
5427 return -1;
5428 }
5429#endif
5430 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
5432 buf[1] = '\0';
5433 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005434
5435 onError:
5436 PyErr_SetString(PyExc_TypeError,
5437 "%c requires int or char");
5438 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439}
5440
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does its own bounds checking to
   ensure no overflow, but a better solution may be to malloc a buffer
   of the appropriate size for each format.  For now, the current
   solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (for example as the
   operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451PyObject *PyUnicode_Format(PyObject *format,
5452 PyObject *args)
5453{
5454 Py_UNICODE *fmt, *res;
5455 int fmtcnt, rescnt, reslen, arglen, argidx;
5456 int args_owned = 0;
5457 PyUnicodeObject *result = NULL;
5458 PyObject *dict = NULL;
5459 PyObject *uformat;
5460
5461 if (format == NULL || args == NULL) {
5462 PyErr_BadInternalCall();
5463 return NULL;
5464 }
5465 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005466 if (uformat == NULL)
5467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 fmt = PyUnicode_AS_UNICODE(uformat);
5469 fmtcnt = PyUnicode_GET_SIZE(uformat);
5470
5471 reslen = rescnt = fmtcnt + 100;
5472 result = _PyUnicode_New(reslen);
5473 if (result == NULL)
5474 goto onError;
5475 res = PyUnicode_AS_UNICODE(result);
5476
5477 if (PyTuple_Check(args)) {
5478 arglen = PyTuple_Size(args);
5479 argidx = 0;
5480 }
5481 else {
5482 arglen = -1;
5483 argidx = -2;
5484 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005485 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 dict = args;
5487
5488 while (--fmtcnt >= 0) {
5489 if (*fmt != '%') {
5490 if (--rescnt < 0) {
5491 rescnt = fmtcnt + 100;
5492 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005493 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 return NULL;
5495 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5496 --rescnt;
5497 }
5498 *res++ = *fmt++;
5499 }
5500 else {
5501 /* Got a format specifier */
5502 int flags = 0;
5503 int width = -1;
5504 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 Py_UNICODE c = '\0';
5506 Py_UNICODE fill;
5507 PyObject *v = NULL;
5508 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005509 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_UNICODE sign;
5511 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005512 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
5514 fmt++;
5515 if (*fmt == '(') {
5516 Py_UNICODE *keystart;
5517 int keylen;
5518 PyObject *key;
5519 int pcount = 1;
5520
5521 if (dict == NULL) {
5522 PyErr_SetString(PyExc_TypeError,
5523 "format requires a mapping");
5524 goto onError;
5525 }
5526 ++fmt;
5527 --fmtcnt;
5528 keystart = fmt;
5529 /* Skip over balanced parentheses */
5530 while (pcount > 0 && --fmtcnt >= 0) {
5531 if (*fmt == ')')
5532 --pcount;
5533 else if (*fmt == '(')
5534 ++pcount;
5535 fmt++;
5536 }
5537 keylen = fmt - keystart - 1;
5538 if (fmtcnt < 0 || pcount > 0) {
5539 PyErr_SetString(PyExc_ValueError,
5540 "incomplete format key");
5541 goto onError;
5542 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005543#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005544 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 then looked up since Python uses strings to hold
5546 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005547 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 key = PyUnicode_EncodeUTF8(keystart,
5549 keylen,
5550 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005551#else
5552 key = PyUnicode_FromUnicode(keystart, keylen);
5553#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 if (key == NULL)
5555 goto onError;
5556 if (args_owned) {
5557 Py_DECREF(args);
5558 args_owned = 0;
5559 }
5560 args = PyObject_GetItem(dict, key);
5561 Py_DECREF(key);
5562 if (args == NULL) {
5563 goto onError;
5564 }
5565 args_owned = 1;
5566 arglen = -1;
5567 argidx = -2;
5568 }
5569 while (--fmtcnt >= 0) {
5570 switch (c = *fmt++) {
5571 case '-': flags |= F_LJUST; continue;
5572 case '+': flags |= F_SIGN; continue;
5573 case ' ': flags |= F_BLANK; continue;
5574 case '#': flags |= F_ALT; continue;
5575 case '0': flags |= F_ZERO; continue;
5576 }
5577 break;
5578 }
5579 if (c == '*') {
5580 v = getnextarg(args, arglen, &argidx);
5581 if (v == NULL)
5582 goto onError;
5583 if (!PyInt_Check(v)) {
5584 PyErr_SetString(PyExc_TypeError,
5585 "* wants int");
5586 goto onError;
5587 }
5588 width = PyInt_AsLong(v);
5589 if (width < 0) {
5590 flags |= F_LJUST;
5591 width = -width;
5592 }
5593 if (--fmtcnt >= 0)
5594 c = *fmt++;
5595 }
5596 else if (c >= '0' && c <= '9') {
5597 width = c - '0';
5598 while (--fmtcnt >= 0) {
5599 c = *fmt++;
5600 if (c < '0' || c > '9')
5601 break;
5602 if ((width*10) / 10 != width) {
5603 PyErr_SetString(PyExc_ValueError,
5604 "width too big");
5605 goto onError;
5606 }
5607 width = width*10 + (c - '0');
5608 }
5609 }
5610 if (c == '.') {
5611 prec = 0;
5612 if (--fmtcnt >= 0)
5613 c = *fmt++;
5614 if (c == '*') {
5615 v = getnextarg(args, arglen, &argidx);
5616 if (v == NULL)
5617 goto onError;
5618 if (!PyInt_Check(v)) {
5619 PyErr_SetString(PyExc_TypeError,
5620 "* wants int");
5621 goto onError;
5622 }
5623 prec = PyInt_AsLong(v);
5624 if (prec < 0)
5625 prec = 0;
5626 if (--fmtcnt >= 0)
5627 c = *fmt++;
5628 }
5629 else if (c >= '0' && c <= '9') {
5630 prec = c - '0';
5631 while (--fmtcnt >= 0) {
5632 c = Py_CHARMASK(*fmt++);
5633 if (c < '0' || c > '9')
5634 break;
5635 if ((prec*10) / 10 != prec) {
5636 PyErr_SetString(PyExc_ValueError,
5637 "prec too big");
5638 goto onError;
5639 }
5640 prec = prec*10 + (c - '0');
5641 }
5642 }
5643 } /* prec */
5644 if (fmtcnt >= 0) {
5645 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 if (--fmtcnt >= 0)
5647 c = *fmt++;
5648 }
5649 }
5650 if (fmtcnt < 0) {
5651 PyErr_SetString(PyExc_ValueError,
5652 "incomplete format");
5653 goto onError;
5654 }
5655 if (c != '%') {
5656 v = getnextarg(args, arglen, &argidx);
5657 if (v == NULL)
5658 goto onError;
5659 }
5660 sign = 0;
5661 fill = ' ';
5662 switch (c) {
5663
5664 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005665 pbuf = formatbuf;
5666 /* presume that buffer length is at least 1 */
5667 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 len = 1;
5669 break;
5670
5671 case 's':
5672 case 'r':
5673 if (PyUnicode_Check(v) && c == 's') {
5674 temp = v;
5675 Py_INCREF(temp);
5676 }
5677 else {
5678 PyObject *unicode;
5679 if (c == 's')
5680 temp = PyObject_Str(v);
5681 else
5682 temp = PyObject_Repr(v);
5683 if (temp == NULL)
5684 goto onError;
5685 if (!PyString_Check(temp)) {
5686 /* XXX Note: this should never happen, since
5687 PyObject_Repr() and PyObject_Str() assure
5688 this */
5689 Py_DECREF(temp);
5690 PyErr_SetString(PyExc_TypeError,
5691 "%s argument has non-string str()");
5692 goto onError;
5693 }
Fred Drakee4315f52000-05-09 19:53:39 +00005694 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005696 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 "strict");
5698 Py_DECREF(temp);
5699 temp = unicode;
5700 if (temp == NULL)
5701 goto onError;
5702 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005703 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 len = PyUnicode_GET_SIZE(temp);
5705 if (prec >= 0 && len > prec)
5706 len = prec;
5707 break;
5708
5709 case 'i':
5710 case 'd':
5711 case 'u':
5712 case 'o':
5713 case 'x':
5714 case 'X':
5715 if (c == 'i')
5716 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005717 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005718 temp = formatlong(v, flags, prec, c);
5719 if (!temp)
5720 goto onError;
5721 pbuf = PyUnicode_AS_UNICODE(temp);
5722 len = PyUnicode_GET_SIZE(temp);
5723 /* unbounded ints can always produce
5724 a sign character! */
5725 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005727 else {
5728 pbuf = formatbuf;
5729 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5730 flags, prec, c, v);
5731 if (len < 0)
5732 goto onError;
5733 /* only d conversion is signed */
5734 sign = c == 'd';
5735 }
5736 if (flags & F_ZERO)
5737 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 break;
5739
5740 case 'e':
5741 case 'E':
5742 case 'f':
5743 case 'g':
5744 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005745 pbuf = formatbuf;
5746 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5747 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 if (len < 0)
5749 goto onError;
5750 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005751 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 fill = '0';
5753 break;
5754
5755 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005756 pbuf = formatbuf;
5757 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 if (len < 0)
5759 goto onError;
5760 break;
5761
5762 default:
5763 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005764 "unsupported format character '%c' (0x%x) "
5765 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005766 (31<=c && c<=126) ? c : '?',
5767 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 goto onError;
5769 }
5770 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005771 if (*pbuf == '-' || *pbuf == '+') {
5772 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 len--;
5774 }
5775 else if (flags & F_SIGN)
5776 sign = '+';
5777 else if (flags & F_BLANK)
5778 sign = ' ';
5779 else
5780 sign = 0;
5781 }
5782 if (width < len)
5783 width = len;
5784 if (rescnt < width + (sign != 0)) {
5785 reslen -= rescnt;
5786 rescnt = width + fmtcnt + 100;
5787 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005788 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 return NULL;
5790 res = PyUnicode_AS_UNICODE(result)
5791 + reslen - rescnt;
5792 }
5793 if (sign) {
5794 if (fill != ' ')
5795 *res++ = sign;
5796 rescnt--;
5797 if (width > len)
5798 width--;
5799 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005800 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5801 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005802 assert(pbuf[1] == c);
5803 if (fill != ' ') {
5804 *res++ = *pbuf++;
5805 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005806 }
Tim Petersfff53252001-04-12 18:38:48 +00005807 rescnt -= 2;
5808 width -= 2;
5809 if (width < 0)
5810 width = 0;
5811 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 if (width > len && !(flags & F_LJUST)) {
5814 do {
5815 --rescnt;
5816 *res++ = fill;
5817 } while (--width > len);
5818 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005819 if (fill == ' ') {
5820 if (sign)
5821 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005822 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005823 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005824 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005825 *res++ = *pbuf++;
5826 *res++ = *pbuf++;
5827 }
5828 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005829 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 res += len;
5831 rescnt -= len;
5832 while (--width >= len) {
5833 --rescnt;
5834 *res++ = ' ';
5835 }
5836 if (dict && (argidx < arglen) && c != '%') {
5837 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005838 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 goto onError;
5840 }
5841 Py_XDECREF(temp);
5842 } /* '%' */
5843 } /* until end */
5844 if (argidx < arglen && !dict) {
5845 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005846 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 goto onError;
5848 }
5849
5850 if (args_owned) {
5851 Py_DECREF(args);
5852 }
5853 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005854 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 return (PyObject *)result;
5857
5858 onError:
5859 Py_XDECREF(result);
5860 Py_DECREF(uformat);
5861 if (args_owned) {
5862 Py_DECREF(args);
5863 }
5864 return NULL;
5865}
5866
5867static PyBufferProcs unicode_as_buffer = {
5868 (getreadbufferproc) unicode_buffer_getreadbuf,
5869 (getwritebufferproc) unicode_buffer_getwritebuf,
5870 (getsegcountproc) unicode_buffer_getsegcount,
5871 (getcharbufferproc) unicode_buffer_getcharbuf,
5872};
5873
Jeremy Hylton938ace62002-07-17 16:30:39 +00005874static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00005875unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5876
Tim Peters6d6c1a32001-08-02 04:15:00 +00005877static PyObject *
5878unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5879{
5880 PyObject *x = NULL;
5881 static char *kwlist[] = {"string", "encoding", "errors", 0};
5882 char *encoding = NULL;
5883 char *errors = NULL;
5884
Guido van Rossume023fe02001-08-30 03:12:59 +00005885 if (type != &PyUnicode_Type)
5886 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005887 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5888 kwlist, &x, &encoding, &errors))
5889 return NULL;
5890 if (x == NULL)
5891 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005892 if (encoding == NULL && errors == NULL)
5893 return PyObject_Unicode(x);
5894 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005895 return PyUnicode_FromEncodedObject(x, encoding, errors);
5896}
5897
Guido van Rossume023fe02001-08-30 03:12:59 +00005898static PyObject *
5899unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5900{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005901 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005902 int n;
5903
5904 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5905 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5906 if (tmp == NULL)
5907 return NULL;
5908 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005909 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5910 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005911 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005912 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5913 if (pnew->str == NULL) {
5914 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005915 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005916 return NULL;
5917 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005918 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5919 pnew->length = n;
5920 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005921 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005922 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005923}
5924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005925PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00005926"unicode(string [, encoding[, errors]]) -> object\n\
5927\n\
5928Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00005929encoding defaults to the current default string encoding.\n\
5930errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00005931
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932PyTypeObject PyUnicode_Type = {
5933 PyObject_HEAD_INIT(&PyType_Type)
5934 0, /* ob_size */
5935 "unicode", /* tp_name */
5936 sizeof(PyUnicodeObject), /* tp_size */
5937 0, /* tp_itemsize */
5938 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005939 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005941 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 0, /* tp_setattr */
5943 (cmpfunc) unicode_compare, /* tp_compare */
5944 (reprfunc) unicode_repr, /* tp_repr */
5945 0, /* tp_as_number */
5946 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005947 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 (hashfunc) unicode_hash, /* tp_hash*/
5949 0, /* tp_call*/
5950 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005951 PyObject_GenericGetAttr, /* tp_getattro */
5952 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005954 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005955 unicode_doc, /* tp_doc */
5956 0, /* tp_traverse */
5957 0, /* tp_clear */
5958 0, /* tp_richcompare */
5959 0, /* tp_weaklistoffset */
5960 0, /* tp_iter */
5961 0, /* tp_iternext */
5962 unicode_methods, /* tp_methods */
5963 0, /* tp_members */
5964 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00005965 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005966 0, /* tp_dict */
5967 0, /* tp_descr_get */
5968 0, /* tp_descr_set */
5969 0, /* tp_dictoffset */
5970 0, /* tp_init */
5971 0, /* tp_alloc */
5972 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005973 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974};
5975
5976/* Initialize the Unicode implementation */
5977
Thomas Wouters78890102000-07-22 19:25:51 +00005978void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005980 int i;
5981
Fred Drakee4315f52000-05-09 19:53:39 +00005982 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005983 unicode_freelist = NULL;
5984 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005986 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005987 for (i = 0; i < 256; i++)
5988 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00005989 if (PyType_Ready(&PyUnicode_Type) < 0)
5990 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991}
5992
5993/* Finalize the Unicode implementation */
5994
5995void
Thomas Wouters78890102000-07-22 19:25:51 +00005996_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005998 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005999 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006001 Py_XDECREF(unicode_empty);
6002 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006003
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006004 for (i = 0; i < 256; i++) {
6005 if (unicode_latin1[i]) {
6006 Py_DECREF(unicode_latin1[i]);
6007 unicode_latin1[i] = NULL;
6008 }
6009 }
6010
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006011 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 PyUnicodeObject *v = u;
6013 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006014 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006015 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006016 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006017 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006019 unicode_freelist = NULL;
6020 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021}