blob: d6fd62af8040a2ee3bd8e005aa012ea1b2f357b1 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list whose length (counted in Py_UNICODE
   characters) is less than this limit. This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
/* Free list for Unicode objects.

   Freed objects are chained through their first pointer-sized word
   (see _PyUnicode_New() and unicode_dealloc()); unicode_freelist_size
   counts the entries and is capped at MAX_UNICODE_FREELIST_SIZE. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are created lazily by
   PyUnicode_FromUnicode() and indexed by the character's ordinal. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* True when character c has to be base64-encoded: everything above
   127, every class-1 character, whitespace when encodeWS is set and
   Set O characters when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a base64 digit.  isalnum() is only consulted for
   values in the 7-bit range: passing it arbitrary Py_UNICODE values is
   undefined, and above 127 its answer depends on the locale. */
#define B64CHAR(c) \
        ((((c) & ~0x7f) == 0 && isalnum((int)(c))) || \
         (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as the bit accumulator ch
   holds at least six pending bits; bits is updated accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit one character to out for every 16 pending bits in the
   accumulator ch.  Relies on the expanding function providing an
   `errmsg` variable and a `utf7Error` label.
   NOTE(review): the range tested below (0xDC00-0xDFFF) is the low
   half of the surrogate area although the comments speak of the high
   surrogate -- confirm against the intended behaviour. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
Tim Peters5de98422002-04-27 18:44:32 +0000930 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000931 return v;
932}
933
934#undef SPECIAL
935#undef B64
936#undef B64CHAR
937#undef UB64
938#undef ENCODE
939#undef DECODE
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941/* --- UTF-8 Codec -------------------------------------------------------- */
942
/* Lead-byte lookup for the UTF-8 decoder: entry [b] is the total number of
   bytes in a sequence whose first byte is b.  A zero entry marks a byte
   that is not a legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
964
965static
966int utf8_decoding_error(const char **source,
967 Py_UNICODE **dest,
968 const char *errors,
969 const char *details)
970{
971 if ((errors == NULL) ||
972 (strcmp(errors,"strict") == 0)) {
973 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000974 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975 details);
976 return -1;
977 }
978 else if (strcmp(errors,"ignore") == 0) {
979 (*source)++;
980 return 0;
981 }
982 else if (strcmp(errors,"replace") == 0) {
983 (*source)++;
984 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
985 (*dest)++;
986 return 0;
987 }
988 else {
989 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000990 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 errors);
992 return -1;
993 }
994}
995
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996PyObject *PyUnicode_DecodeUTF8(const char *s,
997 int size,
998 const char *errors)
999{
1000 int n;
1001 const char *e;
1002 PyUnicodeObject *unicode;
1003 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001004 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005
1006 /* Note: size will always be longer than the resulting Unicode
1007 character count */
1008 unicode = _PyUnicode_New(size);
1009 if (!unicode)
1010 return NULL;
1011 if (size == 0)
1012 return (PyObject *)unicode;
1013
1014 /* Unpack UTF-8 encoded data */
1015 p = unicode->str;
1016 e = s + size;
1017
1018 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001019 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020
1021 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 s++;
1024 continue;
1025 }
1026
1027 n = utf8_code_length[ch];
1028
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001029 if (s + n > e) {
1030 errmsg = "unexpected end of data";
1031 goto utf8Error;
1032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033
1034 switch (n) {
1035
1036 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001037 errmsg = "unexpected code byte";
1038 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039
1040 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001041 errmsg = "internal error";
1042 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001045 if ((s[1] & 0xc0) != 0x80) {
1046 errmsg = "invalid data";
1047 goto utf8Error;
1048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001050 if (ch < 0x80) {
1051 errmsg = "illegal encoding";
1052 goto utf8Error;
1053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 break;
1057
1058 case 3:
1059 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 (s[2] & 0xc0) != 0x80) {
1061 errmsg = "invalid data";
1062 goto utf8Error;
1063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001065 if (ch < 0x0800) {
1066 /* Note: UTF-8 encodings of surrogates are considered
1067 legal UTF-8 sequences;
1068
1069 XXX For wide builds (UCS-4) we should probably try
1070 to recombine the surrogates into a single code
1071 unit.
1072 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001073 errmsg = "illegal encoding";
1074 goto utf8Error;
1075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001077 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001078 break;
1079
1080 case 4:
1081 if ((s[1] & 0xc0) != 0x80 ||
1082 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001083 (s[3] & 0xc0) != 0x80) {
1084 errmsg = "invalid data";
1085 goto utf8Error;
1086 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001087 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1088 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1089 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001091 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001092 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001093 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001094 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001095 errmsg = "illegal encoding";
1096 goto utf8Error;
1097 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001098#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001099 *p++ = (Py_UNICODE)ch;
1100#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001101 /* compute and append the two surrogates: */
1102
1103 /* translate from 10000..10FFFF to 0..FFFF */
1104 ch -= 0x10000;
1105
1106 /* high surrogate = top 10 bits added to D800 */
1107 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1108
1109 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001110 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001111#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 break;
1113
1114 default:
1115 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001116 errmsg = "unsupported Unicode code range";
1117 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 }
1119 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001120 continue;
1121
1122 utf8Error:
1123 if (utf8_decoding_error(&s, &p, errors, errmsg))
1124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 }
1126
1127 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001128 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129 goto onError;
1130
1131 return (PyObject *)unicode;
1132
1133onError:
1134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
Tim Peters602f7402002-04-27 18:03:26 +00001138/* Allocation strategy: if the string is short, convert into a stack buffer
1139 and allocate exactly as much space needed at the end. Else allocate the
1140 maximum possible needed (4 result bytes per Unicode character), and return
1141 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001142*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001143PyObject *
1144PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1145 int size,
1146 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147{
Tim Peters602f7402002-04-27 18:03:26 +00001148#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001149
Tim Peters602f7402002-04-27 18:03:26 +00001150 int i; /* index into s of next input byte */
1151 PyObject *v; /* result string object */
1152 char *p; /* next free byte in output buffer */
1153 int nallocated; /* number of result bytes allocated */
1154 int nneeded; /* number of result bytes needed */
1155 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001156
Tim Peters602f7402002-04-27 18:03:26 +00001157 assert(s != NULL);
1158 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159
Tim Peters602f7402002-04-27 18:03:26 +00001160 if (size <= MAX_SHORT_UNICHARS) {
1161 /* Write into the stack buffer; nallocated can't overflow.
1162 * At the end, we'll allocate exactly as much heap space as it
1163 * turns out we need.
1164 */
1165 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1166 v = NULL; /* will allocate after we're done */
1167 p = stackbuf;
1168 }
1169 else {
1170 /* Overallocate on the heap, and give the excess back at the end. */
1171 nallocated = size * 4;
1172 if (nallocated / 4 != size) /* overflow! */
1173 return PyErr_NoMemory();
1174 v = PyString_FromStringAndSize(NULL, nallocated);
1175 if (v == NULL)
1176 return NULL;
1177 p = PyString_AS_STRING(v);
1178 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001179
Tim Peters602f7402002-04-27 18:03:26 +00001180 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001181 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001182
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001183 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001184 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001188 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001189 *p++ = (char)(0xc0 | (ch >> 6));
1190 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001192 else {
Tim Peters602f7402002-04-27 18:03:26 +00001193 /* Encode UCS2 Unicode ordinals */
1194 if (ch < 0x10000) {
1195 /* Special case: check for high surrogate */
1196 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1197 Py_UCS4 ch2 = s[i];
1198 /* Check for low surrogate and combine the two to
1199 form a UCS4 value */
1200 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001201 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001202 i++;
1203 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001204 }
Tim Peters602f7402002-04-27 18:03:26 +00001205 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001206 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001207 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001208 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1209 *p++ = (char)(0x80 | (ch & 0x3f));
1210 continue;
1211 }
1212encodeUCS4:
1213 /* Encode UCS4 Unicode ordinals */
1214 *p++ = (char)(0xf0 | (ch >> 18));
1215 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1217 *p++ = (char)(0x80 | (ch & 0x3f));
1218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001220
Tim Peters602f7402002-04-27 18:03:26 +00001221 if (v == NULL) {
1222 /* This was stack allocated. */
1223 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1224 assert(nneeded <= nallocated);
1225 v = PyString_FromStringAndSize(stackbuf, nneeded);
1226 }
1227 else {
1228 /* Cut back to size actually needed. */
1229 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1230 assert(nneeded <= nallocated);
1231 _PyString_Resize(&v, nneeded);
1232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001234
Tim Peters602f7402002-04-27 18:03:26 +00001235#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236}
1237
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1239{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (!PyUnicode_Check(unicode)) {
1241 PyErr_BadArgument();
1242 return NULL;
1243 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001244 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1245 PyUnicode_GET_SIZE(unicode),
1246 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247}
1248
1249/* --- UTF-16 Codec ------------------------------------------------------- */
1250
1251static
Tim Peters772747b2001-08-09 22:21:55 +00001252int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 const char *errors,
1254 const char *details)
1255{
1256 if ((errors == NULL) ||
1257 (strcmp(errors,"strict") == 0)) {
1258 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001259 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 details);
1261 return -1;
1262 }
1263 else if (strcmp(errors,"ignore") == 0) {
1264 return 0;
1265 }
1266 else if (strcmp(errors,"replace") == 0) {
1267 if (dest) {
1268 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1269 (*dest)++;
1270 }
1271 return 0;
1272 }
1273 else {
1274 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001275 "UTF-16 decoding error; "
1276 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 errors);
1278 return -1;
1279 }
1280}
1281
Tim Peters772747b2001-08-09 22:21:55 +00001282PyObject *
1283PyUnicode_DecodeUTF16(const char *s,
1284 int size,
1285 const char *errors,
1286 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 PyUnicodeObject *unicode;
1289 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001290 const unsigned char *q, *e;
1291 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001293 /* Offsets from q for retrieving byte pairs in the right order. */
1294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1295 int ihi = 1, ilo = 0;
1296#else
1297 int ihi = 0, ilo = 1;
1298#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299
1300 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001301 if (size & 1) {
1302 if (utf16_decoding_error(NULL, errors, "truncated data"))
1303 return NULL;
1304 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 }
1306
1307 /* Note: size will always be longer than the resulting Unicode
1308 character count */
1309 unicode = _PyUnicode_New(size);
1310 if (!unicode)
1311 return NULL;
1312 if (size == 0)
1313 return (PyObject *)unicode;
1314
1315 /* Unpack UTF-16 encoded data */
1316 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001317 q = (unsigned char *)s;
1318 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001321 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001323 /* Check for BOM marks (U+FEFF) in the input and adjust current
1324 byte order setting accordingly. In native mode, the leading BOM
1325 mark is skipped, in all other modes, it is copied to the output
1326 stream as-is (giving a ZWNBSP character). */
1327 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001328 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001330 if (bom == 0xFEFF) {
1331 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001332 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001333 }
1334 else if (bom == 0xFFFE) {
1335 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001336 bo = 1;
1337 }
1338#else
Tim Peters772747b2001-08-09 22:21:55 +00001339 if (bom == 0xFEFF) {
1340 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001341 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001342 }
1343 else if (bom == 0xFFFE) {
1344 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345 bo = -1;
1346 }
1347#endif
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349
Tim Peters772747b2001-08-09 22:21:55 +00001350 if (bo == -1) {
1351 /* force LE */
1352 ihi = 1;
1353 ilo = 0;
1354 }
1355 else if (bo == 1) {
1356 /* force BE */
1357 ihi = 0;
1358 ilo = 1;
1359 }
1360
1361 while (q < e) {
1362 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1363 q += 2;
1364
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (ch < 0xD800 || ch > 0xDFFF) {
1366 *p++ = ch;
1367 continue;
1368 }
1369
1370 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 if (q >= e) {
1372 errmsg = "unexpected end of data";
1373 goto utf16Error;
1374 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001375 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001376 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1377 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001378 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001379#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001380 *p++ = ch;
1381 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001382#else
1383 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001384#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001385 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001386 }
1387 else {
1388 errmsg = "illegal UTF-16 surrogate";
1389 goto utf16Error;
1390 }
1391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001393 errmsg = "illegal encoding";
1394 /* Fall through to report the error */
1395
1396 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001397 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 }
1400
1401 if (byteorder)
1402 *byteorder = bo;
1403
1404 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001405 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 goto onError;
1407
1408 return (PyObject *)unicode;
1409
1410onError:
1411 Py_DECREF(unicode);
1412 return NULL;
1413}
1414
Tim Peters772747b2001-08-09 22:21:55 +00001415PyObject *
1416PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1417 int size,
1418 const char *errors,
1419 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420{
1421 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001422 unsigned char *p;
1423 int i, pairs;
1424 /* Offsets from p for storing byte pairs in the right order. */
1425#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1426 int ihi = 1, ilo = 0;
1427#else
1428 int ihi = 0, ilo = 1;
1429#endif
1430
1431#define STORECHAR(CH) \
1432 do { \
1433 p[ihi] = ((CH) >> 8) & 0xff; \
1434 p[ilo] = (CH) & 0xff; \
1435 p += 2; \
1436 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001438 for (i = pairs = 0; i < size; i++)
1439 if (s[i] >= 0x10000)
1440 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001442 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 if (v == NULL)
1444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445
Tim Peters772747b2001-08-09 22:21:55 +00001446 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001448 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001449 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001450 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001451
1452 if (byteorder == -1) {
1453 /* force LE */
1454 ihi = 1;
1455 ilo = 0;
1456 }
1457 else if (byteorder == 1) {
1458 /* force BE */
1459 ihi = 0;
1460 ilo = 1;
1461 }
1462
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001463 while (size-- > 0) {
1464 Py_UNICODE ch = *s++;
1465 Py_UNICODE ch2 = 0;
1466 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001467 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1468 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
Tim Peters772747b2001-08-09 22:21:55 +00001470 STORECHAR(ch);
1471 if (ch2)
1472 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001475#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476}
1477
1478PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1479{
1480 if (!PyUnicode_Check(unicode)) {
1481 PyErr_BadArgument();
1482 return NULL;
1483 }
1484 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 NULL,
1487 0);
1488}
1489
1490/* --- Unicode Escape Codec ----------------------------------------------- */
1491
1492static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001493int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 const char *errors,
1495 const char *details)
1496{
1497 if ((errors == NULL) ||
1498 (strcmp(errors,"strict") == 0)) {
1499 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001500 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 details);
1502 return -1;
1503 }
1504 else if (strcmp(errors,"ignore") == 0) {
1505 return 0;
1506 }
1507 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001508 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1509 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 return 0;
1511 }
1512 else {
1513 PyErr_Format(PyExc_ValueError,
1514 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001515 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 errors);
1517 return -1;
1518 }
1519}
1520
Fredrik Lundh06d12682001-01-24 07:59:11 +00001521static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001522
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1524 int size,
1525 const char *errors)
1526{
1527 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001528 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001530 char* message;
1531 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1532
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 /* Escaped strings will always be longer than the resulting
1534 Unicode string, so we start with size here and then reduce the
1535 length after conversion to the true value. */
1536 v = _PyUnicode_New(size);
1537 if (v == NULL)
1538 goto onError;
1539 if (size == 0)
1540 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001541
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 p = buf = PyUnicode_AS_UNICODE(v);
1543 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 while (s < end) {
1546 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001547 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001548 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549
1550 /* Non-escape characters are interpreted as Unicode ordinals */
1551 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001552 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 continue;
1554 }
1555
1556 /* \ - Escapes */
1557 s++;
1558 switch (*s++) {
1559
1560 /* \x escapes */
1561 case '\n': break;
1562 case '\\': *p++ = '\\'; break;
1563 case '\'': *p++ = '\''; break;
1564 case '\"': *p++ = '\"'; break;
1565 case 'b': *p++ = '\b'; break;
1566 case 'f': *p++ = '\014'; break; /* FF */
1567 case 't': *p++ = '\t'; break;
1568 case 'n': *p++ = '\n'; break;
1569 case 'r': *p++ = '\r'; break;
1570 case 'v': *p++ = '\013'; break; /* VT */
1571 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1572
1573 /* \OOO (octal) escapes */
1574 case '0': case '1': case '2': case '3':
1575 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001576 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001578 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001580 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001582 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 break;
1584
Fredrik Lundhccc74732001-02-18 22:13:49 +00001585 /* hex escapes */
1586 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001588 digits = 2;
1589 message = "truncated \\xXX escape";
1590 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591
Fredrik Lundhccc74732001-02-18 22:13:49 +00001592 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001594 digits = 4;
1595 message = "truncated \\uXXXX escape";
1596 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597
Fredrik Lundhccc74732001-02-18 22:13:49 +00001598 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001599 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001600 digits = 8;
1601 message = "truncated \\UXXXXXXXX escape";
1602 hexescape:
1603 chr = 0;
1604 for (i = 0; i < digits; i++) {
1605 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001606 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001607 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001608 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001609 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001610 i++;
1611 break;
1612 }
1613 chr = (chr<<4) & ~0xF;
1614 if (c >= '0' && c <= '9')
1615 chr += c - '0';
1616 else if (c >= 'a' && c <= 'f')
1617 chr += 10 + c - 'a';
1618 else
1619 chr += 10 + c - 'A';
1620 }
1621 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001622 if (chr == 0xffffffff)
1623 /* _decoding_error will have already written into the
1624 target buffer. */
1625 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001627 /* when we get here, chr is a 32-bit unicode character */
1628 if (chr <= 0xffff)
1629 /* UCS-2 character */
1630 *p++ = (Py_UNICODE) chr;
1631 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001632 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001633 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001634#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635 *p++ = chr;
1636#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001637 chr -= 0x10000L;
1638 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001639 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001641 } else {
1642 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001643 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001644 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 )
1646 goto onError;
1647 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648 break;
1649
1650 /* \N{name} */
1651 case 'N':
1652 message = "malformed \\N character escape";
1653 if (ucnhash_CAPI == NULL) {
1654 /* load the unicode data module */
1655 PyObject *m, *v;
1656 m = PyImport_ImportModule("unicodedata");
1657 if (m == NULL)
1658 goto ucnhashError;
1659 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1660 Py_DECREF(m);
1661 if (v == NULL)
1662 goto ucnhashError;
1663 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1664 Py_DECREF(v);
1665 if (ucnhash_CAPI == NULL)
1666 goto ucnhashError;
1667 }
1668 if (*s == '{') {
1669 const char *start = s+1;
1670 /* look for the closing brace */
1671 while (*s != '}' && s < end)
1672 s++;
1673 if (s > start && s < end && *s == '}') {
1674 /* found a name. look it up in the unicode database */
1675 message = "unknown Unicode character name";
1676 s++;
1677 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1678 goto store;
1679 }
1680 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001681 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001683 break;
1684
1685 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001686 if (s > end) {
1687 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1688 goto onError;
1689 }
1690 else {
1691 *p++ = '\\';
1692 *p++ = (unsigned char)s[-1];
1693 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001694 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695 }
1696 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001697 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001700
Fredrik Lundhccc74732001-02-18 22:13:49 +00001701ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001702 PyErr_SetString(
1703 PyExc_UnicodeError,
1704 "\\N escapes not supported (can't load unicodedata module)"
1705 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001706 return NULL;
1707
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 Py_XDECREF(v);
1710 return NULL;
1711}
1712
1713/* Return a Unicode-Escape string version of the Unicode object.
1714
1715 If quotes is true, the string is enclosed in u"" or u'' quotes as
1716 appropriate.
1717
1718*/
1719
Barry Warsaw51ac5802000-03-20 16:36:48 +00001720static const Py_UNICODE *findchar(const Py_UNICODE *s,
1721 int size,
1722 Py_UNICODE ch);
1723
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724static
1725PyObject *unicodeescape_string(const Py_UNICODE *s,
1726 int size,
1727 int quotes)
1728{
1729 PyObject *repr;
1730 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001732 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
1734 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1735 if (repr == NULL)
1736 return NULL;
1737
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001738 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 *p++ = 'u';
1742 *p++ = (findchar(s, size, '\'') &&
1743 !findchar(s, size, '"')) ? '"' : '\'';
1744 }
1745 while (size-- > 0) {
1746 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001747
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001749 if (quotes &&
1750 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 *p++ = '\\';
1752 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001753 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001755
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001756#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001757 /* Map 21-bit characters to '\U00xxxxxx' */
1758 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001759 int offset = p - PyString_AS_STRING(repr);
1760
1761 /* Resize the string if necessary */
1762 if (offset + 12 > PyString_GET_SIZE(repr)) {
1763 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001764 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001765 p = PyString_AS_STRING(repr) + offset;
1766 }
1767
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001768 *p++ = '\\';
1769 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001770 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1771 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1772 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1773 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1774 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001777 *p++ = hexdigit[ch & 0x0000000F];
1778 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001779 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001780#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001781 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1782 else if (ch >= 0xD800 && ch < 0xDC00) {
1783 Py_UNICODE ch2;
1784 Py_UCS4 ucs;
1785
1786 ch2 = *s++;
1787 size--;
1788 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1789 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1790 *p++ = '\\';
1791 *p++ = 'U';
1792 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1793 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1794 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1795 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1796 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1799 *p++ = hexdigit[ucs & 0x0000000F];
1800 continue;
1801 }
1802 /* Fall through: isolated surrogates are copied as-is */
1803 s--;
1804 size++;
1805 }
1806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001808 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 *p++ = '\\';
1810 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001811 *p++ = hexdigit[(ch >> 12) & 0x000F];
1812 *p++ = hexdigit[(ch >> 8) & 0x000F];
1813 *p++ = hexdigit[(ch >> 4) & 0x000F];
1814 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001816
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001817 /* Map special whitespace to '\t', \n', '\r' */
1818 else if (ch == '\t') {
1819 *p++ = '\\';
1820 *p++ = 't';
1821 }
1822 else if (ch == '\n') {
1823 *p++ = '\\';
1824 *p++ = 'n';
1825 }
1826 else if (ch == '\r') {
1827 *p++ = '\\';
1828 *p++ = 'r';
1829 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001830
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001831 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001832 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001834 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001835 *p++ = hexdigit[(ch >> 4) & 0x000F];
1836 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001838
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 /* Copy everything else as-is */
1840 else
1841 *p++ = (char) ch;
1842 }
1843 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001844 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845
1846 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001847 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 return repr;
1849}
1850
1851PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1852 int size)
1853{
1854 return unicodeescape_string(s, size, 0);
1855}
1856
1857PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1858{
1859 if (!PyUnicode_Check(unicode)) {
1860 PyErr_BadArgument();
1861 return NULL;
1862 }
1863 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1864 PyUnicode_GET_SIZE(unicode));
1865}
1866
1867/* --- Raw Unicode Escape Codec ------------------------------------------- */
1868
1869PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1870 int size,
1871 const char *errors)
1872{
1873 PyUnicodeObject *v;
1874 Py_UNICODE *p, *buf;
1875 const char *end;
1876 const char *bs;
1877
1878 /* Escaped strings will always be longer than the resulting
1879 Unicode string, so we start with size here and then reduce the
1880 length after conversion to the true value. */
1881 v = _PyUnicode_New(size);
1882 if (v == NULL)
1883 goto onError;
1884 if (size == 0)
1885 return (PyObject *)v;
1886 p = buf = PyUnicode_AS_UNICODE(v);
1887 end = s + size;
1888 while (s < end) {
1889 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001890 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 int i;
1892
1893 /* Non-escape characters are interpreted as Unicode ordinals */
1894 if (*s != '\\') {
1895 *p++ = (unsigned char)*s++;
1896 continue;
1897 }
1898
1899 /* \u-escapes are only interpreted iff the number of leading
1900 backslashes if odd */
1901 bs = s;
1902 for (;s < end;) {
1903 if (*s != '\\')
1904 break;
1905 *p++ = (unsigned char)*s++;
1906 }
1907 if (((s - bs) & 1) == 0 ||
1908 s >= end ||
1909 *s != 'u') {
1910 continue;
1911 }
1912 p--;
1913 s++;
1914
1915 /* \uXXXX with 4 hex digits */
1916 for (x = 0, i = 0; i < 4; i++) {
1917 c = (unsigned char)s[i];
1918 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001919 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 "truncated \\uXXXX"))
1921 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001922 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 i++;
1924 break;
1925 }
1926 x = (x<<4) & ~0xF;
1927 if (c >= '0' && c <= '9')
1928 x += c - '0';
1929 else if (c >= 'a' && c <= 'f')
1930 x += 10 + c - 'a';
1931 else
1932 x += 10 + c - 'A';
1933 }
1934 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001935 if (x != 0xffffffff)
1936 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001938 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return (PyObject *)v;
1941
1942 onError:
1943 Py_XDECREF(v);
1944 return NULL;
1945}
1946
1947PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1948 int size)
1949{
1950 PyObject *repr;
1951 char *p;
1952 char *q;
1953
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001954 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955
1956 repr = PyString_FromStringAndSize(NULL, 6 * size);
1957 if (repr == NULL)
1958 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001959 if (size == 0)
1960 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
1962 p = q = PyString_AS_STRING(repr);
1963 while (size-- > 0) {
1964 Py_UNICODE ch = *s++;
1965 /* Map 16-bit characters to '\uxxxx' */
1966 if (ch >= 256) {
1967 *p++ = '\\';
1968 *p++ = 'u';
1969 *p++ = hexdigit[(ch >> 12) & 0xf];
1970 *p++ = hexdigit[(ch >> 8) & 0xf];
1971 *p++ = hexdigit[(ch >> 4) & 0xf];
1972 *p++ = hexdigit[ch & 15];
1973 }
1974 /* Copy everything else as-is */
1975 else
1976 *p++ = (char) ch;
1977 }
1978 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001979 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 return repr;
1981}
1982
1983PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1984{
1985 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument();
1987 return NULL;
1988 }
1989 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1990 PyUnicode_GET_SIZE(unicode));
1991}
1992
1993/* --- Latin-1 Codec ------------------------------------------------------ */
1994
1995PyObject *PyUnicode_DecodeLatin1(const char *s,
1996 int size,
1997 const char *errors)
1998{
1999 PyUnicodeObject *v;
2000 Py_UNICODE *p;
2001
2002 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002003 if (size == 1 && *(unsigned char*)s < 256) {
2004 Py_UNICODE r = *(unsigned char*)s;
2005 return PyUnicode_FromUnicode(&r, 1);
2006 }
2007
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 v = _PyUnicode_New(size);
2009 if (v == NULL)
2010 goto onError;
2011 if (size == 0)
2012 return (PyObject *)v;
2013 p = PyUnicode_AS_UNICODE(v);
2014 while (size-- > 0)
2015 *p++ = (unsigned char)*s++;
2016 return (PyObject *)v;
2017
2018 onError:
2019 Py_XDECREF(v);
2020 return NULL;
2021}
2022
2023static
2024int latin1_encoding_error(const Py_UNICODE **source,
2025 char **dest,
2026 const char *errors,
2027 const char *details)
2028{
2029 if ((errors == NULL) ||
2030 (strcmp(errors,"strict") == 0)) {
2031 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002032 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 details);
2034 return -1;
2035 }
2036 else if (strcmp(errors,"ignore") == 0) {
2037 return 0;
2038 }
2039 else if (strcmp(errors,"replace") == 0) {
2040 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002041 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 return 0;
2043 }
2044 else {
2045 PyErr_Format(PyExc_ValueError,
2046 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002047 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 errors);
2049 return -1;
2050 }
2051}
2052
2053PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2054 int size,
2055 const char *errors)
2056{
2057 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002058 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002059
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 repr = PyString_FromStringAndSize(NULL, size);
2061 if (repr == NULL)
2062 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002063 if (size == 0)
2064 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065
2066 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002067 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 while (size-- > 0) {
2069 Py_UNICODE ch = *p++;
2070 if (ch >= 256) {
2071 if (latin1_encoding_error(&p, &s, errors,
2072 "ordinal not in range(256)"))
2073 goto onError;
2074 }
2075 else
2076 *s++ = (char)ch;
2077 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002078 /* Resize if error handling skipped some characters */
2079 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002080 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return repr;
2082
2083 onError:
2084 Py_DECREF(repr);
2085 return NULL;
2086}
2087
2088PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2089{
2090 if (!PyUnicode_Check(unicode)) {
2091 PyErr_BadArgument();
2092 return NULL;
2093 }
2094 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2095 PyUnicode_GET_SIZE(unicode),
2096 NULL);
2097}
2098
2099/* --- 7-bit ASCII Codec -------------------------------------------------- */
2100
2101static
2102int ascii_decoding_error(const char **source,
2103 Py_UNICODE **dest,
2104 const char *errors,
2105 const char *details)
2106{
2107 if ((errors == NULL) ||
2108 (strcmp(errors,"strict") == 0)) {
2109 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002110 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 details);
2112 return -1;
2113 }
2114 else if (strcmp(errors,"ignore") == 0) {
2115 return 0;
2116 }
2117 else if (strcmp(errors,"replace") == 0) {
2118 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2119 (*dest)++;
2120 return 0;
2121 }
2122 else {
2123 PyErr_Format(PyExc_ValueError,
2124 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 errors);
2127 return -1;
2128 }
2129}
2130
2131PyObject *PyUnicode_DecodeASCII(const char *s,
2132 int size,
2133 const char *errors)
2134{
2135 PyUnicodeObject *v;
2136 Py_UNICODE *p;
2137
2138 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002139 if (size == 1 && *(unsigned char*)s < 128) {
2140 Py_UNICODE r = *(unsigned char*)s;
2141 return PyUnicode_FromUnicode(&r, 1);
2142 }
2143
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 v = _PyUnicode_New(size);
2145 if (v == NULL)
2146 goto onError;
2147 if (size == 0)
2148 return (PyObject *)v;
2149 p = PyUnicode_AS_UNICODE(v);
2150 while (size-- > 0) {
2151 register unsigned char c;
2152
2153 c = (unsigned char)*s++;
2154 if (c < 128)
2155 *p++ = c;
2156 else if (ascii_decoding_error(&s, &p, errors,
2157 "ordinal not in range(128)"))
2158 goto onError;
2159 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002160 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002161 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002162 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return (PyObject *)v;
2164
2165 onError:
2166 Py_XDECREF(v);
2167 return NULL;
2168}
2169
2170static
2171int ascii_encoding_error(const Py_UNICODE **source,
2172 char **dest,
2173 const char *errors,
2174 const char *details)
2175{
2176 if ((errors == NULL) ||
2177 (strcmp(errors,"strict") == 0)) {
2178 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 details);
2181 return -1;
2182 }
2183 else if (strcmp(errors,"ignore") == 0) {
2184 return 0;
2185 }
2186 else if (strcmp(errors,"replace") == 0) {
2187 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002188 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 return 0;
2190 }
2191 else {
2192 PyErr_Format(PyExc_ValueError,
2193 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002194 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 errors);
2196 return -1;
2197 }
2198}
2199
2200PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2201 int size,
2202 const char *errors)
2203{
2204 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002205 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002206
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 repr = PyString_FromStringAndSize(NULL, size);
2208 if (repr == NULL)
2209 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002210 if (size == 0)
2211 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
2213 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002214 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 while (size-- > 0) {
2216 Py_UNICODE ch = *p++;
2217 if (ch >= 128) {
2218 if (ascii_encoding_error(&p, &s, errors,
2219 "ordinal not in range(128)"))
2220 goto onError;
2221 }
2222 else
2223 *s++ = (char)ch;
2224 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002225 /* Resize if error handling skipped some characters */
2226 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002227 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return repr;
2229
2230 onError:
2231 Py_DECREF(repr);
2232 return NULL;
2233}
2234
2235PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2236{
2237 if (!PyUnicode_Check(unicode)) {
2238 PyErr_BadArgument();
2239 return NULL;
2240 }
2241 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2242 PyUnicode_GET_SIZE(unicode),
2243 NULL);
2244}
2245
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002246#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002247
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002248/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002249
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002250PyObject *PyUnicode_DecodeMBCS(const char *s,
2251 int size,
2252 const char *errors)
2253{
2254 PyUnicodeObject *v;
2255 Py_UNICODE *p;
2256
2257 /* First get the size of the result */
2258 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002259 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002260 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2261
2262 v = _PyUnicode_New(usize);
2263 if (v == NULL)
2264 return NULL;
2265 if (usize == 0)
2266 return (PyObject *)v;
2267 p = PyUnicode_AS_UNICODE(v);
2268 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2269 Py_DECREF(v);
2270 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2271 }
2272
2273 return (PyObject *)v;
2274}
2275
2276PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2277 int size,
2278 const char *errors)
2279{
2280 PyObject *repr;
2281 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002282 DWORD mbcssize;
2283
2284 /* If there are no characters, bail now! */
2285 if (size==0)
2286 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002287
2288 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002289 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002290 if (mbcssize==0)
2291 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2292
2293 repr = PyString_FromStringAndSize(NULL, mbcssize);
2294 if (repr == NULL)
2295 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002296 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002297 return repr;
2298
2299 /* Do the conversion */
2300 s = PyString_AS_STRING(repr);
2301 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2302 Py_DECREF(repr);
2303 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2304 }
2305 return repr;
2306}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002307
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002308#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002309
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310/* --- Character Mapping Codec -------------------------------------------- */
2311
2312static
2313int charmap_decoding_error(const char **source,
2314 Py_UNICODE **dest,
2315 const char *errors,
2316 const char *details)
2317{
2318 if ((errors == NULL) ||
2319 (strcmp(errors,"strict") == 0)) {
2320 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002321 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 details);
2323 return -1;
2324 }
2325 else if (strcmp(errors,"ignore") == 0) {
2326 return 0;
2327 }
2328 else if (strcmp(errors,"replace") == 0) {
2329 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2330 (*dest)++;
2331 return 0;
2332 }
2333 else {
2334 PyErr_Format(PyExc_ValueError,
2335 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002336 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 errors);
2338 return -1;
2339 }
2340}
2341
2342PyObject *PyUnicode_DecodeCharmap(const char *s,
2343 int size,
2344 PyObject *mapping,
2345 const char *errors)
2346{
2347 PyUnicodeObject *v;
2348 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002349 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350
2351 /* Default to Latin-1 */
2352 if (mapping == NULL)
2353 return PyUnicode_DecodeLatin1(s, size, errors);
2354
2355 v = _PyUnicode_New(size);
2356 if (v == NULL)
2357 goto onError;
2358 if (size == 0)
2359 return (PyObject *)v;
2360 p = PyUnicode_AS_UNICODE(v);
2361 while (size-- > 0) {
2362 unsigned char ch = *s++;
2363 PyObject *w, *x;
2364
2365 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2366 w = PyInt_FromLong((long)ch);
2367 if (w == NULL)
2368 goto onError;
2369 x = PyObject_GetItem(mapping, w);
2370 Py_DECREF(w);
2371 if (x == NULL) {
2372 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002373 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002375 x = Py_None;
2376 Py_INCREF(x);
2377 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 }
2380
2381 /* Apply mapping */
2382 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002383 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 if (value < 0 || value > 65535) {
2385 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002386 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 Py_DECREF(x);
2388 goto onError;
2389 }
2390 *p++ = (Py_UNICODE)value;
2391 }
2392 else if (x == Py_None) {
2393 /* undefined mapping */
2394 if (charmap_decoding_error(&s, &p, errors,
2395 "character maps to <undefined>")) {
2396 Py_DECREF(x);
2397 goto onError;
2398 }
2399 }
2400 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002401 int targetsize = PyUnicode_GET_SIZE(x);
2402
2403 if (targetsize == 1)
2404 /* 1-1 mapping */
2405 *p++ = *PyUnicode_AS_UNICODE(x);
2406
2407 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002409 if (targetsize > extrachars) {
2410 /* resize first */
2411 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2412 int needed = (targetsize - extrachars) + \
2413 (targetsize << 2);
2414 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002415 if (_PyUnicode_Resize(&v,
2416 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002417 Py_DECREF(x);
2418 goto onError;
2419 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002420 p = PyUnicode_AS_UNICODE(v) + oldpos;
2421 }
2422 Py_UNICODE_COPY(p,
2423 PyUnicode_AS_UNICODE(x),
2424 targetsize);
2425 p += targetsize;
2426 extrachars -= targetsize;
2427 }
2428 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 }
2430 else {
2431 /* wrong return value */
2432 PyErr_SetString(PyExc_TypeError,
2433 "character mapping must return integer, None or unicode");
2434 Py_DECREF(x);
2435 goto onError;
2436 }
2437 Py_DECREF(x);
2438 }
2439 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002440 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 goto onError;
2442 return (PyObject *)v;
2443
2444 onError:
2445 Py_XDECREF(v);
2446 return NULL;
2447}
2448
2449static
2450int charmap_encoding_error(const Py_UNICODE **source,
2451 char **dest,
2452 const char *errors,
2453 const char *details)
2454{
2455 if ((errors == NULL) ||
2456 (strcmp(errors,"strict") == 0)) {
2457 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002458 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 details);
2460 return -1;
2461 }
2462 else if (strcmp(errors,"ignore") == 0) {
2463 return 0;
2464 }
2465 else if (strcmp(errors,"replace") == 0) {
2466 **dest = '?';
2467 (*dest)++;
2468 return 0;
2469 }
2470 else {
2471 PyErr_Format(PyExc_ValueError,
2472 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002473 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 errors);
2475 return -1;
2476 }
2477}
2478
2479PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2480 int size,
2481 PyObject *mapping,
2482 const char *errors)
2483{
2484 PyObject *v;
2485 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002486 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
2488 /* Default to Latin-1 */
2489 if (mapping == NULL)
2490 return PyUnicode_EncodeLatin1(p, size, errors);
2491
2492 v = PyString_FromStringAndSize(NULL, size);
2493 if (v == NULL)
2494 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002495 if (size == 0)
2496 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 s = PyString_AS_STRING(v);
2498 while (size-- > 0) {
2499 Py_UNICODE ch = *p++;
2500 PyObject *w, *x;
2501
2502 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2503 w = PyInt_FromLong((long)ch);
2504 if (w == NULL)
2505 goto onError;
2506 x = PyObject_GetItem(mapping, w);
2507 Py_DECREF(w);
2508 if (x == NULL) {
2509 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002510 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002512 x = Py_None;
2513 Py_INCREF(x);
2514 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 }
2517
2518 /* Apply mapping */
2519 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002520 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 if (value < 0 || value > 255) {
2522 PyErr_SetString(PyExc_TypeError,
2523 "character mapping must be in range(256)");
2524 Py_DECREF(x);
2525 goto onError;
2526 }
2527 *s++ = (char)value;
2528 }
2529 else if (x == Py_None) {
2530 /* undefined mapping */
2531 if (charmap_encoding_error(&p, &s, errors,
2532 "character maps to <undefined>")) {
2533 Py_DECREF(x);
2534 goto onError;
2535 }
2536 }
2537 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002538 int targetsize = PyString_GET_SIZE(x);
2539
2540 if (targetsize == 1)
2541 /* 1-1 mapping */
2542 *s++ = *PyString_AS_STRING(x);
2543
2544 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002546 if (targetsize > extrachars) {
2547 /* resize first */
2548 int oldpos = (int)(s - PyString_AS_STRING(v));
2549 int needed = (targetsize - extrachars) + \
2550 (targetsize << 2);
2551 extrachars += needed;
2552 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002553 Py_DECREF(x);
2554 goto onError;
2555 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002556 s = PyString_AS_STRING(v) + oldpos;
2557 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002558 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002559 s += targetsize;
2560 extrachars -= targetsize;
2561 }
2562 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
2564 else {
2565 /* wrong return value */
2566 PyErr_SetString(PyExc_TypeError,
2567 "character mapping must return integer, None or unicode");
2568 Py_DECREF(x);
2569 goto onError;
2570 }
2571 Py_DECREF(x);
2572 }
2573 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
Tim Peters5de98422002-04-27 18:44:32 +00002574 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 return v;
2576
2577 onError:
Tim Peters5de98422002-04-27 18:44:32 +00002578 Py_XDECREF(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return NULL;
2580}
2581
2582PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2583 PyObject *mapping)
2584{
2585 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2586 PyErr_BadArgument();
2587 return NULL;
2588 }
2589 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2590 PyUnicode_GET_SIZE(unicode),
2591 mapping,
2592 NULL);
2593}
2594
2595static
2596int translate_error(const Py_UNICODE **source,
2597 Py_UNICODE **dest,
2598 const char *errors,
2599 const char *details)
2600{
2601 if ((errors == NULL) ||
2602 (strcmp(errors,"strict") == 0)) {
2603 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002604 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 details);
2606 return -1;
2607 }
2608 else if (strcmp(errors,"ignore") == 0) {
2609 return 0;
2610 }
2611 else if (strcmp(errors,"replace") == 0) {
2612 **dest = '?';
2613 (*dest)++;
2614 return 0;
2615 }
2616 else {
2617 PyErr_Format(PyExc_ValueError,
2618 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002619 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 errors);
2621 return -1;
2622 }
2623}
2624
2625PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2626 int size,
2627 PyObject *mapping,
2628 const char *errors)
2629{
2630 PyUnicodeObject *v;
2631 Py_UNICODE *p;
2632
2633 if (mapping == NULL) {
2634 PyErr_BadArgument();
2635 return NULL;
2636 }
2637
2638 /* Output will never be longer than input */
2639 v = _PyUnicode_New(size);
2640 if (v == NULL)
2641 goto onError;
2642 if (size == 0)
2643 goto done;
2644 p = PyUnicode_AS_UNICODE(v);
2645 while (size-- > 0) {
2646 Py_UNICODE ch = *s++;
2647 PyObject *w, *x;
2648
2649 /* Get mapping */
2650 w = PyInt_FromLong(ch);
2651 if (w == NULL)
2652 goto onError;
2653 x = PyObject_GetItem(mapping, w);
2654 Py_DECREF(w);
2655 if (x == NULL) {
2656 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2657 /* No mapping found: default to 1-1 mapping */
2658 PyErr_Clear();
2659 *p++ = ch;
2660 continue;
2661 }
2662 goto onError;
2663 }
2664
2665 /* Apply mapping */
2666 if (PyInt_Check(x))
2667 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2668 else if (x == Py_None) {
2669 /* undefined mapping */
2670 if (translate_error(&s, &p, errors,
2671 "character maps to <undefined>")) {
2672 Py_DECREF(x);
2673 goto onError;
2674 }
2675 }
2676 else if (PyUnicode_Check(x)) {
2677 if (PyUnicode_GET_SIZE(x) != 1) {
2678 /* 1-n mapping */
2679 PyErr_SetString(PyExc_NotImplementedError,
2680 "1-n mappings are currently not implemented");
2681 Py_DECREF(x);
2682 goto onError;
2683 }
2684 *p++ = *PyUnicode_AS_UNICODE(x);
2685 }
2686 else {
2687 /* wrong return value */
2688 PyErr_SetString(PyExc_TypeError,
2689 "translate mapping must return integer, None or unicode");
2690 Py_DECREF(x);
2691 goto onError;
2692 }
2693 Py_DECREF(x);
2694 }
2695 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002696 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 done:
2700 return (PyObject *)v;
2701
2702 onError:
2703 Py_XDECREF(v);
2704 return NULL;
2705}
2706
2707PyObject *PyUnicode_Translate(PyObject *str,
2708 PyObject *mapping,
2709 const char *errors)
2710{
2711 PyObject *result;
2712
2713 str = PyUnicode_FromObject(str);
2714 if (str == NULL)
2715 goto onError;
2716 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2717 PyUnicode_GET_SIZE(str),
2718 mapping,
2719 errors);
2720 Py_DECREF(str);
2721 return result;
2722
2723 onError:
2724 Py_XDECREF(str);
2725 return NULL;
2726}
2727
Guido van Rossum9e896b32000-04-05 20:11:21 +00002728/* --- Decimal Encoder ---------------------------------------------------- */
2729
2730int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2731 int length,
2732 char *output,
2733 const char *errors)
2734{
2735 Py_UNICODE *p, *end;
2736
2737 if (output == NULL) {
2738 PyErr_BadArgument();
2739 return -1;
2740 }
2741
2742 p = s;
2743 end = s + length;
2744 while (p < end) {
2745 register Py_UNICODE ch = *p++;
2746 int decimal;
2747
2748 if (Py_UNICODE_ISSPACE(ch)) {
2749 *output++ = ' ';
2750 continue;
2751 }
2752 decimal = Py_UNICODE_TODECIMAL(ch);
2753 if (decimal >= 0) {
2754 *output++ = '0' + decimal;
2755 continue;
2756 }
Guido van Rossumba477042000-04-06 18:18:10 +00002757 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002758 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002759 continue;
2760 }
2761 /* All other characters are considered invalid */
2762 if (errors == NULL || strcmp(errors, "strict") == 0) {
2763 PyErr_SetString(PyExc_ValueError,
2764 "invalid decimal Unicode string");
2765 goto onError;
2766 }
2767 else if (strcmp(errors, "ignore") == 0)
2768 continue;
2769 else if (strcmp(errors, "replace") == 0) {
2770 *output++ = '?';
2771 continue;
2772 }
2773 }
2774 /* 0-terminate the output string */
2775 *output++ = '\0';
2776 return 0;
2777
2778 onError:
2779 return -1;
2780}
2781
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782/* --- Helpers ------------------------------------------------------------ */
2783
2784static
2785int count(PyUnicodeObject *self,
2786 int start,
2787 int end,
2788 PyUnicodeObject *substring)
2789{
2790 int count = 0;
2791
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002792 if (start < 0)
2793 start += self->length;
2794 if (start < 0)
2795 start = 0;
2796 if (end > self->length)
2797 end = self->length;
2798 if (end < 0)
2799 end += self->length;
2800 if (end < 0)
2801 end = 0;
2802
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002803 if (substring->length == 0)
2804 return (end - start + 1);
2805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 end -= substring->length;
2807
2808 while (start <= end)
2809 if (Py_UNICODE_MATCH(self, start, substring)) {
2810 count++;
2811 start += substring->length;
2812 } else
2813 start++;
2814
2815 return count;
2816}
2817
2818int PyUnicode_Count(PyObject *str,
2819 PyObject *substr,
2820 int start,
2821 int end)
2822{
2823 int result;
2824
2825 str = PyUnicode_FromObject(str);
2826 if (str == NULL)
2827 return -1;
2828 substr = PyUnicode_FromObject(substr);
2829 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002830 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 return -1;
2832 }
2833
2834 result = count((PyUnicodeObject *)str,
2835 start, end,
2836 (PyUnicodeObject *)substr);
2837
2838 Py_DECREF(str);
2839 Py_DECREF(substr);
2840 return result;
2841}
2842
2843static
2844int findstring(PyUnicodeObject *self,
2845 PyUnicodeObject *substring,
2846 int start,
2847 int end,
2848 int direction)
2849{
2850 if (start < 0)
2851 start += self->length;
2852 if (start < 0)
2853 start = 0;
2854
2855 if (substring->length == 0)
2856 return start;
2857
2858 if (end > self->length)
2859 end = self->length;
2860 if (end < 0)
2861 end += self->length;
2862 if (end < 0)
2863 end = 0;
2864
2865 end -= substring->length;
2866
2867 if (direction < 0) {
2868 for (; end >= start; end--)
2869 if (Py_UNICODE_MATCH(self, end, substring))
2870 return end;
2871 } else {
2872 for (; start <= end; start++)
2873 if (Py_UNICODE_MATCH(self, start, substring))
2874 return start;
2875 }
2876
2877 return -1;
2878}
2879
2880int PyUnicode_Find(PyObject *str,
2881 PyObject *substr,
2882 int start,
2883 int end,
2884 int direction)
2885{
2886 int result;
2887
2888 str = PyUnicode_FromObject(str);
2889 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002890 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 substr = PyUnicode_FromObject(substr);
2892 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00002893 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002894 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 }
2896
2897 result = findstring((PyUnicodeObject *)str,
2898 (PyUnicodeObject *)substr,
2899 start, end, direction);
2900 Py_DECREF(str);
2901 Py_DECREF(substr);
2902 return result;
2903}
2904
2905static
2906int tailmatch(PyUnicodeObject *self,
2907 PyUnicodeObject *substring,
2908 int start,
2909 int end,
2910 int direction)
2911{
2912 if (start < 0)
2913 start += self->length;
2914 if (start < 0)
2915 start = 0;
2916
2917 if (substring->length == 0)
2918 return 1;
2919
2920 if (end > self->length)
2921 end = self->length;
2922 if (end < 0)
2923 end += self->length;
2924 if (end < 0)
2925 end = 0;
2926
2927 end -= substring->length;
2928 if (end < start)
2929 return 0;
2930
2931 if (direction > 0) {
2932 if (Py_UNICODE_MATCH(self, end, substring))
2933 return 1;
2934 } else {
2935 if (Py_UNICODE_MATCH(self, start, substring))
2936 return 1;
2937 }
2938
2939 return 0;
2940}
2941
2942int PyUnicode_Tailmatch(PyObject *str,
2943 PyObject *substr,
2944 int start,
2945 int end,
2946 int direction)
2947{
2948 int result;
2949
2950 str = PyUnicode_FromObject(str);
2951 if (str == NULL)
2952 return -1;
2953 substr = PyUnicode_FromObject(substr);
2954 if (substr == NULL) {
2955 Py_DECREF(substr);
2956 return -1;
2957 }
2958
2959 result = tailmatch((PyUnicodeObject *)str,
2960 (PyUnicodeObject *)substr,
2961 start, end, direction);
2962 Py_DECREF(str);
2963 Py_DECREF(substr);
2964 return result;
2965}
2966
2967static
2968const Py_UNICODE *findchar(const Py_UNICODE *s,
2969 int size,
2970 Py_UNICODE ch)
2971{
2972 /* like wcschr, but doesn't stop at NULL characters */
2973
2974 while (size-- > 0) {
2975 if (*s == ch)
2976 return s;
2977 s++;
2978 }
2979
2980 return NULL;
2981}
2982
2983/* Apply fixfct filter to the Unicode object self and return a
2984 reference to the modified object */
2985
2986static
2987PyObject *fixup(PyUnicodeObject *self,
2988 int (*fixfct)(PyUnicodeObject *s))
2989{
2990
2991 PyUnicodeObject *u;
2992
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 if (u == NULL)
2995 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002996
2997 Py_UNICODE_COPY(u->str, self->str, self->length);
2998
Tim Peters7a29bd52001-09-12 03:03:31 +00002999 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 /* fixfct should return TRUE if it modified the buffer. If
3001 FALSE, return a reference to the original buffer instead
3002 (to save space, not time) */
3003 Py_INCREF(self);
3004 Py_DECREF(u);
3005 return (PyObject*) self;
3006 }
3007 return (PyObject*) u;
3008}
3009
3010static
3011int fixupper(PyUnicodeObject *self)
3012{
3013 int len = self->length;
3014 Py_UNICODE *s = self->str;
3015 int status = 0;
3016
3017 while (len-- > 0) {
3018 register Py_UNICODE ch;
3019
3020 ch = Py_UNICODE_TOUPPER(*s);
3021 if (ch != *s) {
3022 status = 1;
3023 *s = ch;
3024 }
3025 s++;
3026 }
3027
3028 return status;
3029}
3030
3031static
3032int fixlower(PyUnicodeObject *self)
3033{
3034 int len = self->length;
3035 Py_UNICODE *s = self->str;
3036 int status = 0;
3037
3038 while (len-- > 0) {
3039 register Py_UNICODE ch;
3040
3041 ch = Py_UNICODE_TOLOWER(*s);
3042 if (ch != *s) {
3043 status = 1;
3044 *s = ch;
3045 }
3046 s++;
3047 }
3048
3049 return status;
3050}
3051
3052static
3053int fixswapcase(PyUnicodeObject *self)
3054{
3055 int len = self->length;
3056 Py_UNICODE *s = self->str;
3057 int status = 0;
3058
3059 while (len-- > 0) {
3060 if (Py_UNICODE_ISUPPER(*s)) {
3061 *s = Py_UNICODE_TOLOWER(*s);
3062 status = 1;
3063 } else if (Py_UNICODE_ISLOWER(*s)) {
3064 *s = Py_UNICODE_TOUPPER(*s);
3065 status = 1;
3066 }
3067 s++;
3068 }
3069
3070 return status;
3071}
3072
3073static
3074int fixcapitalize(PyUnicodeObject *self)
3075{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003076 int len = self->length;
3077 Py_UNICODE *s = self->str;
3078 int status = 0;
3079
3080 if (len == 0)
3081 return 0;
3082 if (Py_UNICODE_ISLOWER(*s)) {
3083 *s = Py_UNICODE_TOUPPER(*s);
3084 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003086 s++;
3087 while (--len > 0) {
3088 if (Py_UNICODE_ISUPPER(*s)) {
3089 *s = Py_UNICODE_TOLOWER(*s);
3090 status = 1;
3091 }
3092 s++;
3093 }
3094 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
3097static
3098int fixtitle(PyUnicodeObject *self)
3099{
3100 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3101 register Py_UNICODE *e;
3102 int previous_is_cased;
3103
3104 /* Shortcut for single character strings */
3105 if (PyUnicode_GET_SIZE(self) == 1) {
3106 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3107 if (*p != ch) {
3108 *p = ch;
3109 return 1;
3110 }
3111 else
3112 return 0;
3113 }
3114
3115 e = p + PyUnicode_GET_SIZE(self);
3116 previous_is_cased = 0;
3117 for (; p < e; p++) {
3118 register const Py_UNICODE ch = *p;
3119
3120 if (previous_is_cased)
3121 *p = Py_UNICODE_TOLOWER(ch);
3122 else
3123 *p = Py_UNICODE_TOTITLE(ch);
3124
3125 if (Py_UNICODE_ISLOWER(ch) ||
3126 Py_UNICODE_ISUPPER(ch) ||
3127 Py_UNICODE_ISTITLE(ch))
3128 previous_is_cased = 1;
3129 else
3130 previous_is_cased = 0;
3131 }
3132 return 1;
3133}
3134
3135PyObject *PyUnicode_Join(PyObject *separator,
3136 PyObject *seq)
3137{
3138 Py_UNICODE *sep;
3139 int seplen;
3140 PyUnicodeObject *res = NULL;
3141 int reslen = 0;
3142 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 int sz = 100;
3144 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003145 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146
Tim Peters2cfe3682001-05-05 05:36:48 +00003147 it = PyObject_GetIter(seq);
3148 if (it == NULL)
3149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150
3151 if (separator == NULL) {
3152 Py_UNICODE blank = ' ';
3153 sep = &blank;
3154 seplen = 1;
3155 }
3156 else {
3157 separator = PyUnicode_FromObject(separator);
3158 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 sep = PyUnicode_AS_UNICODE(separator);
3161 seplen = PyUnicode_GET_SIZE(separator);
3162 }
3163
3164 res = _PyUnicode_New(sz);
3165 if (res == NULL)
3166 goto onError;
3167 p = PyUnicode_AS_UNICODE(res);
3168 reslen = 0;
3169
Tim Peters2cfe3682001-05-05 05:36:48 +00003170 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003172 PyObject *item = PyIter_Next(it);
3173 if (item == NULL) {
3174 if (PyErr_Occurred())
3175 goto onError;
3176 break;
3177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 if (!PyUnicode_Check(item)) {
3179 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003180 if (!PyString_Check(item)) {
3181 PyErr_Format(PyExc_TypeError,
3182 "sequence item %i: expected string or Unicode,"
3183 " %.80s found",
3184 i, item->ob_type->tp_name);
3185 Py_DECREF(item);
3186 goto onError;
3187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 v = PyUnicode_FromObject(item);
3189 Py_DECREF(item);
3190 item = v;
3191 if (item == NULL)
3192 goto onError;
3193 }
3194 itemlen = PyUnicode_GET_SIZE(item);
3195 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003196 if (_PyUnicode_Resize(&res, sz*2)) {
3197 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 sz *= 2;
3201 p = PyUnicode_AS_UNICODE(res) + reslen;
3202 }
3203 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003204 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 p += seplen;
3206 reslen += seplen;
3207 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003208 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 p += itemlen;
3210 reslen += itemlen;
3211 Py_DECREF(item);
3212 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003213 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 goto onError;
3215
3216 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003217 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)res;
3219
3220 onError:
3221 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003222 Py_XDECREF(res);
3223 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 return NULL;
3225}
3226
3227static
3228PyUnicodeObject *pad(PyUnicodeObject *self,
3229 int left,
3230 int right,
3231 Py_UNICODE fill)
3232{
3233 PyUnicodeObject *u;
3234
3235 if (left < 0)
3236 left = 0;
3237 if (right < 0)
3238 right = 0;
3239
Tim Peters7a29bd52001-09-12 03:03:31 +00003240 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_INCREF(self);
3242 return self;
3243 }
3244
3245 u = _PyUnicode_New(left + self->length + right);
3246 if (u) {
3247 if (left)
3248 Py_UNICODE_FILL(u->str, fill, left);
3249 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3250 if (right)
3251 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3252 }
3253
3254 return u;
3255}
3256
/* Append the slice data[left:right] to list as a new Unicode object.
   The expanding function must provide a PyObject *str scratch variable,
   the target list, and an onError label that is jumped to on failure.
   The temporary reference in str is always released again.  Wrapped in
   do/while(0) so that the macro behaves like a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3267
3268static
3269PyObject *split_whitespace(PyUnicodeObject *self,
3270 PyObject *list,
3271 int maxcount)
3272{
3273 register int i;
3274 register int j;
3275 int len = self->length;
3276 PyObject *str;
3277
3278 for (i = j = 0; i < len; ) {
3279 /* find a token */
3280 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3281 i++;
3282 j = i;
3283 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3284 i++;
3285 if (j < i) {
3286 if (maxcount-- <= 0)
3287 break;
3288 SPLIT_APPEND(self->str, j, i);
3289 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3290 i++;
3291 j = i;
3292 }
3293 }
3294 if (j < len) {
3295 SPLIT_APPEND(self->str, j, len);
3296 }
3297 return list;
3298
3299 onError:
3300 Py_DECREF(list);
3301 return NULL;
3302}
3303
3304PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003305 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 register int i;
3308 register int j;
3309 int len;
3310 PyObject *list;
3311 PyObject *str;
3312 Py_UNICODE *data;
3313
3314 string = PyUnicode_FromObject(string);
3315 if (string == NULL)
3316 return NULL;
3317 data = PyUnicode_AS_UNICODE(string);
3318 len = PyUnicode_GET_SIZE(string);
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 list = PyList_New(0);
3321 if (!list)
3322 goto onError;
3323
3324 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003325 int eol;
3326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 /* Find a line and append it */
3328 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3329 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003332 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 if (i < len) {
3334 if (data[i] == '\r' && i + 1 < len &&
3335 data[i+1] == '\n')
3336 i += 2;
3337 else
3338 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003339 if (keepends)
3340 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Guido van Rossum86662912000-04-11 15:38:46 +00003342 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 j = i;
3344 }
3345 if (j < len) {
3346 SPLIT_APPEND(data, j, len);
3347 }
3348
3349 Py_DECREF(string);
3350 return list;
3351
3352 onError:
3353 Py_DECREF(list);
3354 Py_DECREF(string);
3355 return NULL;
3356}
3357
3358static
3359PyObject *split_char(PyUnicodeObject *self,
3360 PyObject *list,
3361 Py_UNICODE ch,
3362 int maxcount)
3363{
3364 register int i;
3365 register int j;
3366 int len = self->length;
3367 PyObject *str;
3368
3369 for (i = j = 0; i < len; ) {
3370 if (self->str[i] == ch) {
3371 if (maxcount-- <= 0)
3372 break;
3373 SPLIT_APPEND(self->str, j, i);
3374 i = j = i + 1;
3375 } else
3376 i++;
3377 }
3378 if (j <= len) {
3379 SPLIT_APPEND(self->str, j, len);
3380 }
3381 return list;
3382
3383 onError:
3384 Py_DECREF(list);
3385 return NULL;
3386}
3387
3388static
3389PyObject *split_substring(PyUnicodeObject *self,
3390 PyObject *list,
3391 PyUnicodeObject *substring,
3392 int maxcount)
3393{
3394 register int i;
3395 register int j;
3396 int len = self->length;
3397 int sublen = substring->length;
3398 PyObject *str;
3399
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003400 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 if (Py_UNICODE_MATCH(self, i, substring)) {
3402 if (maxcount-- <= 0)
3403 break;
3404 SPLIT_APPEND(self->str, j, i);
3405 i = j = i + sublen;
3406 } else
3407 i++;
3408 }
3409 if (j <= len) {
3410 SPLIT_APPEND(self->str, j, len);
3411 }
3412 return list;
3413
3414 onError:
3415 Py_DECREF(list);
3416 return NULL;
3417}
3418
3419#undef SPLIT_APPEND
3420
3421static
3422PyObject *split(PyUnicodeObject *self,
3423 PyUnicodeObject *substring,
3424 int maxcount)
3425{
3426 PyObject *list;
3427
3428 if (maxcount < 0)
3429 maxcount = INT_MAX;
3430
3431 list = PyList_New(0);
3432 if (!list)
3433 return NULL;
3434
3435 if (substring == NULL)
3436 return split_whitespace(self,list,maxcount);
3437
3438 else if (substring->length == 1)
3439 return split_char(self,list,substring->str[0],maxcount);
3440
3441 else if (substring->length == 0) {
3442 Py_DECREF(list);
3443 PyErr_SetString(PyExc_ValueError, "empty separator");
3444 return NULL;
3445 }
3446 else
3447 return split_substring(self,list,substring,maxcount);
3448}
3449
3450static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451PyObject *replace(PyUnicodeObject *self,
3452 PyUnicodeObject *str1,
3453 PyUnicodeObject *str2,
3454 int maxcount)
3455{
3456 PyUnicodeObject *u;
3457
Guido van Rossumf36921c2002-08-09 15:36:48 +00003458 if (str1->length == 0) {
3459 PyErr_SetString(PyExc_ValueError, "empty pattern string");
3460 return NULL;
3461 }
3462
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 if (maxcount < 0)
3464 maxcount = INT_MAX;
3465
3466 if (str1->length == 1 && str2->length == 1) {
3467 int i;
3468
3469 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003470 if (!findchar(self->str, self->length, str1->str[0]) &&
3471 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 /* nothing to replace, return original string */
3473 Py_INCREF(self);
3474 u = self;
3475 } else {
3476 Py_UNICODE u1 = str1->str[0];
3477 Py_UNICODE u2 = str2->str[0];
3478
3479 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003480 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 self->length
3482 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003483 if (u != NULL) {
3484 Py_UNICODE_COPY(u->str, self->str,
3485 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 for (i = 0; i < u->length; i++)
3487 if (u->str[i] == u1) {
3488 if (--maxcount < 0)
3489 break;
3490 u->str[i] = u2;
3491 }
3492 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494
3495 } else {
3496 int n, i;
3497 Py_UNICODE *p;
3498
3499 /* replace strings */
3500 n = count(self, 0, self->length, str1);
3501 if (n > maxcount)
3502 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003503 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 /* nothing to replace, return original string */
3505 Py_INCREF(self);
3506 u = self;
3507 } else {
3508 u = _PyUnicode_New(
3509 self->length + n * (str2->length - str1->length));
3510 if (u) {
3511 i = 0;
3512 p = u->str;
3513 while (i <= self->length - str1->length)
3514 if (Py_UNICODE_MATCH(self, i, str1)) {
3515 /* replace string segment */
3516 Py_UNICODE_COPY(p, str2->str, str2->length);
3517 p += str2->length;
3518 i += str1->length;
3519 if (--n <= 0) {
3520 /* copy remaining part */
3521 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3522 break;
3523 }
3524 } else
3525 *p++ = self->str[i++];
3526 }
3527 }
3528 }
3529
3530 return (PyObject *) u;
3531}
3532
3533/* --- Unicode Object Methods --------------------------------------------- */
3534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003535PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536"S.title() -> unicode\n\
3537\n\
3538Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003539characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540
3541static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003542unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 return fixup(self, fixtitle);
3545}
3546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003547PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548"S.capitalize() -> unicode\n\
3549\n\
3550Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003551have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552
3553static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003554unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return fixup(self, fixcapitalize);
3557}
3558
3559#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003560PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561"S.capwords() -> unicode\n\
3562\n\
3563Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003564normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565
3566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003567unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568{
3569 PyObject *list;
3570 PyObject *item;
3571 int i;
3572
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 /* Split into words */
3574 list = split(self, NULL, -1);
3575 if (!list)
3576 return NULL;
3577
3578 /* Capitalize each word */
3579 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3580 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3581 fixcapitalize);
3582 if (item == NULL)
3583 goto onError;
3584 Py_DECREF(PyList_GET_ITEM(list, i));
3585 PyList_SET_ITEM(list, i, item);
3586 }
3587
3588 /* Join the words to form a new string */
3589 item = PyUnicode_Join(NULL, list);
3590
3591onError:
3592 Py_DECREF(list);
3593 return (PyObject *)item;
3594}
3595#endif
3596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003597PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598"S.center(width) -> unicode\n\
3599\n\
3600Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003601using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602
3603static PyObject *
3604unicode_center(PyUnicodeObject *self, PyObject *args)
3605{
3606 int marg, left;
3607 int width;
3608
3609 if (!PyArg_ParseTuple(args, "i:center", &width))
3610 return NULL;
3611
Tim Peters7a29bd52001-09-12 03:03:31 +00003612 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 Py_INCREF(self);
3614 return (PyObject*) self;
3615 }
3616
3617 marg = width - self->length;
3618 left = marg / 2 + (marg & width & 1);
3619
3620 return (PyObject*) pad(self, left, marg - left, ' ');
3621}
3622
Marc-André Lemburge5034372000-08-08 08:04:29 +00003623#if 0
3624
3625/* This code should go into some future Unicode collation support
3626 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003627 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003628
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003629/* speedy UTF-16 code point order comparison */
3630/* gleaned from: */
3631/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3632
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003633static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003634{
3635 0, 0, 0, 0, 0, 0, 0, 0,
3636 0, 0, 0, 0, 0, 0, 0, 0,
3637 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003638 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003639};
3640
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641static int
3642unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3643{
3644 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 Py_UNICODE *s1 = str1->str;
3647 Py_UNICODE *s2 = str2->str;
3648
3649 len1 = str1->length;
3650 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003653 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003654
3655 c1 = *s1++;
3656 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003657
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003658 if (c1 > (1<<11) * 26)
3659 c1 += utf16Fixup[c1>>11];
3660 if (c2 > (1<<11) * 26)
3661 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003662 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003663
3664 if (c1 != c2)
3665 return (c1 < c2) ? -1 : 1;
3666
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003667 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 }
3669
3670 return (len1 < len2) ? -1 : (len1 != len2);
3671}
3672
Marc-André Lemburge5034372000-08-08 08:04:29 +00003673#else
3674
3675static int
3676unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3677{
3678 register int len1, len2;
3679
3680 Py_UNICODE *s1 = str1->str;
3681 Py_UNICODE *s2 = str2->str;
3682
3683 len1 = str1->length;
3684 len2 = str2->length;
3685
3686 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003687 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003688
Fredrik Lundh45714e92001-06-26 16:39:36 +00003689 c1 = *s1++;
3690 c2 = *s2++;
3691
3692 if (c1 != c2)
3693 return (c1 < c2) ? -1 : 1;
3694
Marc-André Lemburge5034372000-08-08 08:04:29 +00003695 len1--; len2--;
3696 }
3697
3698 return (len1 < len2) ? -1 : (len1 != len2);
3699}
3700
3701#endif
3702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703int PyUnicode_Compare(PyObject *left,
3704 PyObject *right)
3705{
3706 PyUnicodeObject *u = NULL, *v = NULL;
3707 int result;
3708
3709 /* Coerce the two arguments */
3710 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3711 if (u == NULL)
3712 goto onError;
3713 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3714 if (v == NULL)
3715 goto onError;
3716
Thomas Wouters7e474022000-07-16 12:04:32 +00003717 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 if (v == u) {
3719 Py_DECREF(u);
3720 Py_DECREF(v);
3721 return 0;
3722 }
3723
3724 result = unicode_compare(u, v);
3725
3726 Py_DECREF(u);
3727 Py_DECREF(v);
3728 return result;
3729
3730onError:
3731 Py_XDECREF(u);
3732 Py_XDECREF(v);
3733 return -1;
3734}
3735
Guido van Rossum403d68b2000-03-13 15:55:09 +00003736int PyUnicode_Contains(PyObject *container,
3737 PyObject *element)
3738{
3739 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00003740 int result, size;
3741 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00003742
3743 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003744 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003745 if (v == NULL) {
3746 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00003747 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003748 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003749 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003750 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3751 if (u == NULL) {
3752 Py_DECREF(v);
3753 goto onError;
3754 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003755
Barry Warsaw817918c2002-08-06 16:58:21 +00003756 size = PyUnicode_GET_SIZE(v);
3757 rhs = PyUnicode_AS_UNICODE(v);
3758 lhs = PyUnicode_AS_UNICODE(u);
3759
Guido van Rossum403d68b2000-03-13 15:55:09 +00003760 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00003761 if (size == 1) {
3762 end = lhs + PyUnicode_GET_SIZE(u);
3763 while (lhs < end) {
3764 if (*lhs++ == *rhs) {
3765 result = 1;
3766 break;
3767 }
3768 }
3769 }
3770 else {
3771 end = lhs + (PyUnicode_GET_SIZE(u) - size);
3772 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00003773 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00003774 result = 1;
3775 break;
3776 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003777 }
3778 }
3779
3780 Py_DECREF(u);
3781 Py_DECREF(v);
3782 return result;
3783
3784onError:
3785 Py_XDECREF(u);
3786 Py_XDECREF(v);
3787 return -1;
3788}
3789
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790/* Concat to string or Unicode object giving a new Unicode object. */
3791
3792PyObject *PyUnicode_Concat(PyObject *left,
3793 PyObject *right)
3794{
3795 PyUnicodeObject *u = NULL, *v = NULL, *w;
3796
3797 /* Coerce the two arguments */
3798 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3799 if (u == NULL)
3800 goto onError;
3801 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3802 if (v == NULL)
3803 goto onError;
3804
3805 /* Shortcuts */
3806 if (v == unicode_empty) {
3807 Py_DECREF(v);
3808 return (PyObject *)u;
3809 }
3810 if (u == unicode_empty) {
3811 Py_DECREF(u);
3812 return (PyObject *)v;
3813 }
3814
3815 /* Concat the two Unicode strings */
3816 w = _PyUnicode_New(u->length + v->length);
3817 if (w == NULL)
3818 goto onError;
3819 Py_UNICODE_COPY(w->str, u->str, u->length);
3820 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3821
3822 Py_DECREF(u);
3823 Py_DECREF(v);
3824 return (PyObject *)w;
3825
3826onError:
3827 Py_XDECREF(u);
3828 Py_XDECREF(v);
3829 return NULL;
3830}
3831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003832PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833"S.count(sub[, start[, end]]) -> int\n\
3834\n\
3835Return the number of occurrences of substring sub in Unicode string\n\
3836S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003837interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838
3839static PyObject *
3840unicode_count(PyUnicodeObject *self, PyObject *args)
3841{
3842 PyUnicodeObject *substring;
3843 int start = 0;
3844 int end = INT_MAX;
3845 PyObject *result;
3846
Guido van Rossumb8872e62000-05-09 14:14:27 +00003847 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3848 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 return NULL;
3850
3851 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3852 (PyObject *)substring);
3853 if (substring == NULL)
3854 return NULL;
3855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 if (start < 0)
3857 start += self->length;
3858 if (start < 0)
3859 start = 0;
3860 if (end > self->length)
3861 end = self->length;
3862 if (end < 0)
3863 end += self->length;
3864 if (end < 0)
3865 end = 0;
3866
3867 result = PyInt_FromLong((long) count(self, start, end, substring));
3868
3869 Py_DECREF(substring);
3870 return result;
3871}
3872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003873PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874"S.encode([encoding[,errors]]) -> string\n\
3875\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003876Return an encoded string version of S. Default encoding is the current\n\
3877default string encoding. errors may be given to set a different error\n\
3878handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003879a ValueError. Other possible values are 'ignore' and 'replace'.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880
3881static PyObject *
3882unicode_encode(PyUnicodeObject *self, PyObject *args)
3883{
3884 char *encoding = NULL;
3885 char *errors = NULL;
3886 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3887 return NULL;
3888 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3889}
3890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003891PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892"S.expandtabs([tabsize]) -> unicode\n\
3893\n\
3894Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003895If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896
3897static PyObject*
3898unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3899{
3900 Py_UNICODE *e;
3901 Py_UNICODE *p;
3902 Py_UNICODE *q;
3903 int i, j;
3904 PyUnicodeObject *u;
3905 int tabsize = 8;
3906
3907 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3908 return NULL;
3909
Thomas Wouters7e474022000-07-16 12:04:32 +00003910 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 i = j = 0;
3912 e = self->str + self->length;
3913 for (p = self->str; p < e; p++)
3914 if (*p == '\t') {
3915 if (tabsize > 0)
3916 j += tabsize - (j % tabsize);
3917 }
3918 else {
3919 j++;
3920 if (*p == '\n' || *p == '\r') {
3921 i += j;
3922 j = 0;
3923 }
3924 }
3925
3926 /* Second pass: create output string and fill it */
3927 u = _PyUnicode_New(i + j);
3928 if (!u)
3929 return NULL;
3930
3931 j = 0;
3932 q = u->str;
3933
3934 for (p = self->str; p < e; p++)
3935 if (*p == '\t') {
3936 if (tabsize > 0) {
3937 i = tabsize - (j % tabsize);
3938 j += i;
3939 while (i--)
3940 *q++ = ' ';
3941 }
3942 }
3943 else {
3944 j++;
3945 *q++ = *p;
3946 if (*p == '\n' || *p == '\r')
3947 j = 0;
3948 }
3949
3950 return (PyObject*) u;
3951}
3952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003953PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954"S.find(sub [,start [,end]]) -> int\n\
3955\n\
3956Return the lowest index in S where substring sub is found,\n\
3957such that sub is contained within s[start,end]. Optional\n\
3958arguments start and end are interpreted as in slice notation.\n\
3959\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003960Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961
3962static PyObject *
3963unicode_find(PyUnicodeObject *self, PyObject *args)
3964{
3965 PyUnicodeObject *substring;
3966 int start = 0;
3967 int end = INT_MAX;
3968 PyObject *result;
3969
Guido van Rossumb8872e62000-05-09 14:14:27 +00003970 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3971 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 return NULL;
3973 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3974 (PyObject *)substring);
3975 if (substring == NULL)
3976 return NULL;
3977
3978 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3979
3980 Py_DECREF(substring);
3981 return result;
3982}
3983
3984static PyObject *
3985unicode_getitem(PyUnicodeObject *self, int index)
3986{
3987 if (index < 0 || index >= self->length) {
3988 PyErr_SetString(PyExc_IndexError, "string index out of range");
3989 return NULL;
3990 }
3991
3992 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3993}
3994
3995static long
3996unicode_hash(PyUnicodeObject *self)
3997{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003998 /* Since Unicode objects compare equal to their ASCII string
3999 counterparts, they should use the individual character values
4000 as basis for their hash value. This is needed to assure that
4001 strings and Unicode objects behave in the same way as
4002 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003
Fredrik Lundhdde61642000-07-10 18:27:47 +00004004 register int len;
4005 register Py_UNICODE *p;
4006 register long x;
4007
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 if (self->hash != -1)
4009 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004010 len = PyUnicode_GET_SIZE(self);
4011 p = PyUnicode_AS_UNICODE(self);
4012 x = *p << 7;
4013 while (--len >= 0)
4014 x = (1000003*x) ^ *p++;
4015 x ^= PyUnicode_GET_SIZE(self);
4016 if (x == -1)
4017 x = -2;
4018 self->hash = x;
4019 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020}
4021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004022PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023"S.index(sub [,start [,end]]) -> int\n\
4024\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004025Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026
4027static PyObject *
4028unicode_index(PyUnicodeObject *self, PyObject *args)
4029{
4030 int result;
4031 PyUnicodeObject *substring;
4032 int start = 0;
4033 int end = INT_MAX;
4034
Guido van Rossumb8872e62000-05-09 14:14:27 +00004035 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4036 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 return NULL;
4038
4039 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4040 (PyObject *)substring);
4041 if (substring == NULL)
4042 return NULL;
4043
4044 result = findstring(self, substring, start, end, 1);
4045
4046 Py_DECREF(substring);
4047 if (result < 0) {
4048 PyErr_SetString(PyExc_ValueError, "substring not found");
4049 return NULL;
4050 }
4051 return PyInt_FromLong(result);
4052}
4053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004054PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004055"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004057Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004058at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059
4060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004061unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062{
4063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4064 register const Py_UNICODE *e;
4065 int cased;
4066
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 /* Shortcut for single character strings */
4068 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004069 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004071 /* Special case for empty strings */
4072 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004073 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004074
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 e = p + PyUnicode_GET_SIZE(self);
4076 cased = 0;
4077 for (; p < e; p++) {
4078 register const Py_UNICODE ch = *p;
4079
4080 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004081 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 else if (!cased && Py_UNICODE_ISLOWER(ch))
4083 cased = 1;
4084 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004085 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086}
4087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004088PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004089"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004091Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004092at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093
4094static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004095unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096{
4097 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4098 register const Py_UNICODE *e;
4099 int cased;
4100
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 /* Shortcut for single character strings */
4102 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004103 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004105 /* Special case for empty strings */
4106 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004107 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004108
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 e = p + PyUnicode_GET_SIZE(self);
4110 cased = 0;
4111 for (; p < e; p++) {
4112 register const Py_UNICODE ch = *p;
4113
4114 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004115 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 else if (!cased && Py_UNICODE_ISUPPER(ch))
4117 cased = 1;
4118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004119 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120}
4121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004122PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004123"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004125Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4126characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004127only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
4129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004130unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131{
4132 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4133 register const Py_UNICODE *e;
4134 int cased, previous_is_cased;
4135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 /* Shortcut for single character strings */
4137 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004138 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4139 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004141 /* Special case for empty strings */
4142 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004144
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 e = p + PyUnicode_GET_SIZE(self);
4146 cased = 0;
4147 previous_is_cased = 0;
4148 for (; p < e; p++) {
4149 register const Py_UNICODE ch = *p;
4150
4151 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4152 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004153 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 previous_is_cased = 1;
4155 cased = 1;
4156 }
4157 else if (Py_UNICODE_ISLOWER(ch)) {
4158 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004159 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 previous_is_cased = 1;
4161 cased = 1;
4162 }
4163 else
4164 previous_is_cased = 0;
4165 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004166 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167}
4168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004169PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004170"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004172Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004173False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174
4175static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004176unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177{
4178 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4179 register const Py_UNICODE *e;
4180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 /* Shortcut for single character strings */
4182 if (PyUnicode_GET_SIZE(self) == 1 &&
4183 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004184 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004186 /* Special case for empty strings */
4187 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004189
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 e = p + PyUnicode_GET_SIZE(self);
4191 for (; p < e; p++) {
4192 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004193 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004195 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196}
4197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004198PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004199"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004200\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004201Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004202and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004203
4204static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004205unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004206{
4207 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4208 register const Py_UNICODE *e;
4209
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004210 /* Shortcut for single character strings */
4211 if (PyUnicode_GET_SIZE(self) == 1 &&
4212 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004213 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004214
4215 /* Special case for empty strings */
4216 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004218
4219 e = p + PyUnicode_GET_SIZE(self);
4220 for (; p < e; p++) {
4221 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004222 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004223 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004224 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004225}
4226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004227PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004228"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004229\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004230Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004231and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004232
4233static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004234unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004235{
4236 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4237 register const Py_UNICODE *e;
4238
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004239 /* Shortcut for single character strings */
4240 if (PyUnicode_GET_SIZE(self) == 1 &&
4241 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004242 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004243
4244 /* Special case for empty strings */
4245 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004247
4248 e = p + PyUnicode_GET_SIZE(self);
4249 for (; p < e; p++) {
4250 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004251 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004252 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004253 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004254}
4255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004256PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004257"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004259Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004260False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261
4262static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004263unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264{
4265 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4266 register const Py_UNICODE *e;
4267
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 /* Shortcut for single character strings */
4269 if (PyUnicode_GET_SIZE(self) == 1 &&
4270 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004271 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004273 /* Special case for empty strings */
4274 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004276
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 e = p + PyUnicode_GET_SIZE(self);
4278 for (; p < e; p++) {
4279 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004280 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004282 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283}
4284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004285PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004286"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004288Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004289False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290
4291static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004292unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293{
4294 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4295 register const Py_UNICODE *e;
4296
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 /* Shortcut for single character strings */
4298 if (PyUnicode_GET_SIZE(self) == 1 &&
4299 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004300 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004302 /* Special case for empty strings */
4303 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 e = p + PyUnicode_GET_SIZE(self);
4307 for (; p < e; p++) {
4308 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004309 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004311 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312}
4313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004314PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004315"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004317Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004318False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
4320static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004321unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322{
4323 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4324 register const Py_UNICODE *e;
4325
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 /* Shortcut for single character strings */
4327 if (PyUnicode_GET_SIZE(self) == 1 &&
4328 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004329 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004331 /* Special case for empty strings */
4332 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004333 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004334
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 e = p + PyUnicode_GET_SIZE(self);
4336 for (; p < e; p++) {
4337 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004338 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004340 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341}
4342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004343PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344"S.join(sequence) -> unicode\n\
4345\n\
4346Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004347sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348
4349static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004350unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004352 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353}
4354
4355static int
4356unicode_length(PyUnicodeObject *self)
4357{
4358 return self->length;
4359}
4360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004361PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362"S.ljust(width) -> unicode\n\
4363\n\
4364Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004365done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366
4367static PyObject *
4368unicode_ljust(PyUnicodeObject *self, PyObject *args)
4369{
4370 int width;
4371 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4372 return NULL;
4373
Tim Peters7a29bd52001-09-12 03:03:31 +00004374 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375 Py_INCREF(self);
4376 return (PyObject*) self;
4377 }
4378
4379 return (PyObject*) pad(self, 0, width - self->length, ' ');
4380}
4381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004382PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383"S.lower() -> unicode\n\
4384\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004385Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386
4387static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004388unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 return fixup(self, fixlower);
4391}
4392
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Bare method name for error messages: skips the 3-character "|O:"
   prefix of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
4401
4402static const Py_UNICODE *
4403unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4404{
Tim Peters030a5ce2002-04-22 19:00:10 +00004405 size_t i;
4406 for (i = 0; i < n; ++i)
4407 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004408 return s+i;
4409 return NULL;
4410}
4411
4412/* externally visible for str.strip(unicode) */
4413PyObject *
4414_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4415{
4416 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4417 int len = PyUnicode_GET_SIZE(self);
4418 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4419 int seplen = PyUnicode_GET_SIZE(sepobj);
4420 int i, j;
4421
4422 i = 0;
4423 if (striptype != RIGHTSTRIP) {
4424 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4425 i++;
4426 }
4427 }
4428
4429 j = len;
4430 if (striptype != LEFTSTRIP) {
4431 do {
4432 j--;
4433 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4434 j++;
4435 }
4436
4437 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4438 Py_INCREF(self);
4439 return (PyObject*)self;
4440 }
4441 else
4442 return PyUnicode_FromUnicode(s+i, j-i);
4443}
4444
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
4446static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004447do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004449 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4450 int len = PyUnicode_GET_SIZE(self), i, j;
4451
4452 i = 0;
4453 if (striptype != RIGHTSTRIP) {
4454 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4455 i++;
4456 }
4457 }
4458
4459 j = len;
4460 if (striptype != LEFTSTRIP) {
4461 do {
4462 j--;
4463 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4464 j++;
4465 }
4466
4467 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4468 Py_INCREF(self);
4469 return (PyObject*)self;
4470 }
4471 else
4472 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473}
4474
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004475
4476static PyObject *
4477do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4478{
4479 PyObject *sep = NULL;
4480
4481 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4482 return NULL;
4483
4484 if (sep != NULL && sep != Py_None) {
4485 if (PyUnicode_Check(sep))
4486 return _PyUnicode_XStrip(self, striptype, sep);
4487 else if (PyString_Check(sep)) {
4488 PyObject *res;
4489 sep = PyUnicode_FromObject(sep);
4490 if (sep==NULL)
4491 return NULL;
4492 res = _PyUnicode_XStrip(self, striptype, sep);
4493 Py_DECREF(sep);
4494 return res;
4495 }
4496 else {
4497 PyErr_Format(PyExc_TypeError,
4498 "%s arg must be None, unicode or str",
4499 STRIPNAME(striptype));
4500 return NULL;
4501 }
4502 }
4503
4504 return do_strip(self, striptype);
4505}
4506
4507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004508PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004509"S.strip([sep]) -> unicode\n\
4510\n\
4511Return a copy of the string S with leading and trailing\n\
4512whitespace removed.\n\
4513If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004514If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004515
4516static PyObject *
4517unicode_strip(PyUnicodeObject *self, PyObject *args)
4518{
4519 if (PyTuple_GET_SIZE(args) == 0)
4520 return do_strip(self, BOTHSTRIP); /* Common case */
4521 else
4522 return do_argstrip(self, BOTHSTRIP, args);
4523}
4524
4525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004526PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004527"S.lstrip([sep]) -> unicode\n\
4528\n\
4529Return a copy of the string S with leading whitespace removed.\n\
4530If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004531If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004532
4533static PyObject *
4534unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4535{
4536 if (PyTuple_GET_SIZE(args) == 0)
4537 return do_strip(self, LEFTSTRIP); /* Common case */
4538 else
4539 return do_argstrip(self, LEFTSTRIP, args);
4540}
4541
4542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004543PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004544"S.rstrip([sep]) -> unicode\n\
4545\n\
4546Return a copy of the string S with trailing whitespace removed.\n\
4547If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004548If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004549
4550static PyObject *
4551unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4552{
4553 if (PyTuple_GET_SIZE(args) == 0)
4554 return do_strip(self, RIGHTSTRIP); /* Common case */
4555 else
4556 return do_argstrip(self, RIGHTSTRIP, args);
4557}
4558
4559
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560static PyObject*
4561unicode_repeat(PyUnicodeObject *str, int len)
4562{
4563 PyUnicodeObject *u;
4564 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004565 int nchars;
4566 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
4568 if (len < 0)
4569 len = 0;
4570
Tim Peters7a29bd52001-09-12 03:03:31 +00004571 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 /* no repeat, return original string */
4573 Py_INCREF(str);
4574 return (PyObject*) str;
4575 }
Tim Peters8f422462000-09-09 06:13:41 +00004576
4577 /* ensure # of chars needed doesn't overflow int and # of bytes
4578 * needed doesn't overflow size_t
4579 */
4580 nchars = len * str->length;
4581 if (len && nchars / len != str->length) {
4582 PyErr_SetString(PyExc_OverflowError,
4583 "repeated string is too long");
4584 return NULL;
4585 }
4586 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4587 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4588 PyErr_SetString(PyExc_OverflowError,
4589 "repeated string is too long");
4590 return NULL;
4591 }
4592 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 if (!u)
4594 return NULL;
4595
4596 p = u->str;
4597
4598 while (len-- > 0) {
4599 Py_UNICODE_COPY(p, str->str, str->length);
4600 p += str->length;
4601 }
4602
4603 return (PyObject*) u;
4604}
4605
4606PyObject *PyUnicode_Replace(PyObject *obj,
4607 PyObject *subobj,
4608 PyObject *replobj,
4609 int maxcount)
4610{
4611 PyObject *self;
4612 PyObject *str1;
4613 PyObject *str2;
4614 PyObject *result;
4615
4616 self = PyUnicode_FromObject(obj);
4617 if (self == NULL)
4618 return NULL;
4619 str1 = PyUnicode_FromObject(subobj);
4620 if (str1 == NULL) {
4621 Py_DECREF(self);
4622 return NULL;
4623 }
4624 str2 = PyUnicode_FromObject(replobj);
4625 if (str2 == NULL) {
4626 Py_DECREF(self);
4627 Py_DECREF(str1);
4628 return NULL;
4629 }
4630 result = replace((PyUnicodeObject *)self,
4631 (PyUnicodeObject *)str1,
4632 (PyUnicodeObject *)str2,
4633 maxcount);
4634 Py_DECREF(self);
4635 Py_DECREF(str1);
4636 Py_DECREF(str2);
4637 return result;
4638}
4639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004640PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641"S.replace (old, new[, maxsplit]) -> unicode\n\
4642\n\
4643Return a copy of S with all occurrences of substring\n\
4644old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004645given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646
4647static PyObject*
4648unicode_replace(PyUnicodeObject *self, PyObject *args)
4649{
4650 PyUnicodeObject *str1;
4651 PyUnicodeObject *str2;
4652 int maxcount = -1;
4653 PyObject *result;
4654
4655 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4656 return NULL;
4657 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4658 if (str1 == NULL)
4659 return NULL;
4660 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4661 if (str2 == NULL)
4662 return NULL;
4663
4664 result = replace(self, str1, str2, maxcount);
4665
4666 Py_DECREF(str1);
4667 Py_DECREF(str2);
4668 return result;
4669}
4670
4671static
4672PyObject *unicode_repr(PyObject *unicode)
4673{
4674 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4675 PyUnicode_GET_SIZE(unicode),
4676 1);
4677}
4678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004679PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680"S.rfind(sub [,start [,end]]) -> int\n\
4681\n\
4682Return the highest index in S where substring sub is found,\n\
4683such that sub is contained within s[start,end]. Optional\n\
4684arguments start and end are interpreted as in slice notation.\n\
4685\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004686Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687
4688static PyObject *
4689unicode_rfind(PyUnicodeObject *self, PyObject *args)
4690{
4691 PyUnicodeObject *substring;
4692 int start = 0;
4693 int end = INT_MAX;
4694 PyObject *result;
4695
Guido van Rossumb8872e62000-05-09 14:14:27 +00004696 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4697 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 return NULL;
4699 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4700 (PyObject *)substring);
4701 if (substring == NULL)
4702 return NULL;
4703
4704 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4705
4706 Py_DECREF(substring);
4707 return result;
4708}
4709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004710PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711"S.rindex(sub [,start [,end]]) -> int\n\
4712\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004713Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714
4715static PyObject *
4716unicode_rindex(PyUnicodeObject *self, PyObject *args)
4717{
4718 int result;
4719 PyUnicodeObject *substring;
4720 int start = 0;
4721 int end = INT_MAX;
4722
Guido van Rossumb8872e62000-05-09 14:14:27 +00004723 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4724 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 return NULL;
4726 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4727 (PyObject *)substring);
4728 if (substring == NULL)
4729 return NULL;
4730
4731 result = findstring(self, substring, start, end, -1);
4732
4733 Py_DECREF(substring);
4734 if (result < 0) {
4735 PyErr_SetString(PyExc_ValueError, "substring not found");
4736 return NULL;
4737 }
4738 return PyInt_FromLong(result);
4739}
4740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004741PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742"S.rjust(width) -> unicode\n\
4743\n\
4744Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004745done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
4747static PyObject *
4748unicode_rjust(PyUnicodeObject *self, PyObject *args)
4749{
4750 int width;
4751 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4752 return NULL;
4753
Tim Peters7a29bd52001-09-12 03:03:31 +00004754 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 Py_INCREF(self);
4756 return (PyObject*) self;
4757 }
4758
4759 return (PyObject*) pad(self, width - self->length, 0, ' ');
4760}
4761
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762static PyObject*
4763unicode_slice(PyUnicodeObject *self, int start, int end)
4764{
4765 /* standard clamping */
4766 if (start < 0)
4767 start = 0;
4768 if (end < 0)
4769 end = 0;
4770 if (end > self->length)
4771 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004772 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 /* full slice, return original string */
4774 Py_INCREF(self);
4775 return (PyObject*) self;
4776 }
4777 if (start > end)
4778 start = end;
4779 /* copy slice */
4780 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4781 end - start);
4782}
4783
4784PyObject *PyUnicode_Split(PyObject *s,
4785 PyObject *sep,
4786 int maxsplit)
4787{
4788 PyObject *result;
4789
4790 s = PyUnicode_FromObject(s);
4791 if (s == NULL)
4792 return NULL;
4793 if (sep != NULL) {
4794 sep = PyUnicode_FromObject(sep);
4795 if (sep == NULL) {
4796 Py_DECREF(s);
4797 return NULL;
4798 }
4799 }
4800
4801 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4802
4803 Py_DECREF(s);
4804 Py_XDECREF(sep);
4805 return result;
4806}
4807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004808PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809"S.split([sep [,maxsplit]]) -> list of strings\n\
4810\n\
4811Return a list of the words in S, using sep as the\n\
4812delimiter string. If maxsplit is given, at most maxsplit\n\
4813splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004814is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815
4816static PyObject*
4817unicode_split(PyUnicodeObject *self, PyObject *args)
4818{
4819 PyObject *substring = Py_None;
4820 int maxcount = -1;
4821
4822 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4823 return NULL;
4824
4825 if (substring == Py_None)
4826 return split(self, NULL, maxcount);
4827 else if (PyUnicode_Check(substring))
4828 return split(self, (PyUnicodeObject *)substring, maxcount);
4829 else
4830 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4831}
4832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004833PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00004834"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835\n\
4836Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004837Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004838is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839
4840static PyObject*
4841unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4842{
Guido van Rossum86662912000-04-11 15:38:46 +00004843 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844
Guido van Rossum86662912000-04-11 15:38:46 +00004845 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 return NULL;
4847
Guido van Rossum86662912000-04-11 15:38:46 +00004848 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849}
4850
4851static
4852PyObject *unicode_str(PyUnicodeObject *self)
4853{
Fred Drakee4315f52000-05-09 19:53:39 +00004854 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855}
4856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004857PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858"S.swapcase() -> unicode\n\
4859\n\
4860Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004861and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
4863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004864unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 return fixup(self, fixswapcase);
4867}
4868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004869PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870"S.translate(table) -> unicode\n\
4871\n\
4872Return a copy of the string S, where all characters have been mapped\n\
4873through the given translation table, which must be a mapping of\n\
4874Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004875are left untouched. Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876
4877static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004878unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 return PyUnicode_TranslateCharmap(self->str,
4881 self->length,
4882 table,
4883 "ignore");
4884}
4885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004886PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887"S.upper() -> unicode\n\
4888\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004889Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890
4891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004892unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 return fixup(self, fixupper);
4895}
4896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004897PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898"S.zfill(width) -> unicode\n\
4899\n\
4900Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004901of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902
4903static PyObject *
4904unicode_zfill(PyUnicodeObject *self, PyObject *args)
4905{
4906 int fill;
4907 PyUnicodeObject *u;
4908
4909 int width;
4910 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4911 return NULL;
4912
4913 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004914 if (PyUnicode_CheckExact(self)) {
4915 Py_INCREF(self);
4916 return (PyObject*) self;
4917 }
4918 else
4919 return PyUnicode_FromUnicode(
4920 PyUnicode_AS_UNICODE(self),
4921 PyUnicode_GET_SIZE(self)
4922 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 }
4924
4925 fill = width - self->length;
4926
4927 u = pad(self, fill, 0, '0');
4928
Walter Dörwald068325e2002-04-15 13:36:47 +00004929 if (u == NULL)
4930 return NULL;
4931
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 if (u->str[fill] == '+' || u->str[fill] == '-') {
4933 /* move sign to beginning of string */
4934 u->str[0] = u->str[fill];
4935 u->str[fill] = '0';
4936 }
4937
4938 return (PyObject*) u;
4939}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940
#if 0
/* Compiled out: return unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004949PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004950"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004952Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004954comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955
4956static PyObject *
4957unicode_startswith(PyUnicodeObject *self,
4958 PyObject *args)
4959{
4960 PyUnicodeObject *substring;
4961 int start = 0;
4962 int end = INT_MAX;
4963 PyObject *result;
4964
Guido van Rossumb8872e62000-05-09 14:14:27 +00004965 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4966 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 return NULL;
4968 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4969 (PyObject *)substring);
4970 if (substring == NULL)
4971 return NULL;
4972
Guido van Rossum77f6a652002-04-03 22:41:51 +00004973 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974
4975 Py_DECREF(substring);
4976 return result;
4977}
4978
4979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004980PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004981"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004983Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004985comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986
4987static PyObject *
4988unicode_endswith(PyUnicodeObject *self,
4989 PyObject *args)
4990{
4991 PyUnicodeObject *substring;
4992 int start = 0;
4993 int end = INT_MAX;
4994 PyObject *result;
4995
Guido van Rossumb8872e62000-05-09 14:14:27 +00004996 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4997 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 return NULL;
4999 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5000 (PyObject *)substring);
5001 if (substring == NULL)
5002 return NULL;
5003
Guido van Rossum77f6a652002-04-03 22:41:51 +00005004 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005
5006 Py_DECREF(substring);
5007 return result;
5008}
5009
5010
5011static PyMethodDef unicode_methods[] = {
5012
5013 /* Order is according to common usage: often used methods should
5014 appear first, since lookup is done sequentially. */
5015
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005016 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5017 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5018 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5019 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5020 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5021 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5022 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5023 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5024 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5025 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5026 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5027 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5028 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005029 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005030/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5031 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5032 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5033 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005034 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005035 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005036 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005037 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5038 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5039 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5040 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5041 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5042 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5043 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5044 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5045 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5046 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5047 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5048 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5049 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5050 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005051 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005052#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005053 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054#endif
5055
5056#if 0
5057 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005058 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059#endif
5060
5061 {NULL, NULL}
5062};
5063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064static PySequenceMethods unicode_as_sequence = {
5065 (inquiry) unicode_length, /* sq_length */
5066 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5067 (intargfunc) unicode_repeat, /* sq_repeat */
5068 (intargfunc) unicode_getitem, /* sq_item */
5069 (intintargfunc) unicode_slice, /* sq_slice */
5070 0, /* sq_ass_item */
5071 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005072 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073};
5074
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005075static PyObject*
5076unicode_subscript(PyUnicodeObject* self, PyObject* item)
5077{
5078 if (PyInt_Check(item)) {
5079 long i = PyInt_AS_LONG(item);
5080 if (i < 0)
5081 i += PyString_GET_SIZE(self);
5082 return unicode_getitem(self, i);
5083 } else if (PyLong_Check(item)) {
5084 long i = PyLong_AsLong(item);
5085 if (i == -1 && PyErr_Occurred())
5086 return NULL;
5087 if (i < 0)
5088 i += PyString_GET_SIZE(self);
5089 return unicode_getitem(self, i);
5090 } else if (PySlice_Check(item)) {
5091 int start, stop, step, slicelength, cur, i;
5092 Py_UNICODE* source_buf;
5093 Py_UNICODE* result_buf;
5094 PyObject* result;
5095
5096 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5097 &start, &stop, &step, &slicelength) < 0) {
5098 return NULL;
5099 }
5100
5101 if (slicelength <= 0) {
5102 return PyUnicode_FromUnicode(NULL, 0);
5103 } else {
5104 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5105 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5106
5107 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5108 result_buf[i] = source_buf[cur];
5109 }
5110
5111 result = PyUnicode_FromUnicode(result_buf, slicelength);
5112 PyMem_FREE(result_buf);
5113 return result;
5114 }
5115 } else {
5116 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5117 return NULL;
5118 }
5119}
5120
5121static PyMappingMethods unicode_as_mapping = {
5122 (inquiry)unicode_length, /* mp_length */
5123 (binaryfunc)unicode_subscript, /* mp_subscript */
5124 (objobjargproc)0, /* mp_ass_subscript */
5125};
5126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127static int
5128unicode_buffer_getreadbuf(PyUnicodeObject *self,
5129 int index,
5130 const void **ptr)
5131{
5132 if (index != 0) {
5133 PyErr_SetString(PyExc_SystemError,
5134 "accessing non-existent unicode segment");
5135 return -1;
5136 }
5137 *ptr = (void *) self->str;
5138 return PyUnicode_GET_DATA_SIZE(self);
5139}
5140
5141static int
5142unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5143 const void **ptr)
5144{
5145 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005146 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 return -1;
5148}
5149
5150static int
5151unicode_buffer_getsegcount(PyUnicodeObject *self,
5152 int *lenp)
5153{
5154 if (lenp)
5155 *lenp = PyUnicode_GET_DATA_SIZE(self);
5156 return 1;
5157}
5158
5159static int
5160unicode_buffer_getcharbuf(PyUnicodeObject *self,
5161 int index,
5162 const void **ptr)
5163{
5164 PyObject *str;
5165
5166 if (index != 0) {
5167 PyErr_SetString(PyExc_SystemError,
5168 "accessing non-existent unicode segment");
5169 return -1;
5170 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005171 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 if (str == NULL)
5173 return -1;
5174 *ptr = (void *) PyString_AS_STRING(str);
5175 return PyString_GET_SIZE(str);
5176}
5177
5178/* Helpers for PyUnicode_Format() */
5179
5180static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005181getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182{
5183 int argidx = *p_argidx;
5184 if (argidx < arglen) {
5185 (*p_argidx)++;
5186 if (arglen < 0)
5187 return args;
5188 else
5189 return PyTuple_GetItem(args, argidx);
5190 }
5191 PyErr_SetString(PyExc_TypeError,
5192 "not enough arguments for format string");
5193 return NULL;
5194}
5195
/* Formatting flag bits; each flag occupies its own bit so they can be
   OR-ed together in a single int.  (F_ALT is what formatfloat() and
   formatint() test to decide whether '#' goes into the conversion
   spec.) */
#define F_LJUST     (1<<0)
#define F_SIGN      (1<<1)
#define F_BLANK     (1<<2)
#define F_ALT       (1<<3)
#define F_ZERO      (1<<4)
5201
5202static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204{
5205 register int i;
5206 int len;
5207 va_list va;
5208 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
5211 /* First, format the string as char array, then expand to Py_UNICODE
5212 array. */
5213 charbuffer = (char *)buffer;
5214 len = vsprintf(charbuffer, format, va);
5215 for (i = len - 1; i >= 0; i--)
5216 buffer[i] = (Py_UNICODE) charbuffer[i];
5217
5218 va_end(va);
5219 return len;
5220}
5221
5222static int
5223formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005224 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 int flags,
5226 int prec,
5227 int type,
5228 PyObject *v)
5229{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005230 /* fmt = '%#.' + `prec` + `type`
5231 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 char fmt[20];
5233 double x;
5234
5235 x = PyFloat_AsDouble(v);
5236 if (x == -1.0 && PyErr_Occurred())
5237 return -1;
5238 if (prec < 0)
5239 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5241 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005242 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5243 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005244 /* worst case length calc to ensure no buffer overrun:
5245 fmt = %#.<prec>g
5246 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5247 for any double rep.)
5248 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5249 If prec=0 the effective precision is 1 (the leading digit is
5250 always given), therefore increase by one to 10+prec. */
5251 if (buflen <= (size_t)10 + (size_t)prec) {
5252 PyErr_SetString(PyExc_OverflowError,
5253 "formatted float is too long (precision too long?)");
5254 return -1;
5255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 return usprintf(buf, fmt, x);
5257}
5258
Tim Peters38fd5b62000-09-21 05:43:11 +00005259static PyObject*
5260formatlong(PyObject *val, int flags, int prec, int type)
5261{
5262 char *buf;
5263 int i, len;
5264 PyObject *str; /* temporary string object. */
5265 PyUnicodeObject *result;
5266
5267 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5268 if (!str)
5269 return NULL;
5270 result = _PyUnicode_New(len);
5271 for (i = 0; i < len; i++)
5272 result->str[i] = buf[i];
5273 result->str[len] = 0;
5274 Py_DECREF(str);
5275 return (PyObject*)result;
5276}
5277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278static int
5279formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005280 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 int flags,
5282 int prec,
5283 int type,
5284 PyObject *v)
5285{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005286 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005287 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5288 * + 1 + 1
5289 * = 24
5290 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005291 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 long x;
5293
5294 x = PyInt_AsLong(v);
5295 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005296 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005298 prec = 1;
5299
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005300 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005301 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5302 */
5303 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005304 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005305 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005306 return -1;
5307 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005308
5309 if ((flags & F_ALT) &&
5310 (type == 'x' || type == 'X')) {
5311 /* When converting under %#x or %#X, there are a number
5312 * of issues that cause pain:
5313 * - when 0 is being converted, the C standard leaves off
5314 * the '0x' or '0X', which is inconsistent with other
5315 * %#x/%#X conversions and inconsistent with Python's
5316 * hex() function
5317 * - there are platforms that violate the standard and
5318 * convert 0 with the '0x' or '0X'
5319 * (Metrowerks, Compaq Tru64)
5320 * - there are platforms that give '0x' when converting
5321 * under %#X, but convert 0 in accordance with the
5322 * standard (OS/2 EMX)
5323 *
5324 * We can achieve the desired consistency by inserting our
5325 * own '0x' or '0X' prefix, and substituting %x/%X in place
5326 * of %#x/%#X.
5327 *
5328 * Note that this is the same approach as used in
5329 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005330 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005331 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5332 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005333 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005334 else {
5335 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5336 (flags&F_ALT) ? "#" : "",
5337 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 return usprintf(buf, fmt, x);
5340}
5341
5342static int
5343formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005344 size_t buflen,
5345 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005347 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005348 if (PyUnicode_Check(v)) {
5349 if (PyUnicode_GET_SIZE(v) != 1)
5350 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005354 else if (PyString_Check(v)) {
5355 if (PyString_GET_SIZE(v) != 1)
5356 goto onError;
5357 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
5360 else {
5361 /* Integer input truncated to a character */
5362 long x;
5363 x = PyInt_AsLong(v);
5364 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005365 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 buf[0] = (char) x;
5367 }
5368 buf[1] = '\0';
5369 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005370
5371 onError:
5372 PyErr_SetString(PyExc_TypeError,
5373 "%c requires int or char");
5374 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375}
5376
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
5386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387PyObject *PyUnicode_Format(PyObject *format,
5388 PyObject *args)
5389{
5390 Py_UNICODE *fmt, *res;
5391 int fmtcnt, rescnt, reslen, arglen, argidx;
5392 int args_owned = 0;
5393 PyUnicodeObject *result = NULL;
5394 PyObject *dict = NULL;
5395 PyObject *uformat;
5396
5397 if (format == NULL || args == NULL) {
5398 PyErr_BadInternalCall();
5399 return NULL;
5400 }
5401 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005402 if (uformat == NULL)
5403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 fmt = PyUnicode_AS_UNICODE(uformat);
5405 fmtcnt = PyUnicode_GET_SIZE(uformat);
5406
5407 reslen = rescnt = fmtcnt + 100;
5408 result = _PyUnicode_New(reslen);
5409 if (result == NULL)
5410 goto onError;
5411 res = PyUnicode_AS_UNICODE(result);
5412
5413 if (PyTuple_Check(args)) {
5414 arglen = PyTuple_Size(args);
5415 argidx = 0;
5416 }
5417 else {
5418 arglen = -1;
5419 argidx = -2;
5420 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005421 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 dict = args;
5423
5424 while (--fmtcnt >= 0) {
5425 if (*fmt != '%') {
5426 if (--rescnt < 0) {
5427 rescnt = fmtcnt + 100;
5428 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005429 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return NULL;
5431 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5432 --rescnt;
5433 }
5434 *res++ = *fmt++;
5435 }
5436 else {
5437 /* Got a format specifier */
5438 int flags = 0;
5439 int width = -1;
5440 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 Py_UNICODE c = '\0';
5442 Py_UNICODE fill;
5443 PyObject *v = NULL;
5444 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005445 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 Py_UNICODE sign;
5447 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005448 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
5450 fmt++;
5451 if (*fmt == '(') {
5452 Py_UNICODE *keystart;
5453 int keylen;
5454 PyObject *key;
5455 int pcount = 1;
5456
5457 if (dict == NULL) {
5458 PyErr_SetString(PyExc_TypeError,
5459 "format requires a mapping");
5460 goto onError;
5461 }
5462 ++fmt;
5463 --fmtcnt;
5464 keystart = fmt;
5465 /* Skip over balanced parentheses */
5466 while (pcount > 0 && --fmtcnt >= 0) {
5467 if (*fmt == ')')
5468 --pcount;
5469 else if (*fmt == '(')
5470 ++pcount;
5471 fmt++;
5472 }
5473 keylen = fmt - keystart - 1;
5474 if (fmtcnt < 0 || pcount > 0) {
5475 PyErr_SetString(PyExc_ValueError,
5476 "incomplete format key");
5477 goto onError;
5478 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005479#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005480 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 then looked up since Python uses strings to hold
5482 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005483 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 key = PyUnicode_EncodeUTF8(keystart,
5485 keylen,
5486 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005487#else
5488 key = PyUnicode_FromUnicode(keystart, keylen);
5489#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 if (key == NULL)
5491 goto onError;
5492 if (args_owned) {
5493 Py_DECREF(args);
5494 args_owned = 0;
5495 }
5496 args = PyObject_GetItem(dict, key);
5497 Py_DECREF(key);
5498 if (args == NULL) {
5499 goto onError;
5500 }
5501 args_owned = 1;
5502 arglen = -1;
5503 argidx = -2;
5504 }
5505 while (--fmtcnt >= 0) {
5506 switch (c = *fmt++) {
5507 case '-': flags |= F_LJUST; continue;
5508 case '+': flags |= F_SIGN; continue;
5509 case ' ': flags |= F_BLANK; continue;
5510 case '#': flags |= F_ALT; continue;
5511 case '0': flags |= F_ZERO; continue;
5512 }
5513 break;
5514 }
5515 if (c == '*') {
5516 v = getnextarg(args, arglen, &argidx);
5517 if (v == NULL)
5518 goto onError;
5519 if (!PyInt_Check(v)) {
5520 PyErr_SetString(PyExc_TypeError,
5521 "* wants int");
5522 goto onError;
5523 }
5524 width = PyInt_AsLong(v);
5525 if (width < 0) {
5526 flags |= F_LJUST;
5527 width = -width;
5528 }
5529 if (--fmtcnt >= 0)
5530 c = *fmt++;
5531 }
5532 else if (c >= '0' && c <= '9') {
5533 width = c - '0';
5534 while (--fmtcnt >= 0) {
5535 c = *fmt++;
5536 if (c < '0' || c > '9')
5537 break;
5538 if ((width*10) / 10 != width) {
5539 PyErr_SetString(PyExc_ValueError,
5540 "width too big");
5541 goto onError;
5542 }
5543 width = width*10 + (c - '0');
5544 }
5545 }
5546 if (c == '.') {
5547 prec = 0;
5548 if (--fmtcnt >= 0)
5549 c = *fmt++;
5550 if (c == '*') {
5551 v = getnextarg(args, arglen, &argidx);
5552 if (v == NULL)
5553 goto onError;
5554 if (!PyInt_Check(v)) {
5555 PyErr_SetString(PyExc_TypeError,
5556 "* wants int");
5557 goto onError;
5558 }
5559 prec = PyInt_AsLong(v);
5560 if (prec < 0)
5561 prec = 0;
5562 if (--fmtcnt >= 0)
5563 c = *fmt++;
5564 }
5565 else if (c >= '0' && c <= '9') {
5566 prec = c - '0';
5567 while (--fmtcnt >= 0) {
5568 c = Py_CHARMASK(*fmt++);
5569 if (c < '0' || c > '9')
5570 break;
5571 if ((prec*10) / 10 != prec) {
5572 PyErr_SetString(PyExc_ValueError,
5573 "prec too big");
5574 goto onError;
5575 }
5576 prec = prec*10 + (c - '0');
5577 }
5578 }
5579 } /* prec */
5580 if (fmtcnt >= 0) {
5581 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 if (--fmtcnt >= 0)
5583 c = *fmt++;
5584 }
5585 }
5586 if (fmtcnt < 0) {
5587 PyErr_SetString(PyExc_ValueError,
5588 "incomplete format");
5589 goto onError;
5590 }
5591 if (c != '%') {
5592 v = getnextarg(args, arglen, &argidx);
5593 if (v == NULL)
5594 goto onError;
5595 }
5596 sign = 0;
5597 fill = ' ';
5598 switch (c) {
5599
5600 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005601 pbuf = formatbuf;
5602 /* presume that buffer length is at least 1 */
5603 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 len = 1;
5605 break;
5606
5607 case 's':
5608 case 'r':
5609 if (PyUnicode_Check(v) && c == 's') {
5610 temp = v;
5611 Py_INCREF(temp);
5612 }
5613 else {
5614 PyObject *unicode;
5615 if (c == 's')
5616 temp = PyObject_Str(v);
5617 else
5618 temp = PyObject_Repr(v);
5619 if (temp == NULL)
5620 goto onError;
5621 if (!PyString_Check(temp)) {
5622 /* XXX Note: this should never happen, since
5623 PyObject_Repr() and PyObject_Str() assure
5624 this */
5625 Py_DECREF(temp);
5626 PyErr_SetString(PyExc_TypeError,
5627 "%s argument has non-string str()");
5628 goto onError;
5629 }
Fred Drakee4315f52000-05-09 19:53:39 +00005630 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005632 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 "strict");
5634 Py_DECREF(temp);
5635 temp = unicode;
5636 if (temp == NULL)
5637 goto onError;
5638 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005639 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 len = PyUnicode_GET_SIZE(temp);
5641 if (prec >= 0 && len > prec)
5642 len = prec;
5643 break;
5644
5645 case 'i':
5646 case 'd':
5647 case 'u':
5648 case 'o':
5649 case 'x':
5650 case 'X':
5651 if (c == 'i')
5652 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005653 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005654 temp = formatlong(v, flags, prec, c);
5655 if (!temp)
5656 goto onError;
5657 pbuf = PyUnicode_AS_UNICODE(temp);
5658 len = PyUnicode_GET_SIZE(temp);
5659 /* unbounded ints can always produce
5660 a sign character! */
5661 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005663 else {
5664 pbuf = formatbuf;
5665 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5666 flags, prec, c, v);
5667 if (len < 0)
5668 goto onError;
5669 /* only d conversion is signed */
5670 sign = c == 'd';
5671 }
5672 if (flags & F_ZERO)
5673 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 break;
5675
5676 case 'e':
5677 case 'E':
5678 case 'f':
5679 case 'g':
5680 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005681 pbuf = formatbuf;
5682 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5683 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 if (len < 0)
5685 goto onError;
5686 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005687 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 fill = '0';
5689 break;
5690
5691 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005692 pbuf = formatbuf;
5693 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 if (len < 0)
5695 goto onError;
5696 break;
5697
5698 default:
5699 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005700 "unsupported format character '%c' (0x%x) "
5701 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005702 (31<=c && c<=126) ? c : '?',
5703 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 goto onError;
5705 }
5706 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005707 if (*pbuf == '-' || *pbuf == '+') {
5708 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 len--;
5710 }
5711 else if (flags & F_SIGN)
5712 sign = '+';
5713 else if (flags & F_BLANK)
5714 sign = ' ';
5715 else
5716 sign = 0;
5717 }
5718 if (width < len)
5719 width = len;
5720 if (rescnt < width + (sign != 0)) {
5721 reslen -= rescnt;
5722 rescnt = width + fmtcnt + 100;
5723 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005724 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 return NULL;
5726 res = PyUnicode_AS_UNICODE(result)
5727 + reslen - rescnt;
5728 }
5729 if (sign) {
5730 if (fill != ' ')
5731 *res++ = sign;
5732 rescnt--;
5733 if (width > len)
5734 width--;
5735 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005736 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5737 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005738 assert(pbuf[1] == c);
5739 if (fill != ' ') {
5740 *res++ = *pbuf++;
5741 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005742 }
Tim Petersfff53252001-04-12 18:38:48 +00005743 rescnt -= 2;
5744 width -= 2;
5745 if (width < 0)
5746 width = 0;
5747 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 if (width > len && !(flags & F_LJUST)) {
5750 do {
5751 --rescnt;
5752 *res++ = fill;
5753 } while (--width > len);
5754 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005755 if (fill == ' ') {
5756 if (sign)
5757 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005758 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005759 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005760 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005761 *res++ = *pbuf++;
5762 *res++ = *pbuf++;
5763 }
5764 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005765 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 res += len;
5767 rescnt -= len;
5768 while (--width >= len) {
5769 --rescnt;
5770 *res++ = ' ';
5771 }
5772 if (dict && (argidx < arglen) && c != '%') {
5773 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005774 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 goto onError;
5776 }
5777 Py_XDECREF(temp);
5778 } /* '%' */
5779 } /* until end */
5780 if (argidx < arglen && !dict) {
5781 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005782 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 goto onError;
5784 }
5785
5786 if (args_owned) {
5787 Py_DECREF(args);
5788 }
5789 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005790 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005791 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 return (PyObject *)result;
5793
5794 onError:
5795 Py_XDECREF(result);
5796 Py_DECREF(uformat);
5797 if (args_owned) {
5798 Py_DECREF(args);
5799 }
5800 return NULL;
5801}
5802
5803static PyBufferProcs unicode_as_buffer = {
5804 (getreadbufferproc) unicode_buffer_getreadbuf,
5805 (getwritebufferproc) unicode_buffer_getwritebuf,
5806 (getsegcountproc) unicode_buffer_getsegcount,
5807 (getcharbufferproc) unicode_buffer_getcharbuf,
5808};
5809
Jeremy Hylton938ace62002-07-17 16:30:39 +00005810static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00005811unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5812
Tim Peters6d6c1a32001-08-02 04:15:00 +00005813static PyObject *
5814unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5815{
5816 PyObject *x = NULL;
5817 static char *kwlist[] = {"string", "encoding", "errors", 0};
5818 char *encoding = NULL;
5819 char *errors = NULL;
5820
Guido van Rossume023fe02001-08-30 03:12:59 +00005821 if (type != &PyUnicode_Type)
5822 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005823 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5824 kwlist, &x, &encoding, &errors))
5825 return NULL;
5826 if (x == NULL)
5827 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005828 if (encoding == NULL && errors == NULL)
5829 return PyObject_Unicode(x);
5830 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005831 return PyUnicode_FromEncodedObject(x, encoding, errors);
5832}
5833
Guido van Rossume023fe02001-08-30 03:12:59 +00005834static PyObject *
5835unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5836{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005837 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005838 int n;
5839
5840 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5841 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5842 if (tmp == NULL)
5843 return NULL;
5844 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005845 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5846 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005847 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005848 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5849 if (pnew->str == NULL) {
5850 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005851 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005852 return NULL;
5853 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005854 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5855 pnew->length = n;
5856 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005857 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005858 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005859}
5860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005861PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00005862"unicode(string [, encoding[, errors]]) -> object\n\
5863\n\
5864Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00005865encoding defaults to the current default string encoding.\n\
5866errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00005867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868PyTypeObject PyUnicode_Type = {
5869 PyObject_HEAD_INIT(&PyType_Type)
5870 0, /* ob_size */
5871 "unicode", /* tp_name */
5872 sizeof(PyUnicodeObject), /* tp_size */
5873 0, /* tp_itemsize */
5874 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005875 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005877 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 0, /* tp_setattr */
5879 (cmpfunc) unicode_compare, /* tp_compare */
5880 (reprfunc) unicode_repr, /* tp_repr */
5881 0, /* tp_as_number */
5882 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005883 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 (hashfunc) unicode_hash, /* tp_hash*/
5885 0, /* tp_call*/
5886 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005887 PyObject_GenericGetAttr, /* tp_getattro */
5888 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005890 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005891 unicode_doc, /* tp_doc */
5892 0, /* tp_traverse */
5893 0, /* tp_clear */
5894 0, /* tp_richcompare */
5895 0, /* tp_weaklistoffset */
5896 0, /* tp_iter */
5897 0, /* tp_iternext */
5898 unicode_methods, /* tp_methods */
5899 0, /* tp_members */
5900 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00005901 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005902 0, /* tp_dict */
5903 0, /* tp_descr_get */
5904 0, /* tp_descr_set */
5905 0, /* tp_dictoffset */
5906 0, /* tp_init */
5907 0, /* tp_alloc */
5908 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005909 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910};
5911
5912/* Initialize the Unicode implementation */
5913
Thomas Wouters78890102000-07-22 19:25:51 +00005914void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005916 int i;
5917
Fred Drakee4315f52000-05-09 19:53:39 +00005918 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005919 unicode_freelist = NULL;
5920 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005922 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005923 for (i = 0; i < 256; i++)
5924 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00005925 if (PyType_Ready(&PyUnicode_Type) < 0)
5926 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927}
5928
5929/* Finalize the Unicode implementation */
5930
5931void
Thomas Wouters78890102000-07-22 19:25:51 +00005932_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005934 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005935 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005937 Py_XDECREF(unicode_empty);
5938 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005939
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005940 for (i = 0; i < 256; i++) {
5941 if (unicode_latin1[i]) {
5942 Py_DECREF(unicode_latin1[i]);
5943 unicode_latin1[i] = NULL;
5944 }
5945 }
5946
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005947 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 PyUnicodeObject *v = u;
5949 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005950 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005951 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005952 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005953 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005955 unicode_freelist = NULL;
5956 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957}