blob: 1abef89a3c3e2e2d6a5a351b88d0422e93df5180 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for objects parked on the free list.

   An object placed on the free list keeps its character buffer when
   its length is below this limit; longer buffers are released.  This
   reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a Unicode object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456
457 if (obj == NULL) {
458 PyErr_BadInternalCall();
459 return NULL;
460 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000462#if 0
463 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000464 that no encodings is given and then redirect to
465 PyObject_Unicode() which then applies the additional logic for
466 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000467
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000468 NOTE: This API should really only be used for object which
469 represent *encoded* Unicode !
470
471 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000472 if (PyUnicode_Check(obj)) {
473 if (encoding) {
474 PyErr_SetString(PyExc_TypeError,
475 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000479 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480#else
481 if (PyUnicode_Check(obj)) {
482 PyErr_SetString(PyExc_TypeError,
483 "decoding Unicode is not supported");
484 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000485 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000486#endif
487
488 /* Coerce object */
489 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 s = PyString_AS_STRING(obj);
491 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
494 /* Overwrite the error message with something more useful in
495 case of a TypeError. */
496 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000497 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000498 "coercing to Unicode: need string or buffer, "
499 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000500 obj->ob_type->tp_name);
501 goto onError;
502 }
503
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000504 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 if (len == 0) {
506 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000507 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 else
510 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000511
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 return v;
513
514 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516}
517
518PyObject *PyUnicode_Decode(const char *s,
519 int size,
520 const char *encoding,
521 const char *errors)
522{
523 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000524
525 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000526 encoding = PyUnicode_GetDefaultEncoding();
527
528 /* Shortcuts for common default encodings */
529 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000531 else if (strcmp(encoding, "latin-1") == 0)
532 return PyUnicode_DecodeLatin1(s, size, errors);
533 else if (strcmp(encoding, "ascii") == 0)
534 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535
536 /* Decode via the codec registry */
537 buffer = PyBuffer_FromMemory((void *)s, size);
538 if (buffer == NULL)
539 goto onError;
540 unicode = PyCodec_Decode(buffer, encoding, errors);
541 if (unicode == NULL)
542 goto onError;
543 if (!PyUnicode_Check(unicode)) {
544 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000545 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 unicode->ob_type->tp_name);
547 Py_DECREF(unicode);
548 goto onError;
549 }
550 Py_DECREF(buffer);
551 return unicode;
552
553 onError:
554 Py_XDECREF(buffer);
555 return NULL;
556}
557
558PyObject *PyUnicode_Encode(const Py_UNICODE *s,
559 int size,
560 const char *encoding,
561 const char *errors)
562{
563 PyObject *v, *unicode;
564
565 unicode = PyUnicode_FromUnicode(s, size);
566 if (unicode == NULL)
567 return NULL;
568 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
569 Py_DECREF(unicode);
570 return v;
571}
572
573PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
574 const char *encoding,
575 const char *errors)
576{
577 PyObject *v;
578
579 if (!PyUnicode_Check(unicode)) {
580 PyErr_BadArgument();
581 goto onError;
582 }
Fred Drakee4315f52000-05-09 19:53:39 +0000583
584 if (encoding == NULL)
585 encoding = PyUnicode_GetDefaultEncoding();
586
587 /* Shortcuts for common default encodings */
588 if (errors == NULL) {
589 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000590 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000591 else if (strcmp(encoding, "latin-1") == 0)
592 return PyUnicode_AsLatin1String(unicode);
593 else if (strcmp(encoding, "ascii") == 0)
594 return PyUnicode_AsASCIIString(unicode);
595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Encode via the codec registry */
598 v = PyCodec_Encode(unicode, encoding, errors);
599 if (v == NULL)
600 goto onError;
601 /* XXX Should we really enforce this ? */
602 if (!PyString_Check(v)) {
603 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000604 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 v->ob_type->tp_name);
606 Py_DECREF(v);
607 goto onError;
608 }
609 return v;
610
611 onError:
612 return NULL;
613}
614
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000615PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
616 const char *errors)
617{
618 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
619
620 if (v)
621 return v;
622 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
623 if (v && errors == NULL)
624 ((PyUnicodeObject *)unicode)->defenc = v;
625 return v;
626}
627
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
629{
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634 return PyUnicode_AS_UNICODE(unicode);
635
636 onError:
637 return NULL;
638}
639
640int PyUnicode_GetSize(PyObject *unicode)
641{
642 if (!PyUnicode_Check(unicode)) {
643 PyErr_BadArgument();
644 goto onError;
645 }
646 return PyUnicode_GET_SIZE(unicode);
647
648 onError:
649 return -1;
650}
651
Thomas Wouters78890102000-07-22 19:25:51 +0000652const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000653{
654 return unicode_default_encoding;
655}
656
657int PyUnicode_SetDefaultEncoding(const char *encoding)
658{
659 PyObject *v;
660
661 /* Make sure the encoding is valid. As side effect, this also
662 loads the encoding into the codec registry cache. */
663 v = _PyCodec_Lookup(encoding);
664 if (v == NULL)
665 goto onError;
666 Py_DECREF(v);
667 strncpy(unicode_default_encoding,
668 encoding,
669 sizeof(unicode_default_encoding));
670 return 0;
671
672 onError:
673 return -1;
674}
675
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676/* error handling callback helper:
677 build arguments, call the callback and check the arguments,
678 if no exception occured, copy the replacement to the output
679 and adjust various state variables.
680 return 0 on success, -1 on error
681*/
682
683static
684int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
685 const char *encoding, const char *reason,
686 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
687 PyObject **output, int *outpos, Py_UNICODE **outptr)
688{
689 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
690
691 PyObject *restuple = NULL;
692 PyObject *repunicode = NULL;
693 int outsize = PyUnicode_GET_SIZE(*output);
694 int requiredsize;
695 int newpos;
696 Py_UNICODE *repptr;
697 int repsize;
698 int res = -1;
699
700 if (*errorHandler == NULL) {
701 *errorHandler = PyCodec_LookupError(errors);
702 if (*errorHandler == NULL)
703 goto onError;
704 }
705
706 if (*exceptionObject == NULL) {
707 *exceptionObject = PyUnicodeDecodeError_Create(
708 encoding, input, insize, *startinpos, *endinpos, reason);
709 if (*exceptionObject == NULL)
710 goto onError;
711 }
712 else {
713 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
714 goto onError;
715 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
716 goto onError;
717 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
718 goto onError;
719 }
720
721 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
722 if (restuple == NULL)
723 goto onError;
724 if (!PyTuple_Check(restuple)) {
725 PyErr_Format(PyExc_TypeError, &argparse[4]);
726 goto onError;
727 }
728 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
729 goto onError;
730 if (newpos<0)
731 newpos = 0;
732 else if (newpos>insize)
733 newpos = insize;
734
735 /* need more space? (at least enough for what we
736 have+the replacement+the rest of the string (starting
737 at the new input position), so we won't have to check space
738 when there are no errors in the rest of the string) */
739 repptr = PyUnicode_AS_UNICODE(repunicode);
740 repsize = PyUnicode_GET_SIZE(repunicode);
741 requiredsize = *outpos + repsize + insize-newpos;
742 if (requiredsize > outsize) {
743 if (requiredsize<2*outsize)
744 requiredsize = 2*outsize;
745 if (PyUnicode_Resize(output, requiredsize))
746 goto onError;
747 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
748 }
749 *endinpos = newpos;
750 *inptr = input + newpos;
751 Py_UNICODE_COPY(*outptr, repptr, repsize);
752 *outptr += repsize;
753 *outpos += repsize;
754 /* we made it! */
755 res = 0;
756
757 onError:
758 Py_XDECREF(restuple);
759 return res;
760}
761
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000762/* --- UTF-7 Codec -------------------------------------------------------- */
763
764/* see RFC2152 for details */
765
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True if character c has to be encoded in a shift sequence: always
   for values above 127 and table class 1, and for whitespace (class 2)
   or Set O (class 3) characters when the respective flag is set.
   The c > 127 test comes first so that the table is never indexed
   beyond its end; c is expected to be non-negative. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  Explicit ASCII range tests are used
   instead of isalnum(): the argument may be a Py_UNICODE value outside
   the range isalnum() accepts, and isalnum() is locale dependent and
   could accept characters that UB64() cannot map. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6 bit value; only meaningful for
   characters accepted by B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
794
/* Emit one base64 digit to `out` for every complete group of 6 bits
   pending in `ch`, most significant group first, decrementing `bits`
   accordingly.  Fewer than 6 bits are left pending afterwards. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract every complete 16 bit unit pending in `ch` and append it to
   `out`, decrementing `bits` accordingly.  A unit in the range
   0xDC00..0xDFFF is not stored: `surrogate` is set, `errmsg` is
   assigned and control jumps to the `utf7Error` label, both of which
   must exist at the point of use.  The unit following such an error is
   dropped silently and clears `surrogate` again. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE unit = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate */ \
            /* so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= unit && unit <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent */ \
            /* it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = unit; \
        } \
    }
819
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000820PyObject *PyUnicode_DecodeUTF7(const char *s,
821 int size,
822 const char *errors)
823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000824 const char *starts = s;
825 int startinpos;
826 int endinpos;
827 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000828 const char *e;
829 PyUnicodeObject *unicode;
830 Py_UNICODE *p;
831 const char *errmsg = "";
832 int inShift = 0;
833 unsigned int bitsleft = 0;
834 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000835 int surrogate = 0;
836 PyObject *errorHandler = NULL;
837 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000838
839 unicode = _PyUnicode_New(size);
840 if (!unicode)
841 return NULL;
842 if (size == 0)
843 return (PyObject *)unicode;
844
845 p = unicode->str;
846 e = s + size;
847
848 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849 Py_UNICODE ch;
850 restart:
851 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000852
853 if (inShift) {
854 if ((ch == '-') || !B64CHAR(ch)) {
855 inShift = 0;
856 s++;
857
858 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
859 if (bitsleft >= 6) {
860 /* The shift sequence has a partial character in it. If
861 bitsleft < 6 then we could just classify it as padding
862 but that is not the case here */
863
864 errmsg = "partial character in shift sequence";
865 goto utf7Error;
866 }
867 /* According to RFC2152 the remaining bits should be zero. We
868 choose to signal an error/insert a replacement character
869 here so indicate the potential of a misencoded character. */
870
871 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
872 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
873 errmsg = "non-zero padding bits in shift sequence";
874 goto utf7Error;
875 }
876
877 if (ch == '-') {
878 if ((s < e) && (*(s) == '-')) {
879 *p++ = '-';
880 inShift = 1;
881 }
882 } else if (SPECIAL(ch,0,0)) {
883 errmsg = "unexpected special character";
884 goto utf7Error;
885 } else {
886 *p++ = ch;
887 }
888 } else {
889 charsleft = (charsleft << 6) | UB64(ch);
890 bitsleft += 6;
891 s++;
892 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
893 }
894 }
895 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000896 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000897 s++;
898 if (s < e && *s == '-') {
899 s++;
900 *p++ = '+';
901 } else
902 {
903 inShift = 1;
904 bitsleft = 0;
905 }
906 }
907 else if (SPECIAL(ch,0,0)) {
908 errmsg = "unexpected special character";
909 s++;
910 goto utf7Error;
911 }
912 else {
913 *p++ = ch;
914 s++;
915 }
916 continue;
917 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 outpos = p-PyUnicode_AS_UNICODE(unicode);
919 endinpos = s-starts;
920 if (unicode_decode_call_errorhandler(
921 errors, &errorHandler,
922 "utf7", errmsg,
923 starts, size, &startinpos, &endinpos, &exc, &s,
924 (PyObject **)&unicode, &outpos, &p))
925 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000926 }
927
928 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 outpos = p-PyUnicode_AS_UNICODE(unicode);
930 endinpos = size;
931 if (unicode_decode_call_errorhandler(
932 errors, &errorHandler,
933 "utf7", "unterminated shift sequence",
934 starts, size, &startinpos, &endinpos, &exc, &s,
935 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000936 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000937 if (s < e)
938 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000939 }
940
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942 goto onError;
943
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000944 Py_XDECREF(errorHandler);
945 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 return (PyObject *)unicode;
947
948onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000949 Py_XDECREF(errorHandler);
950 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 Py_DECREF(unicode);
952 return NULL;
953}
954
955
956PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
957 int size,
958 int encodeSetO,
959 int encodeWhiteSpace,
960 const char *errors)
961{
962 PyObject *v;
963 /* It might be possible to tighten this worst case */
964 unsigned int cbAllocated = 5 * size;
965 int inShift = 0;
966 int i = 0;
967 unsigned int bitsleft = 0;
968 unsigned long charsleft = 0;
969 char * out;
970 char * start;
971
972 if (size == 0)
973 return PyString_FromStringAndSize(NULL, 0);
974
975 v = PyString_FromStringAndSize(NULL, cbAllocated);
976 if (v == NULL)
977 return NULL;
978
979 start = out = PyString_AS_STRING(v);
980 for (;i < size; ++i) {
981 Py_UNICODE ch = s[i];
982
983 if (!inShift) {
984 if (ch == '+') {
985 *out++ = '+';
986 *out++ = '-';
987 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
988 charsleft = ch;
989 bitsleft = 16;
990 *out++ = '+';
991 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
992 inShift = bitsleft > 0;
993 } else {
994 *out++ = (char) ch;
995 }
996 } else {
997 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
998 *out++ = B64(charsleft << (6-bitsleft));
999 charsleft = 0;
1000 bitsleft = 0;
1001 /* Characters not in the BASE64 set implicitly unshift the sequence
1002 so no '-' is required, except if the character is itself a '-' */
1003 if (B64CHAR(ch) || ch == '-') {
1004 *out++ = '-';
1005 }
1006 inShift = 0;
1007 *out++ = (char) ch;
1008 } else {
1009 bitsleft += 16;
1010 charsleft = (charsleft << 16) | ch;
1011 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1012
1013 /* If the next character is special then we dont' need to terminate
1014 the shift sequence. If the next character is not a BASE64 character
1015 or '-' then the shift sequence will be terminated implicitly and we
1016 don't have to insert a '-'. */
1017
1018 if (bitsleft == 0) {
1019 if (i + 1 < size) {
1020 Py_UNICODE ch2 = s[i+1];
1021
1022 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1023
1024 } else if (B64CHAR(ch2) || ch2 == '-') {
1025 *out++ = '-';
1026 inShift = 0;
1027 } else {
1028 inShift = 0;
1029 }
1030
1031 }
1032 else {
1033 *out++ = '-';
1034 inShift = 0;
1035 }
1036 }
1037 }
1038 }
1039 }
1040 if (bitsleft) {
1041 *out++= B64(charsleft << (6-bitsleft) );
1042 *out++ = '-';
1043 }
1044
Tim Peters5de98422002-04-27 18:44:32 +00001045 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 return v;
1047}
1048
/* The helper macros used by the UTF-7 encoder and decoder above are
   undefined here so that their names are free again for the rest of
   the file. */
1049#undef SPECIAL
1050#undef B64
1051#undef B64CHAR
1052#undef UB64
1053#undef ENCODE
1054#undef DECODE
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* --- UTF-8 Codec -------------------------------------------------------- */
1057
/* Length of the UTF-8 sequence introduced by each possible lead byte,
   indexed by the byte value.  A zero entry marks a byte that is not a
   legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4,4,4,4,4,4,4,4, 5,5,5,5,6,6,0,0
};
1079
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080PyObject *PyUnicode_DecodeUTF8(const char *s,
1081 int size,
1082 const char *errors)
1083{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001084 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001086 int startinpos;
1087 int endinpos;
1088 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 const char *e;
1090 PyUnicodeObject *unicode;
1091 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001092 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 PyObject *errorHandler = NULL;
1094 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095
1096 /* Note: size will always be longer than the resulting Unicode
1097 character count */
1098 unicode = _PyUnicode_New(size);
1099 if (!unicode)
1100 return NULL;
1101 if (size == 0)
1102 return (PyObject *)unicode;
1103
1104 /* Unpack UTF-8 encoded data */
1105 p = unicode->str;
1106 e = s + size;
1107
1108 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001109 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110
1111 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001112 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 s++;
1114 continue;
1115 }
1116
1117 n = utf8_code_length[ch];
1118
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 if (s + n > e) {
1120 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001121 startinpos = s-starts;
1122 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 goto utf8Error;
1124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125
1126 switch (n) {
1127
1128 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001129 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001130 startinpos = s-starts;
1131 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001132 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001135 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001136 startinpos = s-starts;
1137 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001138 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139
1140 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001141 if ((s[1] & 0xc0) != 0x80) {
1142 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001143 startinpos = s-starts;
1144 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001145 goto utf8Error;
1146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001148 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001149 startinpos = s-starts;
1150 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001151 errmsg = "illegal encoding";
1152 goto utf8Error;
1153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 break;
1157
1158 case 3:
1159 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001160 (s[2] & 0xc0) != 0x80) {
1161 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 startinpos = s-starts;
1163 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 goto utf8Error;
1165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001167 if (ch < 0x0800) {
1168 /* Note: UTF-8 encodings of surrogates are considered
1169 legal UTF-8 sequences;
1170
1171 XXX For wide builds (UCS-4) we should probably try
1172 to recombine the surrogates into a single code
1173 unit.
1174 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001175 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001176 startinpos = s-starts;
1177 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 goto utf8Error;
1179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001181 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 break;
1183
1184 case 4:
1185 if ((s[1] & 0xc0) != 0x80 ||
1186 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 (s[3] & 0xc0) != 0x80) {
1188 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001189 startinpos = s-starts;
1190 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001191 goto utf8Error;
1192 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1194 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1195 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001196 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001197 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001198 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001199 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001200 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001202 startinpos = s-starts;
1203 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001204 goto utf8Error;
1205 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001206#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001207 *p++ = (Py_UNICODE)ch;
1208#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001209 /* compute and append the two surrogates: */
1210
1211 /* translate from 10000..10FFFF to 0..FFFF */
1212 ch -= 0x10000;
1213
1214 /* high surrogate = top 10 bits added to D800 */
1215 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1216
1217 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001218 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 break;
1221
1222 default:
1223 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 }
1229 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 continue;
1231
1232 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 outpos = p-PyUnicode_AS_UNICODE(unicode);
1234 if (unicode_decode_call_errorhandler(
1235 errors, &errorHandler,
1236 "utf8", errmsg,
1237 starts, size, &startinpos, &endinpos, &exc, &s,
1238 (PyObject **)&unicode, &outpos, &p))
1239 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
1242 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001243 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 goto onError;
1245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 Py_XDECREF(errorHandler);
1247 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return (PyObject *)unicode;
1249
1250onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001251 Py_XDECREF(errorHandler);
1252 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 Py_DECREF(unicode);
1254 return NULL;
1255}
1256
Tim Peters602f7402002-04-27 18:03:26 +00001257/* Allocation strategy: if the string is short, convert into a stack buffer
1258 and allocate exactly as much space needed at the end. Else allocate the
1259 maximum possible needed (4 result bytes per Unicode character), and return
1260 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001261*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001262PyObject *
1263PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1264 int size,
1265 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266{
Tim Peters602f7402002-04-27 18:03:26 +00001267#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001268
Tim Peters602f7402002-04-27 18:03:26 +00001269 int i; /* index into s of next input byte */
1270 PyObject *v; /* result string object */
1271 char *p; /* next free byte in output buffer */
1272 int nallocated; /* number of result bytes allocated */
1273 int nneeded; /* number of result bytes needed */
1274 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001275
Tim Peters602f7402002-04-27 18:03:26 +00001276 assert(s != NULL);
1277 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278
Tim Peters602f7402002-04-27 18:03:26 +00001279 if (size <= MAX_SHORT_UNICHARS) {
1280 /* Write into the stack buffer; nallocated can't overflow.
1281 * At the end, we'll allocate exactly as much heap space as it
1282 * turns out we need.
1283 */
1284 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1285 v = NULL; /* will allocate after we're done */
1286 p = stackbuf;
1287 }
1288 else {
1289 /* Overallocate on the heap, and give the excess back at the end. */
1290 nallocated = size * 4;
1291 if (nallocated / 4 != size) /* overflow! */
1292 return PyErr_NoMemory();
1293 v = PyString_FromStringAndSize(NULL, nallocated);
1294 if (v == NULL)
1295 return NULL;
1296 p = PyString_AS_STRING(v);
1297 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001300 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001301
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001302 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001303 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001305
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001307 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001308 *p++ = (char)(0xc0 | (ch >> 6));
1309 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001310 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001311 else {
Tim Peters602f7402002-04-27 18:03:26 +00001312 /* Encode UCS2 Unicode ordinals */
1313 if (ch < 0x10000) {
1314 /* Special case: check for high surrogate */
1315 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1316 Py_UCS4 ch2 = s[i];
1317 /* Check for low surrogate and combine the two to
1318 form a UCS4 value */
1319 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001320 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001321 i++;
1322 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 }
Tim Peters602f7402002-04-27 18:03:26 +00001324 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001325 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001326 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001327 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1328 *p++ = (char)(0x80 | (ch & 0x3f));
1329 continue;
1330 }
1331encodeUCS4:
1332 /* Encode UCS4 Unicode ordinals */
1333 *p++ = (char)(0xf0 | (ch >> 18));
1334 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1335 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1336 *p++ = (char)(0x80 | (ch & 0x3f));
1337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001339
Tim Peters602f7402002-04-27 18:03:26 +00001340 if (v == NULL) {
1341 /* This was stack allocated. */
1342 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1343 assert(nneeded <= nallocated);
1344 v = PyString_FromStringAndSize(stackbuf, nneeded);
1345 }
1346 else {
1347 /* Cut back to size actually needed. */
1348 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1349 assert(nneeded <= nallocated);
1350 _PyString_Resize(&v, nneeded);
1351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355}
1356
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1358{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 return NULL;
1362 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001363 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1364 PyUnicode_GET_SIZE(unicode),
1365 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366}
1367
1368/* --- UTF-16 Codec ------------------------------------------------------- */
1369
Tim Peters772747b2001-08-09 22:21:55 +00001370PyObject *
1371PyUnicode_DecodeUTF16(const char *s,
1372 int size,
1373 const char *errors,
1374 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001376 const char *starts = s;
1377 int startinpos;
1378 int endinpos;
1379 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 PyUnicodeObject *unicode;
1381 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001382 const unsigned char *q, *e;
1383 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001384 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001385 /* Offsets from q for retrieving byte pairs in the right order. */
1386#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1387 int ihi = 1, ilo = 0;
1388#else
1389 int ihi = 0, ilo = 1;
1390#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391 PyObject *errorHandler = NULL;
1392 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393
1394 /* Note: size will always be longer than the resulting Unicode
1395 character count */
1396 unicode = _PyUnicode_New(size);
1397 if (!unicode)
1398 return NULL;
1399 if (size == 0)
1400 return (PyObject *)unicode;
1401
1402 /* Unpack UTF-16 encoded data */
1403 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001404 q = (unsigned char *)s;
1405 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406
1407 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001408 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001410 /* Check for BOM marks (U+FEFF) in the input and adjust current
1411 byte order setting accordingly. In native mode, the leading BOM
1412 mark is skipped, in all other modes, it is copied to the output
1413 stream as-is (giving a ZWNBSP character). */
1414 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001415 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001416#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001417 if (bom == 0xFEFF) {
1418 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001419 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001420 }
1421 else if (bom == 0xFFFE) {
1422 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001423 bo = 1;
1424 }
1425#else
Tim Peters772747b2001-08-09 22:21:55 +00001426 if (bom == 0xFEFF) {
1427 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001428 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001429 }
1430 else if (bom == 0xFFFE) {
1431 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001432 bo = -1;
1433 }
1434#endif
1435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436
Tim Peters772747b2001-08-09 22:21:55 +00001437 if (bo == -1) {
1438 /* force LE */
1439 ihi = 1;
1440 ilo = 0;
1441 }
1442 else if (bo == 1) {
1443 /* force BE */
1444 ihi = 0;
1445 ilo = 1;
1446 }
1447
1448 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_UNICODE ch;
1450 /* remaing bytes at the end? (size should be even) */
1451 if (e-q<2) {
1452 errmsg = "truncated data";
1453 startinpos = ((const char *)q)-starts;
1454 endinpos = ((const char *)e)-starts;
1455 goto utf16Error;
1456 /* The remaining input chars are ignored if the callback
1457 chooses to skip the input */
1458 }
1459 ch = (q[ihi] << 8) | q[ilo];
1460
Tim Peters772747b2001-08-09 22:21:55 +00001461 q += 2;
1462
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 if (ch < 0xD800 || ch > 0xDFFF) {
1464 *p++ = ch;
1465 continue;
1466 }
1467
1468 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001469 if (q >= e) {
1470 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001471 startinpos = (((const char *)q)-2)-starts;
1472 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001473 goto utf16Error;
1474 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001475 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001476 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1477 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001478 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001479#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001480 *p++ = ch;
1481 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001482#else
1483 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001484#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001485 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001486 }
1487 else {
1488 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 startinpos = (((const char *)q)-4)-starts;
1490 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001491 goto utf16Error;
1492 }
1493
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001495 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 startinpos = (((const char *)q)-2)-starts;
1497 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001498 /* Fall through to report the error */
1499
1500 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 outpos = p-PyUnicode_AS_UNICODE(unicode);
1502 if (unicode_decode_call_errorhandler(
1503 errors, &errorHandler,
1504 "utf16", errmsg,
1505 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1506 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508 }
1509
1510 if (byteorder)
1511 *byteorder = bo;
1512
1513 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001514 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 goto onError;
1516
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 Py_XDECREF(errorHandler);
1518 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 return (PyObject *)unicode;
1520
1521onError:
1522 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001523 Py_XDECREF(errorHandler);
1524 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 return NULL;
1526}
1527
Tim Peters772747b2001-08-09 22:21:55 +00001528PyObject *
1529PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1530 int size,
1531 const char *errors,
1532 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533{
1534 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001535 unsigned char *p;
1536 int i, pairs;
1537 /* Offsets from p for storing byte pairs in the right order. */
1538#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1539 int ihi = 1, ilo = 0;
1540#else
1541 int ihi = 0, ilo = 1;
1542#endif
1543
1544#define STORECHAR(CH) \
1545 do { \
1546 p[ihi] = ((CH) >> 8) & 0xff; \
1547 p[ilo] = (CH) & 0xff; \
1548 p += 2; \
1549 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001550
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001551 for (i = pairs = 0; i < size; i++)
1552 if (s[i] >= 0x10000)
1553 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001555 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556 if (v == NULL)
1557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Tim Peters772747b2001-08-09 22:21:55 +00001559 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001561 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001562 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001563 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001564
1565 if (byteorder == -1) {
1566 /* force LE */
1567 ihi = 1;
1568 ilo = 0;
1569 }
1570 else if (byteorder == 1) {
1571 /* force BE */
1572 ihi = 0;
1573 ilo = 1;
1574 }
1575
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001576 while (size-- > 0) {
1577 Py_UNICODE ch = *s++;
1578 Py_UNICODE ch2 = 0;
1579 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001580 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1581 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582 }
Tim Peters772747b2001-08-09 22:21:55 +00001583 STORECHAR(ch);
1584 if (ch2)
1585 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001588#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589}
1590
1591PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1592{
1593 if (!PyUnicode_Check(unicode)) {
1594 PyErr_BadArgument();
1595 return NULL;
1596 }
1597 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1598 PyUnicode_GET_SIZE(unicode),
1599 NULL,
1600 0);
1601}
1602
1603/* --- Unicode Escape Codec ----------------------------------------------- */
1604
/* File-scope pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the named-character
   escape handling in the decoder below -- confirm against the code that
   assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001605static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001606
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1608 int size,
1609 const char *errors)
1610{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001611 const char *starts = s;
1612 int startinpos;
1613 int endinpos;
1614 int outpos;
1615 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001619 char* message;
1620 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 PyObject *errorHandler = NULL;
1622 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001623
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 /* Escaped strings will always be longer than the resulting
1625 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 length after conversion to the true value.
1627 (but if the error callback returns a long replacement string
1628 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 v = _PyUnicode_New(size);
1630 if (v == NULL)
1631 goto onError;
1632 if (size == 0)
1633 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001637
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 while (s < end) {
1639 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001640 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001641 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642
1643 /* Non-escape characters are interpreted as Unicode ordinals */
1644 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001645 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 continue;
1647 }
1648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650 /* \ - Escapes */
1651 s++;
1652 switch (*s++) {
1653
1654 /* \x escapes */
1655 case '\n': break;
1656 case '\\': *p++ = '\\'; break;
1657 case '\'': *p++ = '\''; break;
1658 case '\"': *p++ = '\"'; break;
1659 case 'b': *p++ = '\b'; break;
1660 case 'f': *p++ = '\014'; break; /* FF */
1661 case 't': *p++ = '\t'; break;
1662 case 'n': *p++ = '\n'; break;
1663 case 'r': *p++ = '\r'; break;
1664 case 'v': *p++ = '\013'; break; /* VT */
1665 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1666
1667 /* \OOO (octal) escapes */
1668 case '0': case '1': case '2': case '3':
1669 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001670 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001672 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001674 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001676 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677 break;
1678
Fredrik Lundhccc74732001-02-18 22:13:49 +00001679 /* hex escapes */
1680 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 digits = 2;
1683 message = "truncated \\xXX escape";
1684 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001688 digits = 4;
1689 message = "truncated \\uXXXX escape";
1690 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691
Fredrik Lundhccc74732001-02-18 22:13:49 +00001692 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001693 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001694 digits = 8;
1695 message = "truncated \\UXXXXXXXX escape";
1696 hexescape:
1697 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 outpos = p-PyUnicode_AS_UNICODE(v);
1699 if (s+digits>end) {
1700 endinpos = size;
1701 if (unicode_decode_call_errorhandler(
1702 errors, &errorHandler,
1703 "unicodeescape", "end of string in escape sequence",
1704 starts, size, &startinpos, &endinpos, &exc, &s,
1705 (PyObject **)&v, &outpos, &p))
1706 goto onError;
1707 goto nextByte;
1708 }
1709 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001710 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001711 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 endinpos = (s+i+1)-starts;
1713 if (unicode_decode_call_errorhandler(
1714 errors, &errorHandler,
1715 "unicodeescape", message,
1716 starts, size, &startinpos, &endinpos, &exc, &s,
1717 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001718 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001720 }
1721 chr = (chr<<4) & ~0xF;
1722 if (c >= '0' && c <= '9')
1723 chr += c - '0';
1724 else if (c >= 'a' && c <= 'f')
1725 chr += 10 + c - 'a';
1726 else
1727 chr += 10 + c - 'A';
1728 }
1729 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001730 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001731 /* _decoding_error will have already written into the
1732 target buffer. */
1733 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001735 /* when we get here, chr is a 32-bit unicode character */
1736 if (chr <= 0xffff)
1737 /* UCS-2 character */
1738 *p++ = (Py_UNICODE) chr;
1739 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001740 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001741 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001742#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001743 *p++ = chr;
1744#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001745 chr -= 0x10000L;
1746 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001747 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001748#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001749 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 endinpos = s-starts;
1751 outpos = p-PyUnicode_AS_UNICODE(v);
1752 if (unicode_decode_call_errorhandler(
1753 errors, &errorHandler,
1754 "unicodeescape", "illegal Unicode character",
1755 starts, size, &startinpos, &endinpos, &exc, &s,
1756 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001757 goto onError;
1758 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001759 break;
1760
1761 /* \N{name} */
1762 case 'N':
1763 message = "malformed \\N character escape";
1764 if (ucnhash_CAPI == NULL) {
1765 /* load the unicode data module */
1766 PyObject *m, *v;
1767 m = PyImport_ImportModule("unicodedata");
1768 if (m == NULL)
1769 goto ucnhashError;
1770 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1771 Py_DECREF(m);
1772 if (v == NULL)
1773 goto ucnhashError;
1774 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1775 Py_DECREF(v);
1776 if (ucnhash_CAPI == NULL)
1777 goto ucnhashError;
1778 }
1779 if (*s == '{') {
1780 const char *start = s+1;
1781 /* look for the closing brace */
1782 while (*s != '}' && s < end)
1783 s++;
1784 if (s > start && s < end && *s == '}') {
1785 /* found a name. look it up in the unicode database */
1786 message = "unknown Unicode character name";
1787 s++;
1788 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1789 goto store;
1790 }
1791 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 endinpos = s-starts;
1793 outpos = p-PyUnicode_AS_UNICODE(v);
1794 if (unicode_decode_call_errorhandler(
1795 errors, &errorHandler,
1796 "unicodeescape", message,
1797 starts, size, &startinpos, &endinpos, &exc, &s,
1798 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800 break;
1801
1802 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001803 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 message = "\\ at end of string";
1805 s--;
1806 endinpos = s-starts;
1807 outpos = p-PyUnicode_AS_UNICODE(v);
1808 if (unicode_decode_call_errorhandler(
1809 errors, &errorHandler,
1810 "unicodeescape", message,
1811 starts, size, &startinpos, &endinpos, &exc, &s,
1812 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001813 goto onError;
1814 }
1815 else {
1816 *p++ = '\\';
1817 *p++ = (unsigned char)s[-1];
1818 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001819 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001820 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 nextByte:
1822 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001827
Fredrik Lundhccc74732001-02-18 22:13:49 +00001828ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001829 PyErr_SetString(
1830 PyExc_UnicodeError,
1831 "\\N escapes not supported (can't load unicodedata module)"
1832 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 Py_XDECREF(errorHandler);
1834 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001835 return NULL;
1836
Fredrik Lundhccc74732001-02-18 22:13:49 +00001837onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 Py_XDECREF(errorHandler);
1840 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 return NULL;
1842}
1843
1844/* Return a Unicode-Escape string version of the Unicode object.
1845
1846 If quotes is true, the string is enclosed in u"" or u'' quotes as
1847 appropriate.
1848
1849*/
1850
Barry Warsaw51ac5802000-03-20 16:36:48 +00001851static const Py_UNICODE *findchar(const Py_UNICODE *s,
1852 int size,
1853 Py_UNICODE ch);
1854
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855static
1856PyObject *unicodeescape_string(const Py_UNICODE *s,
1857 int size,
1858 int quotes)
1859{
1860 PyObject *repr;
1861 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001863 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864
1865 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1866 if (repr == NULL)
1867 return NULL;
1868
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001869 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 *p++ = 'u';
1873 *p++ = (findchar(s, size, '\'') &&
1874 !findchar(s, size, '"')) ? '"' : '\'';
1875 }
1876 while (size-- > 0) {
1877 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001878
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001880 if (quotes &&
1881 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 *p++ = '\\';
1883 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001884 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001886
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001887#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001888 /* Map 21-bit characters to '\U00xxxxxx' */
1889 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001890 int offset = p - PyString_AS_STRING(repr);
1891
1892 /* Resize the string if necessary */
1893 if (offset + 12 > PyString_GET_SIZE(repr)) {
1894 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001895 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001896 p = PyString_AS_STRING(repr) + offset;
1897 }
1898
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001899 *p++ = '\\';
1900 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001901 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1902 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1903 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1904 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1905 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1906 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1907 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001908 *p++ = hexdigit[ch & 0x0000000F];
1909 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001910 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001911#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001912 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1913 else if (ch >= 0xD800 && ch < 0xDC00) {
1914 Py_UNICODE ch2;
1915 Py_UCS4 ucs;
1916
1917 ch2 = *s++;
1918 size--;
1919 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1920 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1921 *p++ = '\\';
1922 *p++ = 'U';
1923 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1924 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1925 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1926 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1927 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1928 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1929 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1930 *p++ = hexdigit[ucs & 0x0000000F];
1931 continue;
1932 }
1933 /* Fall through: isolated surrogates are copied as-is */
1934 s--;
1935 size++;
1936 }
1937
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001939 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 *p++ = '\\';
1941 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 *p++ = hexdigit[(ch >> 12) & 0x000F];
1943 *p++ = hexdigit[(ch >> 8) & 0x000F];
1944 *p++ = hexdigit[(ch >> 4) & 0x000F];
1945 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001947
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001948 /* Map special whitespace to '\t', \n', '\r' */
1949 else if (ch == '\t') {
1950 *p++ = '\\';
1951 *p++ = 't';
1952 }
1953 else if (ch == '\n') {
1954 *p++ = '\\';
1955 *p++ = 'n';
1956 }
1957 else if (ch == '\r') {
1958 *p++ = '\\';
1959 *p++ = 'r';
1960 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001961
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001962 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001963 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001965 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001966 *p++ = hexdigit[(ch >> 4) & 0x000F];
1967 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001969
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 /* Copy everything else as-is */
1971 else
1972 *p++ = (char) ch;
1973 }
1974 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001975 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976
1977 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001978 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return repr;
1980}
1981
1982PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1983 int size)
1984{
1985 return unicodeescape_string(s, size, 0);
1986}
1987
1988PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1989{
1990 if (!PyUnicode_Check(unicode)) {
1991 PyErr_BadArgument();
1992 return NULL;
1993 }
1994 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1995 PyUnicode_GET_SIZE(unicode));
1996}
1997
1998/* --- Raw Unicode Escape Codec ------------------------------------------- */
1999
2000PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2001 int size,
2002 const char *errors)
2003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 const char *starts = s;
2005 int startinpos;
2006 int endinpos;
2007 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 const char *end;
2011 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002012 PyObject *errorHandler = NULL;
2013 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002014
2015 /* Escaped strings will always be longer than the resulting
2016 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 length after conversion to the true value. (But decoding error
2018 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 v = _PyUnicode_New(size);
2020 if (v == NULL)
2021 goto onError;
2022 if (size == 0)
2023 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002025 end = s + size;
2026 while (s < end) {
2027 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002028 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 int i;
2030
2031 /* Non-escape characters are interpreted as Unicode ordinals */
2032 if (*s != '\\') {
2033 *p++ = (unsigned char)*s++;
2034 continue;
2035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002036 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
2038 /* \u-escapes are only interpreted iff the number of leading
2039 backslashes if odd */
2040 bs = s;
2041 for (;s < end;) {
2042 if (*s != '\\')
2043 break;
2044 *p++ = (unsigned char)*s++;
2045 }
2046 if (((s - bs) & 1) == 0 ||
2047 s >= end ||
2048 *s != 'u') {
2049 continue;
2050 }
2051 p--;
2052 s++;
2053
2054 /* \uXXXX with 4 hex digits */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002055 outpos = p-PyUnicode_AS_UNICODE(v);
2056 for (x = 0, i = 0; i < 4; ++i, ++s) {
2057 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 endinpos = s-starts;
2060 if (unicode_decode_call_errorhandler(
2061 errors, &errorHandler,
2062 "rawunicodeescape", "truncated \\uXXXX",
2063 starts, size, &startinpos, &endinpos, &exc, &s,
2064 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 }
2068 x = (x<<4) & ~0xF;
2069 if (c >= '0' && c <= '9')
2070 x += c - '0';
2071 else if (c >= 'a' && c <= 'f')
2072 x += 10 + c - 'a';
2073 else
2074 x += 10 + c - 'A';
2075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 *p++ = x;
2077 nextByte:
2078 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 Py_XDECREF(errorHandler);
2083 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 return (PyObject *)v;
2085
2086 onError:
2087 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002088 Py_XDECREF(errorHandler);
2089 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 return NULL;
2091}
2092
2093PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2094 int size)
2095{
2096 PyObject *repr;
2097 char *p;
2098 char *q;
2099
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002100 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101
2102 repr = PyString_FromStringAndSize(NULL, 6 * size);
2103 if (repr == NULL)
2104 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002105 if (size == 0)
2106 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
2108 p = q = PyString_AS_STRING(repr);
2109 while (size-- > 0) {
2110 Py_UNICODE ch = *s++;
2111 /* Map 16-bit characters to '\uxxxx' */
2112 if (ch >= 256) {
2113 *p++ = '\\';
2114 *p++ = 'u';
2115 *p++ = hexdigit[(ch >> 12) & 0xf];
2116 *p++ = hexdigit[(ch >> 8) & 0xf];
2117 *p++ = hexdigit[(ch >> 4) & 0xf];
2118 *p++ = hexdigit[ch & 15];
2119 }
2120 /* Copy everything else as-is */
2121 else
2122 *p++ = (char) ch;
2123 }
2124 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002125 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 return repr;
2127}
2128
2129PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2130{
2131 if (!PyUnicode_Check(unicode)) {
2132 PyErr_BadArgument();
2133 return NULL;
2134 }
2135 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2136 PyUnicode_GET_SIZE(unicode));
2137}
2138
2139/* --- Latin-1 Codec ------------------------------------------------------ */
2140
2141PyObject *PyUnicode_DecodeLatin1(const char *s,
2142 int size,
2143 const char *errors)
2144{
2145 PyUnicodeObject *v;
2146 Py_UNICODE *p;
2147
2148 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002149 if (size == 1 && *(unsigned char*)s < 256) {
2150 Py_UNICODE r = *(unsigned char*)s;
2151 return PyUnicode_FromUnicode(&r, 1);
2152 }
2153
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 v = _PyUnicode_New(size);
2155 if (v == NULL)
2156 goto onError;
2157 if (size == 0)
2158 return (PyObject *)v;
2159 p = PyUnicode_AS_UNICODE(v);
2160 while (size-- > 0)
2161 *p++ = (unsigned char)*s++;
2162 return (PyObject *)v;
2163
2164 onError:
2165 Py_XDECREF(v);
2166 return NULL;
2167}
2168
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169/* create or adjust a UnicodeEncodeError */
2170static void make_encode_exception(PyObject **exceptionObject,
2171 const char *encoding,
2172 const Py_UNICODE *unicode, int size,
2173 int startpos, int endpos,
2174 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 if (*exceptionObject == NULL) {
2177 *exceptionObject = PyUnicodeEncodeError_Create(
2178 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 }
2180 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2182 goto onError;
2183 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2184 goto onError;
2185 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2186 goto onError;
2187 return;
2188 onError:
2189 Py_DECREF(*exceptionObject);
2190 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 }
2192}
2193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002194/* raises a UnicodeEncodeError */
2195static void raise_encode_exception(PyObject **exceptionObject,
2196 const char *encoding,
2197 const Py_UNICODE *unicode, int size,
2198 int startpos, int endpos,
2199 const char *reason)
2200{
2201 make_encode_exception(exceptionObject,
2202 encoding, unicode, size, startpos, endpos, reason);
2203 if (*exceptionObject != NULL)
2204 PyCodec_StrictErrors(*exceptionObject);
2205}
2206
2207/* error handling callback helper:
2208 build arguments, call the callback and check the arguments,
2209 put the result into newpos and return the replacement string, which
2210 has to be freed by the caller */
2211static PyObject *unicode_encode_call_errorhandler(const char *errors,
2212 PyObject **errorHandler,
2213 const char *encoding, const char *reason,
2214 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2215 int startpos, int endpos,
2216 int *newpos)
2217{
2218 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2219
2220 PyObject *restuple;
2221 PyObject *resunicode;
2222
2223 if (*errorHandler == NULL) {
2224 *errorHandler = PyCodec_LookupError(errors);
2225 if (*errorHandler == NULL)
2226 return NULL;
2227 }
2228
2229 make_encode_exception(exceptionObject,
2230 encoding, unicode, size, startpos, endpos, reason);
2231 if (*exceptionObject == NULL)
2232 return NULL;
2233
2234 restuple = PyObject_CallFunctionObjArgs(
2235 *errorHandler, *exceptionObject, NULL);
2236 if (restuple == NULL)
2237 return NULL;
2238 if (!PyTuple_Check(restuple)) {
2239 PyErr_Format(PyExc_TypeError, &argparse[4]);
2240 Py_DECREF(restuple);
2241 return NULL;
2242 }
2243 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2244 &resunicode, newpos)) {
2245 Py_DECREF(restuple);
2246 return NULL;
2247 }
2248 if (*newpos<0)
2249 *newpos = 0;
2250 else if (*newpos>size)
2251 *newpos = size;
2252 Py_INCREF(resunicode);
2253 Py_DECREF(restuple);
2254 return resunicode;
2255}
2256
2257static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2258 int size,
2259 const char *errors,
2260 int limit)
2261{
2262 /* output object */
2263 PyObject *res;
2264 /* pointers to the beginning and end+1 of input */
2265 const Py_UNICODE *startp = p;
2266 const Py_UNICODE *endp = p + size;
2267 /* pointer to the beginning of the unencodable characters */
2268 /* const Py_UNICODE *badp = NULL; */
2269 /* pointer into the output */
2270 char *str;
2271 /* current output position */
2272 int respos = 0;
2273 int ressize;
2274 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2275 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2276 PyObject *errorHandler = NULL;
2277 PyObject *exc = NULL;
2278 /* the following variable is used for caching string comparisons
2279 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2280 int known_errorHandler = -1;
2281
2282 /* allocate enough for a simple encoding without
2283 replacements, if we need more, we'll resize */
2284 res = PyString_FromStringAndSize(NULL, size);
2285 if (res == NULL)
2286 goto onError;
2287 if (size == 0)
2288 return res;
2289 str = PyString_AS_STRING(res);
2290 ressize = size;
2291
2292 while (p<endp) {
2293 Py_UNICODE c = *p;
2294
2295 /* can we encode this? */
2296 if (c<limit) {
2297 /* no overflow check, because we know that the space is enough */
2298 *str++ = (char)c;
2299 ++p;
2300 }
2301 else {
2302 int unicodepos = p-startp;
2303 int requiredsize;
2304 PyObject *repunicode;
2305 int repsize;
2306 int newpos;
2307 int respos;
2308 Py_UNICODE *uni2;
2309 /* startpos for collecting unencodable chars */
2310 const Py_UNICODE *collstart = p;
2311 const Py_UNICODE *collend = p;
2312 /* find all unecodable characters */
2313 while ((collend < endp) && ((*collend)>=limit))
2314 ++collend;
2315 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2316 if (known_errorHandler==-1) {
2317 if ((errors==NULL) || (!strcmp(errors, "strict")))
2318 known_errorHandler = 1;
2319 else if (!strcmp(errors, "replace"))
2320 known_errorHandler = 2;
2321 else if (!strcmp(errors, "ignore"))
2322 known_errorHandler = 3;
2323 else if (!strcmp(errors, "xmlcharrefreplace"))
2324 known_errorHandler = 4;
2325 else
2326 known_errorHandler = 0;
2327 }
2328 switch (known_errorHandler) {
2329 case 1: /* strict */
2330 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2331 goto onError;
2332 case 2: /* replace */
2333 while (collstart++<collend)
2334 *str++ = '?'; /* fall through */
2335 case 3: /* ignore */
2336 p = collend;
2337 break;
2338 case 4: /* xmlcharrefreplace */
2339 respos = str-PyString_AS_STRING(res);
2340 /* determine replacement size (temporarily (mis)uses p) */
2341 for (p = collstart, repsize = 0; p < collend; ++p) {
2342 if (*p<10)
2343 repsize += 2+1+1;
2344 else if (*p<100)
2345 repsize += 2+2+1;
2346 else if (*p<1000)
2347 repsize += 2+3+1;
2348 else if (*p<10000)
2349 repsize += 2+4+1;
2350 else if (*p<100000)
2351 repsize += 2+5+1;
2352 else if (*p<1000000)
2353 repsize += 2+6+1;
2354 else
2355 repsize += 2+7+1;
2356 }
2357 requiredsize = respos+repsize+(endp-collend);
2358 if (requiredsize > ressize) {
2359 if (requiredsize<2*ressize)
2360 requiredsize = 2*ressize;
2361 if (_PyString_Resize(&res, requiredsize))
2362 goto onError;
2363 str = PyString_AS_STRING(res) + respos;
2364 ressize = requiredsize;
2365 }
2366 /* generate replacement (temporarily (mis)uses p) */
2367 for (p = collstart; p < collend; ++p) {
2368 str += sprintf(str, "&#%d;", (int)*p);
2369 }
2370 p = collend;
2371 break;
2372 default:
2373 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2374 encoding, reason, startp, size, &exc,
2375 collstart-startp, collend-startp, &newpos);
2376 if (repunicode == NULL)
2377 goto onError;
2378 /* need more space? (at least enough for what we
2379 have+the replacement+the rest of the string, so
2380 we won't have to check space for encodable characters) */
2381 respos = str-PyString_AS_STRING(res);
2382 repsize = PyUnicode_GET_SIZE(repunicode);
2383 requiredsize = respos+repsize+(endp-collend);
2384 if (requiredsize > ressize) {
2385 if (requiredsize<2*ressize)
2386 requiredsize = 2*ressize;
2387 if (_PyString_Resize(&res, requiredsize)) {
2388 Py_DECREF(repunicode);
2389 goto onError;
2390 }
2391 str = PyString_AS_STRING(res) + respos;
2392 ressize = requiredsize;
2393 }
2394 /* check if there is anything unencodable in the replacement
2395 and copy it to the output */
2396 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2397 c = *uni2;
2398 if (c >= limit) {
2399 raise_encode_exception(&exc, encoding, startp, size,
2400 unicodepos, unicodepos+1, reason);
2401 Py_DECREF(repunicode);
2402 goto onError;
2403 }
2404 *str = (char)c;
2405 }
2406 p = startp + newpos;
2407 Py_DECREF(repunicode);
2408 }
2409 }
2410 }
2411 /* Resize if we allocated to much */
2412 respos = str-PyString_AS_STRING(res);
2413 if (respos<ressize)
2414 /* If this falls res will be NULL */
2415 _PyString_Resize(&res, respos);
2416 Py_XDECREF(errorHandler);
2417 Py_XDECREF(exc);
2418 return res;
2419
2420 onError:
2421 Py_XDECREF(res);
2422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
2424 return NULL;
2425}
2426
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2428 int size,
2429 const char *errors)
2430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002431 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432}
2433
2434PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2435{
2436 if (!PyUnicode_Check(unicode)) {
2437 PyErr_BadArgument();
2438 return NULL;
2439 }
2440 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2441 PyUnicode_GET_SIZE(unicode),
2442 NULL);
2443}
2444
2445/* --- 7-bit ASCII Codec -------------------------------------------------- */
2446
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447PyObject *PyUnicode_DecodeASCII(const char *s,
2448 int size,
2449 const char *errors)
2450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002451 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 PyUnicodeObject *v;
2453 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 int startinpos;
2455 int endinpos;
2456 int outpos;
2457 const char *e;
2458 PyObject *errorHandler = NULL;
2459 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460
2461 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002462 if (size == 1 && *(unsigned char*)s < 128) {
2463 Py_UNICODE r = *(unsigned char*)s;
2464 return PyUnicode_FromUnicode(&r, 1);
2465 }
2466
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 v = _PyUnicode_New(size);
2468 if (v == NULL)
2469 goto onError;
2470 if (size == 0)
2471 return (PyObject *)v;
2472 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 e = s + size;
2474 while (s < e) {
2475 register unsigned char c = (unsigned char)*s;
2476 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 ++s;
2479 }
2480 else {
2481 startinpos = s-starts;
2482 endinpos = startinpos + 1;
2483 outpos = p-PyUnicode_AS_UNICODE(v);
2484 if (unicode_decode_call_errorhandler(
2485 errors, &errorHandler,
2486 "ascii", "ordinal not in range(128)",
2487 starts, size, &startinpos, &endinpos, &exc, &s,
2488 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002492 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002493 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002494 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 return (PyObject *)v;
2498
2499 onError:
2500 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 return NULL;
2504}
2505
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2507 int size,
2508 const char *errors)
2509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002510 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511}
2512
2513PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2514{
2515 if (!PyUnicode_Check(unicode)) {
2516 PyErr_BadArgument();
2517 return NULL;
2518 }
2519 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2520 PyUnicode_GET_SIZE(unicode),
2521 NULL);
2522}
2523
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002524#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002525
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002526/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002527
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002528PyObject *PyUnicode_DecodeMBCS(const char *s,
2529 int size,
2530 const char *errors)
2531{
2532 PyUnicodeObject *v;
2533 Py_UNICODE *p;
2534
2535 /* First get the size of the result */
2536 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002537 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002538 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2539
2540 v = _PyUnicode_New(usize);
2541 if (v == NULL)
2542 return NULL;
2543 if (usize == 0)
2544 return (PyObject *)v;
2545 p = PyUnicode_AS_UNICODE(v);
2546 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2547 Py_DECREF(v);
2548 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2549 }
2550
2551 return (PyObject *)v;
2552}
2553
2554PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2555 int size,
2556 const char *errors)
2557{
2558 PyObject *repr;
2559 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002560 DWORD mbcssize;
2561
2562 /* If there are no characters, bail now! */
2563 if (size==0)
2564 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002565
2566 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002567 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002568 if (mbcssize==0)
2569 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2570
2571 repr = PyString_FromStringAndSize(NULL, mbcssize);
2572 if (repr == NULL)
2573 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002574 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002575 return repr;
2576
2577 /* Do the conversion */
2578 s = PyString_AS_STRING(repr);
2579 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2580 Py_DECREF(repr);
2581 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2582 }
2583 return repr;
2584}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002586#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002587
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588/* --- Character Mapping Codec -------------------------------------------- */
2589
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590PyObject *PyUnicode_DecodeCharmap(const char *s,
2591 int size,
2592 PyObject *mapping,
2593 const char *errors)
2594{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 const char *starts = s;
2596 int startinpos;
2597 int endinpos;
2598 int outpos;
2599 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 PyUnicodeObject *v;
2601 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002602 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 PyObject *errorHandler = NULL;
2604 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 /* Default to Latin-1 */
2607 if (mapping == NULL)
2608 return PyUnicode_DecodeLatin1(s, size, errors);
2609
2610 v = _PyUnicode_New(size);
2611 if (v == NULL)
2612 goto onError;
2613 if (size == 0)
2614 return (PyObject *)v;
2615 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 e = s + size;
2617 while (s < e) {
2618 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 PyObject *w, *x;
2620
2621 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2622 w = PyInt_FromLong((long)ch);
2623 if (w == NULL)
2624 goto onError;
2625 x = PyObject_GetItem(mapping, w);
2626 Py_DECREF(w);
2627 if (x == NULL) {
2628 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002629 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002631 x = Py_None;
2632 Py_INCREF(x);
2633 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002634 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 }
2636
2637 /* Apply mapping */
2638 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002639 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 if (value < 0 || value > 65535) {
2641 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002642 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 Py_DECREF(x);
2644 goto onError;
2645 }
2646 *p++ = (Py_UNICODE)value;
2647 }
2648 else if (x == Py_None) {
2649 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002650 outpos = p-PyUnicode_AS_UNICODE(v);
2651 startinpos = s-starts;
2652 endinpos = startinpos+1;
2653 if (unicode_decode_call_errorhandler(
2654 errors, &errorHandler,
2655 "charmap", "character maps to <undefined>",
2656 starts, size, &startinpos, &endinpos, &exc, &s,
2657 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 Py_DECREF(x);
2659 goto onError;
2660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 }
2663 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002664 int targetsize = PyUnicode_GET_SIZE(x);
2665
2666 if (targetsize == 1)
2667 /* 1-1 mapping */
2668 *p++ = *PyUnicode_AS_UNICODE(x);
2669
2670 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002672 if (targetsize > extrachars) {
2673 /* resize first */
2674 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2675 int needed = (targetsize - extrachars) + \
2676 (targetsize << 2);
2677 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002678 if (_PyUnicode_Resize(&v,
2679 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002680 Py_DECREF(x);
2681 goto onError;
2682 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002683 p = PyUnicode_AS_UNICODE(v) + oldpos;
2684 }
2685 Py_UNICODE_COPY(p,
2686 PyUnicode_AS_UNICODE(x),
2687 targetsize);
2688 p += targetsize;
2689 extrachars -= targetsize;
2690 }
2691 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 }
2693 else {
2694 /* wrong return value */
2695 PyErr_SetString(PyExc_TypeError,
2696 "character mapping must return integer, None or unicode");
2697 Py_DECREF(x);
2698 goto onError;
2699 }
2700 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 }
2703 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002704 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 Py_XDECREF(errorHandler);
2707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 return (PyObject *)v;
2709
2710 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 Py_XDECREF(v);
2714 return NULL;
2715}
2716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717/* Lookup the character ch in the mapping. If the character
2718 can't be found, Py_None is returned (or NULL, if another
2719 error occured). */
2720static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 PyObject *w = PyInt_FromLong((long)c);
2723 PyObject *x;
2724
2725 if (w == NULL)
2726 return NULL;
2727 x = PyObject_GetItem(mapping, w);
2728 Py_DECREF(w);
2729 if (x == NULL) {
2730 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2731 /* No mapping found means: mapping is undefined. */
2732 PyErr_Clear();
2733 x = Py_None;
2734 Py_INCREF(x);
2735 return x;
2736 } else
2737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002739 else if (x == Py_None)
2740 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 else if (PyInt_Check(x)) {
2742 long value = PyInt_AS_LONG(x);
2743 if (value < 0 || value > 255) {
2744 PyErr_SetString(PyExc_TypeError,
2745 "character mapping must be in range(256)");
2746 Py_DECREF(x);
2747 return NULL;
2748 }
2749 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 else if (PyString_Check(x))
2752 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 /* wrong return value */
2755 PyErr_SetString(PyExc_TypeError,
2756 "character mapping must return integer, None or str");
2757 Py_DECREF(x);
2758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 }
2760}
2761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762/* lookup the character, put the result in the output string and adjust
2763 various state variables. Reallocate the output string if not enough
2764 space is available. Return a new reference to the object that
2765 was put in the output buffer, or Py_None, if the mapping was undefined
2766 (in which case no character was written) or NULL, if a
2767 reallocation error ocurred. The called must decref the result */
2768static
2769PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2770 PyObject **outobj, int *outpos)
2771{
2772 PyObject *rep = charmapencode_lookup(c, mapping);
2773
2774 if (rep==NULL)
2775 return NULL;
2776 else if (rep==Py_None)
2777 return rep;
2778 else {
2779 char *outstart = PyString_AS_STRING(*outobj);
2780 int outsize = PyString_GET_SIZE(*outobj);
2781 if (PyInt_Check(rep)) {
2782 int requiredsize = *outpos+1;
2783 if (outsize<requiredsize) {
2784 /* exponentially overallocate to minimize reallocations */
2785 if (requiredsize < 2*outsize)
2786 requiredsize = 2*outsize;
2787 if (_PyString_Resize(outobj, requiredsize)) {
2788 Py_DECREF(rep);
2789 return NULL;
2790 }
2791 outstart = PyString_AS_STRING(*outobj);
2792 }
2793 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2794 }
2795 else {
2796 const char *repchars = PyString_AS_STRING(rep);
2797 int repsize = PyString_GET_SIZE(rep);
2798 int requiredsize = *outpos+repsize;
2799 if (outsize<requiredsize) {
2800 /* exponentially overallocate to minimize reallocations */
2801 if (requiredsize < 2*outsize)
2802 requiredsize = 2*outsize;
2803 if (_PyString_Resize(outobj, requiredsize)) {
2804 Py_DECREF(rep);
2805 return NULL;
2806 }
2807 outstart = PyString_AS_STRING(*outobj);
2808 }
2809 memcpy(outstart + *outpos, repchars, repsize);
2810 *outpos += repsize;
2811 }
2812 }
2813 return rep;
2814}
2815
2816/* handle an error in PyUnicode_EncodeCharmap
2817 Return 0 on success, -1 on error */
2818static
2819int charmap_encoding_error(
2820 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2821 PyObject **exceptionObject,
2822 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2823 PyObject **res, int *respos)
2824{
2825 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2826 int repsize;
2827 int newpos;
2828 Py_UNICODE *uni2;
2829 /* startpos for collecting unencodable chars */
2830 int collstartpos = *inpos;
2831 int collendpos = *inpos+1;
2832 int collpos;
2833 char *encoding = "charmap";
2834 char *reason = "character maps to <undefined>";
2835
2836 PyObject *x;
2837 /* find all unencodable characters */
2838 while (collendpos < size) {
2839 x = charmapencode_lookup(p[collendpos], mapping);
2840 if (x==NULL)
2841 return -1;
2842 else if (x!=Py_None) {
2843 Py_DECREF(x);
2844 break;
2845 }
2846 Py_DECREF(x);
2847 ++collendpos;
2848 }
2849 /* cache callback name lookup
2850 * (if not done yet, i.e. it's the first error) */
2851 if (*known_errorHandler==-1) {
2852 if ((errors==NULL) || (!strcmp(errors, "strict")))
2853 *known_errorHandler = 1;
2854 else if (!strcmp(errors, "replace"))
2855 *known_errorHandler = 2;
2856 else if (!strcmp(errors, "ignore"))
2857 *known_errorHandler = 3;
2858 else if (!strcmp(errors, "xmlcharrefreplace"))
2859 *known_errorHandler = 4;
2860 else
2861 *known_errorHandler = 0;
2862 }
2863 switch (*known_errorHandler) {
2864 case 1: /* strict */
2865 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2866 return -1;
2867 case 2: /* replace */
2868 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2869 x = charmapencode_output('?', mapping, res, respos);
2870 if (x==NULL) {
2871 return -1;
2872 }
2873 else if (x==Py_None) {
2874 Py_DECREF(x);
2875 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2876 return -1;
2877 }
2878 Py_DECREF(x);
2879 }
2880 /* fall through */
2881 case 3: /* ignore */
2882 *inpos = collendpos;
2883 break;
2884 case 4: /* xmlcharrefreplace */
2885 /* generate replacement (temporarily (mis)uses p) */
2886 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2887 char buffer[2+29+1+1];
2888 char *cp;
2889 sprintf(buffer, "&#%d;", (int)p[collpos]);
2890 for (cp = buffer; *cp; ++cp) {
2891 x = charmapencode_output(*cp, mapping, res, respos);
2892 if (x==NULL)
2893 return -1;
2894 else if (x==Py_None) {
2895 Py_DECREF(x);
2896 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2897 return -1;
2898 }
2899 Py_DECREF(x);
2900 }
2901 }
2902 *inpos = collendpos;
2903 break;
2904 default:
2905 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2906 encoding, reason, p, size, exceptionObject,
2907 collstartpos, collendpos, &newpos);
2908 if (repunicode == NULL)
2909 return -1;
2910 /* generate replacement */
2911 repsize = PyUnicode_GET_SIZE(repunicode);
2912 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2913 x = charmapencode_output(*uni2, mapping, res, respos);
2914 if (x==NULL) {
2915 Py_DECREF(repunicode);
2916 return -1;
2917 }
2918 else if (x==Py_None) {
2919 Py_DECREF(repunicode);
2920 Py_DECREF(x);
2921 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2922 return -1;
2923 }
2924 Py_DECREF(x);
2925 }
2926 *inpos = newpos;
2927 Py_DECREF(repunicode);
2928 }
2929 return 0;
2930}
2931
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2933 int size,
2934 PyObject *mapping,
2935 const char *errors)
2936{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 /* output object */
2938 PyObject *res = NULL;
2939 /* current input position */
2940 int inpos = 0;
2941 /* current output position */
2942 int respos = 0;
2943 PyObject *errorHandler = NULL;
2944 PyObject *exc = NULL;
2945 /* the following variable is used for caching string comparisons
2946 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2947 * 3=ignore, 4=xmlcharrefreplace */
2948 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949
2950 /* Default to Latin-1 */
2951 if (mapping == NULL)
2952 return PyUnicode_EncodeLatin1(p, size, errors);
2953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954 /* allocate enough for a simple encoding without
2955 replacements, if we need more, we'll resize */
2956 res = PyString_FromStringAndSize(NULL, size);
2957 if (res == NULL)
2958 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002959 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 while (inpos<size) {
2963 /* try to encode it */
2964 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2965 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967 if (x==Py_None) { /* unencodable character */
2968 if (charmap_encoding_error(p, size, &inpos, mapping,
2969 &exc,
2970 &known_errorHandler, errorHandler, errors,
2971 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002972 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002974 else
2975 /* done with this character => adjust input position */
2976 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 Py_DECREF(x);
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 /* Resize if we allocated to much */
2981 if (respos<PyString_GET_SIZE(res)) {
2982 if (_PyString_Resize(&res, respos))
2983 goto onError;
2984 }
2985 Py_XDECREF(exc);
2986 Py_XDECREF(errorHandler);
2987 return res;
2988
2989 onError:
2990 Py_XDECREF(res);
2991 Py_XDECREF(exc);
2992 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 return NULL;
2994}
2995
2996PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2997 PyObject *mapping)
2998{
2999 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3000 PyErr_BadArgument();
3001 return NULL;
3002 }
3003 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3004 PyUnicode_GET_SIZE(unicode),
3005 mapping,
3006 NULL);
3007}
3008
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009/* create or adjust a UnicodeTranslateError */
3010static void make_translate_exception(PyObject **exceptionObject,
3011 const Py_UNICODE *unicode, int size,
3012 int startpos, int endpos,
3013 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015 if (*exceptionObject == NULL) {
3016 *exceptionObject = PyUnicodeTranslateError_Create(
3017 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 }
3019 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3021 goto onError;
3022 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3023 goto onError;
3024 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3025 goto onError;
3026 return;
3027 onError:
3028 Py_DECREF(*exceptionObject);
3029 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
3031}
3032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033/* raises a UnicodeTranslateError */
3034static void raise_translate_exception(PyObject **exceptionObject,
3035 const Py_UNICODE *unicode, int size,
3036 int startpos, int endpos,
3037 const char *reason)
3038{
3039 make_translate_exception(exceptionObject,
3040 unicode, size, startpos, endpos, reason);
3041 if (*exceptionObject != NULL)
3042 PyCodec_StrictErrors(*exceptionObject);
3043}
3044
3045/* error handling callback helper:
3046 build arguments, call the callback and check the arguments,
3047 put the result into newpos and return the replacement string, which
3048 has to be freed by the caller */
3049static PyObject *unicode_translate_call_errorhandler(const char *errors,
3050 PyObject **errorHandler,
3051 const char *reason,
3052 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3053 int startpos, int endpos,
3054 int *newpos)
3055{
3056 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3057
3058 PyObject *restuple;
3059 PyObject *resunicode;
3060
3061 if (*errorHandler == NULL) {
3062 *errorHandler = PyCodec_LookupError(errors);
3063 if (*errorHandler == NULL)
3064 return NULL;
3065 }
3066
3067 make_translate_exception(exceptionObject,
3068 unicode, size, startpos, endpos, reason);
3069 if (*exceptionObject == NULL)
3070 return NULL;
3071
3072 restuple = PyObject_CallFunctionObjArgs(
3073 *errorHandler, *exceptionObject, NULL);
3074 if (restuple == NULL)
3075 return NULL;
3076 if (!PyTuple_Check(restuple)) {
3077 PyErr_Format(PyExc_TypeError, &argparse[4]);
3078 Py_DECREF(restuple);
3079 return NULL;
3080 }
3081 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3082 &resunicode, newpos)) {
3083 Py_DECREF(restuple);
3084 return NULL;
3085 }
3086 if (*newpos<0)
3087 *newpos = 0;
3088 else if (*newpos>size)
3089 *newpos = size;
3090 Py_INCREF(resunicode);
3091 Py_DECREF(restuple);
3092 return resunicode;
3093}
3094
3095/* Lookup the character ch in the mapping and put the result in result,
3096 which must be decrefed by the caller.
3097 Return 0 on success, -1 on error */
3098static
3099int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3100{
3101 PyObject *w = PyInt_FromLong((long)c);
3102 PyObject *x;
3103
3104 if (w == NULL)
3105 return -1;
3106 x = PyObject_GetItem(mapping, w);
3107 Py_DECREF(w);
3108 if (x == NULL) {
3109 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3110 /* No mapping found means: use 1:1 mapping. */
3111 PyErr_Clear();
3112 *result = NULL;
3113 return 0;
3114 } else
3115 return -1;
3116 }
3117 else if (x == Py_None) {
3118 *result = x;
3119 return 0;
3120 }
3121 else if (PyInt_Check(x)) {
3122 long value = PyInt_AS_LONG(x);
3123 long max = PyUnicode_GetMax();
3124 if (value < 0 || value > max) {
3125 PyErr_Format(PyExc_TypeError,
3126 "character mapping must be in range(0x%lx)", max+1);
3127 Py_DECREF(x);
3128 return -1;
3129 }
3130 *result = x;
3131 return 0;
3132 }
3133 else if (PyUnicode_Check(x)) {
3134 *result = x;
3135 return 0;
3136 }
3137 else {
3138 /* wrong return value */
3139 PyErr_SetString(PyExc_TypeError,
3140 "character mapping must return integer, None or unicode");
3141 return -1;
3142 }
3143}
3144/* ensure that *outobj is at least requiredsize characters long,
3145if not reallocate and adjust various state variables.
3146Return 0 on success, -1 on error */
3147static
3148int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3149 int requiredsize)
3150{
3151 if (requiredsize > *outsize) {
3152 /* remember old output position */
3153 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3154 /* exponentially overallocate to minimize reallocations */
3155 if (requiredsize < 2 * *outsize)
3156 requiredsize = 2 * *outsize;
3157 if (_PyUnicode_Resize(outobj, requiredsize))
3158 return -1;
3159 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3160 *outsize = requiredsize;
3161 }
3162 return 0;
3163}
3164/* lookup the character, put the result in the output string and adjust
3165 various state variables. Return a new reference to the object that
3166 was put in the output buffer in *result, or Py_None, if the mapping was
3167 undefined (in which case no character was written).
3168 The called must decref result.
3169 Return 0 on success, -1 on error. */
3170static
3171int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3172 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3173{
3174 if (charmaptranslate_lookup(c, mapping, res))
3175 return -1;
3176 if (*res==NULL) {
3177 /* not found => default to 1:1 mapping */
3178 *(*outp)++ = (Py_UNICODE)c;
3179 }
3180 else if (*res==Py_None)
3181 ;
3182 else if (PyInt_Check(*res)) {
3183 /* no overflow check, because we know that the space is enough */
3184 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3185 }
3186 else if (PyUnicode_Check(*res)) {
3187 int repsize = PyUnicode_GET_SIZE(*res);
3188 if (repsize==1) {
3189 /* no overflow check, because we know that the space is enough */
3190 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3191 }
3192 else if (repsize!=0) {
3193 /* more than one character */
3194 int requiredsize = *outsize + repsize - 1;
3195 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3196 return -1;
3197 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3198 *outp += repsize;
3199 }
3200 }
3201 else
3202 return -1;
3203 return 0;
3204}
3205
3206PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 int size,
3208 PyObject *mapping,
3209 const char *errors)
3210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 /* output object */
3212 PyObject *res = NULL;
3213 /* pointers to the beginning and end+1 of input */
3214 const Py_UNICODE *startp = p;
3215 const Py_UNICODE *endp = p + size;
3216 /* pointer into the output */
3217 Py_UNICODE *str;
3218 /* current output position */
3219 int respos = 0;
3220 int ressize;
3221 char *reason = "character maps to <undefined>";
3222 PyObject *errorHandler = NULL;
3223 PyObject *exc = NULL;
3224 /* the following variable is used for caching string comparisons
3225 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3226 * 3=ignore, 4=xmlcharrefreplace */
3227 int known_errorHandler = -1;
3228
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 if (mapping == NULL) {
3230 PyErr_BadArgument();
3231 return NULL;
3232 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233
3234 /* allocate enough for a simple 1:1 translation without
3235 replacements, if we need more, we'll resize */
3236 res = PyUnicode_FromUnicode(NULL, size);
3237 if (res == NULL)
3238 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 return res;
3241 str = PyUnicode_AS_UNICODE(res);
3242 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003244 while (p<endp) {
3245 /* try to encode it */
3246 PyObject *x = NULL;
3247 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3248 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 goto onError;
3250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251 if (x!=Py_None) /* it worked => adjust input pointer */
3252 ++p;
3253 else { /* untranslatable character */
3254 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3255 int repsize;
3256 int newpos;
3257 Py_UNICODE *uni2;
3258 /* startpos for collecting untranslatable chars */
3259 const Py_UNICODE *collstart = p;
3260 const Py_UNICODE *collend = p+1;
3261 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(x);
3264 /* find all untranslatable characters */
3265 while (collend < endp) {
3266 if (charmaptranslate_lookup(*collend, mapping, &x))
3267 goto onError;
3268 Py_XDECREF(x);
3269 if (x!=Py_None)
3270 break;
3271 ++collend;
3272 }
3273 /* cache callback name lookup
3274 * (if not done yet, i.e. it's the first error) */
3275 if (known_errorHandler==-1) {
3276 if ((errors==NULL) || (!strcmp(errors, "strict")))
3277 known_errorHandler = 1;
3278 else if (!strcmp(errors, "replace"))
3279 known_errorHandler = 2;
3280 else if (!strcmp(errors, "ignore"))
3281 known_errorHandler = 3;
3282 else if (!strcmp(errors, "xmlcharrefreplace"))
3283 known_errorHandler = 4;
3284 else
3285 known_errorHandler = 0;
3286 }
3287 switch (known_errorHandler) {
3288 case 1: /* strict */
3289 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3290 goto onError;
3291 case 2: /* replace */
3292 /* No need to check for space, this is a 1:1 replacement */
3293 for (coll = collstart; coll<collend; ++coll)
3294 *str++ = '?';
3295 /* fall through */
3296 case 3: /* ignore */
3297 p = collend;
3298 break;
3299 case 4: /* xmlcharrefreplace */
3300 /* generate replacement (temporarily (mis)uses p) */
3301 for (p = collstart; p < collend; ++p) {
3302 char buffer[2+29+1+1];
3303 char *cp;
3304 sprintf(buffer, "&#%d;", (int)*p);
3305 if (charmaptranslate_makespace(&res, &str, &ressize,
3306 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3307 goto onError;
3308 for (cp = buffer; *cp; ++cp)
3309 *str++ = *cp;
3310 }
3311 p = collend;
3312 break;
3313 default:
3314 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3315 reason, startp, size, &exc,
3316 collstart-startp, collend-startp, &newpos);
3317 if (repunicode == NULL)
3318 goto onError;
3319 /* generate replacement */
3320 repsize = PyUnicode_GET_SIZE(repunicode);
3321 if (charmaptranslate_makespace(&res, &str, &ressize,
3322 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3323 Py_DECREF(repunicode);
3324 goto onError;
3325 }
3326 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3327 *str++ = *uni2;
3328 p = startp + newpos;
3329 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 }
3331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 /* Resize if we allocated to much */
3334 respos = str-PyUnicode_AS_UNICODE(res);
3335 if (respos<ressize) {
3336 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003337 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 }
3339 Py_XDECREF(exc);
3340 Py_XDECREF(errorHandler);
3341 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 onError:
3344 Py_XDECREF(res);
3345 Py_XDECREF(exc);
3346 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 return NULL;
3348}
3349
3350PyObject *PyUnicode_Translate(PyObject *str,
3351 PyObject *mapping,
3352 const char *errors)
3353{
3354 PyObject *result;
3355
3356 str = PyUnicode_FromObject(str);
3357 if (str == NULL)
3358 goto onError;
3359 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3360 PyUnicode_GET_SIZE(str),
3361 mapping,
3362 errors);
3363 Py_DECREF(str);
3364 return result;
3365
3366 onError:
3367 Py_XDECREF(str);
3368 return NULL;
3369}
3370
Guido van Rossum9e896b32000-04-05 20:11:21 +00003371/* --- Decimal Encoder ---------------------------------------------------- */
3372
3373int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3374 int length,
3375 char *output,
3376 const char *errors)
3377{
3378 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 PyObject *errorHandler = NULL;
3380 PyObject *exc = NULL;
3381 const char *encoding = "decimal";
3382 const char *reason = "invalid decimal Unicode string";
3383 /* the following variable is used for caching string comparisons
3384 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3385 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003386
3387 if (output == NULL) {
3388 PyErr_BadArgument();
3389 return -1;
3390 }
3391
3392 p = s;
3393 end = s + length;
3394 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003396 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 PyObject *repunicode;
3398 int repsize;
3399 int newpos;
3400 Py_UNICODE *uni2;
3401 Py_UNICODE *collstart;
3402 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003403
3404 if (Py_UNICODE_ISSPACE(ch)) {
3405 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003407 continue;
3408 }
3409 decimal = Py_UNICODE_TODECIMAL(ch);
3410 if (decimal >= 0) {
3411 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003413 continue;
3414 }
Guido van Rossumba477042000-04-06 18:18:10 +00003415 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003416 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003418 continue;
3419 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 /* All other characters are considered unencodable */
3421 collstart = p;
3422 collend = p+1;
3423 while (collend < end) {
3424 if ((0 < *collend && *collend < 256) ||
3425 !Py_UNICODE_ISSPACE(*collend) ||
3426 Py_UNICODE_TODECIMAL(*collend))
3427 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 /* cache callback name lookup
3430 * (if not done yet, i.e. it's the first error) */
3431 if (known_errorHandler==-1) {
3432 if ((errors==NULL) || (!strcmp(errors, "strict")))
3433 known_errorHandler = 1;
3434 else if (!strcmp(errors, "replace"))
3435 known_errorHandler = 2;
3436 else if (!strcmp(errors, "ignore"))
3437 known_errorHandler = 3;
3438 else if (!strcmp(errors, "xmlcharrefreplace"))
3439 known_errorHandler = 4;
3440 else
3441 known_errorHandler = 0;
3442 }
3443 switch (known_errorHandler) {
3444 case 1: /* strict */
3445 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3446 goto onError;
3447 case 2: /* replace */
3448 for (p = collstart; p < collend; ++p)
3449 *output++ = '?';
3450 /* fall through */
3451 case 3: /* ignore */
3452 p = collend;
3453 break;
3454 case 4: /* xmlcharrefreplace */
3455 /* generate replacement (temporarily (mis)uses p) */
3456 for (p = collstart; p < collend; ++p)
3457 output += sprintf(output, "&#%d;", (int)*p);
3458 p = collend;
3459 break;
3460 default:
3461 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3462 encoding, reason, s, length, &exc,
3463 collstart-s, collend-s, &newpos);
3464 if (repunicode == NULL)
3465 goto onError;
3466 /* generate replacement */
3467 repsize = PyUnicode_GET_SIZE(repunicode);
3468 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3469 Py_UNICODE ch = *uni2;
3470 if (Py_UNICODE_ISSPACE(ch))
3471 *output++ = ' ';
3472 else {
3473 decimal = Py_UNICODE_TODECIMAL(ch);
3474 if (decimal >= 0)
3475 *output++ = '0' + decimal;
3476 else if (0 < ch && ch < 256)
3477 *output++ = (char)ch;
3478 else {
3479 Py_DECREF(repunicode);
3480 raise_encode_exception(&exc, encoding,
3481 s, length, collstart-s, collend-s, reason);
3482 goto onError;
3483 }
3484 }
3485 }
3486 p = s + newpos;
3487 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003488 }
3489 }
3490 /* 0-terminate the output string */
3491 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 Py_XDECREF(exc);
3493 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003494 return 0;
3495
3496 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 Py_XDECREF(exc);
3498 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003499 return -1;
3500}
3501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502/* --- Helpers ------------------------------------------------------------ */
3503
3504static
3505int count(PyUnicodeObject *self,
3506 int start,
3507 int end,
3508 PyUnicodeObject *substring)
3509{
3510 int count = 0;
3511
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003512 if (start < 0)
3513 start += self->length;
3514 if (start < 0)
3515 start = 0;
3516 if (end > self->length)
3517 end = self->length;
3518 if (end < 0)
3519 end += self->length;
3520 if (end < 0)
3521 end = 0;
3522
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003523 if (substring->length == 0)
3524 return (end - start + 1);
3525
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 end -= substring->length;
3527
3528 while (start <= end)
3529 if (Py_UNICODE_MATCH(self, start, substring)) {
3530 count++;
3531 start += substring->length;
3532 } else
3533 start++;
3534
3535 return count;
3536}
3537
3538int PyUnicode_Count(PyObject *str,
3539 PyObject *substr,
3540 int start,
3541 int end)
3542{
3543 int result;
3544
3545 str = PyUnicode_FromObject(str);
3546 if (str == NULL)
3547 return -1;
3548 substr = PyUnicode_FromObject(substr);
3549 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003550 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return -1;
3552 }
3553
3554 result = count((PyUnicodeObject *)str,
3555 start, end,
3556 (PyUnicodeObject *)substr);
3557
3558 Py_DECREF(str);
3559 Py_DECREF(substr);
3560 return result;
3561}
3562
3563static
3564int findstring(PyUnicodeObject *self,
3565 PyUnicodeObject *substring,
3566 int start,
3567 int end,
3568 int direction)
3569{
3570 if (start < 0)
3571 start += self->length;
3572 if (start < 0)
3573 start = 0;
3574
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 if (end > self->length)
3576 end = self->length;
3577 if (end < 0)
3578 end += self->length;
3579 if (end < 0)
3580 end = 0;
3581
Guido van Rossum76afbd92002-08-20 17:29:29 +00003582 if (substring->length == 0)
3583 return (direction > 0) ? start : end;
3584
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 end -= substring->length;
3586
3587 if (direction < 0) {
3588 for (; end >= start; end--)
3589 if (Py_UNICODE_MATCH(self, end, substring))
3590 return end;
3591 } else {
3592 for (; start <= end; start++)
3593 if (Py_UNICODE_MATCH(self, start, substring))
3594 return start;
3595 }
3596
3597 return -1;
3598}
3599
3600int PyUnicode_Find(PyObject *str,
3601 PyObject *substr,
3602 int start,
3603 int end,
3604 int direction)
3605{
3606 int result;
3607
3608 str = PyUnicode_FromObject(str);
3609 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003610 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 substr = PyUnicode_FromObject(substr);
3612 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003613 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003614 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 }
3616
3617 result = findstring((PyUnicodeObject *)str,
3618 (PyUnicodeObject *)substr,
3619 start, end, direction);
3620 Py_DECREF(str);
3621 Py_DECREF(substr);
3622 return result;
3623}
3624
3625static
3626int tailmatch(PyUnicodeObject *self,
3627 PyUnicodeObject *substring,
3628 int start,
3629 int end,
3630 int direction)
3631{
3632 if (start < 0)
3633 start += self->length;
3634 if (start < 0)
3635 start = 0;
3636
3637 if (substring->length == 0)
3638 return 1;
3639
3640 if (end > self->length)
3641 end = self->length;
3642 if (end < 0)
3643 end += self->length;
3644 if (end < 0)
3645 end = 0;
3646
3647 end -= substring->length;
3648 if (end < start)
3649 return 0;
3650
3651 if (direction > 0) {
3652 if (Py_UNICODE_MATCH(self, end, substring))
3653 return 1;
3654 } else {
3655 if (Py_UNICODE_MATCH(self, start, substring))
3656 return 1;
3657 }
3658
3659 return 0;
3660}
3661
3662int PyUnicode_Tailmatch(PyObject *str,
3663 PyObject *substr,
3664 int start,
3665 int end,
3666 int direction)
3667{
3668 int result;
3669
3670 str = PyUnicode_FromObject(str);
3671 if (str == NULL)
3672 return -1;
3673 substr = PyUnicode_FromObject(substr);
3674 if (substr == NULL) {
3675 Py_DECREF(substr);
3676 return -1;
3677 }
3678
3679 result = tailmatch((PyUnicodeObject *)str,
3680 (PyUnicodeObject *)substr,
3681 start, end, direction);
3682 Py_DECREF(str);
3683 Py_DECREF(substr);
3684 return result;
3685}
3686
3687static
3688const Py_UNICODE *findchar(const Py_UNICODE *s,
3689 int size,
3690 Py_UNICODE ch)
3691{
3692 /* like wcschr, but doesn't stop at NULL characters */
3693
3694 while (size-- > 0) {
3695 if (*s == ch)
3696 return s;
3697 s++;
3698 }
3699
3700 return NULL;
3701}
3702
3703/* Apply fixfct filter to the Unicode object self and return a
3704 reference to the modified object */
3705
3706static
3707PyObject *fixup(PyUnicodeObject *self,
3708 int (*fixfct)(PyUnicodeObject *s))
3709{
3710
3711 PyUnicodeObject *u;
3712
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003713 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 if (u == NULL)
3715 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003716
3717 Py_UNICODE_COPY(u->str, self->str, self->length);
3718
Tim Peters7a29bd52001-09-12 03:03:31 +00003719 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 /* fixfct should return TRUE if it modified the buffer. If
3721 FALSE, return a reference to the original buffer instead
3722 (to save space, not time) */
3723 Py_INCREF(self);
3724 Py_DECREF(u);
3725 return (PyObject*) self;
3726 }
3727 return (PyObject*) u;
3728}
3729
3730static
3731int fixupper(PyUnicodeObject *self)
3732{
3733 int len = self->length;
3734 Py_UNICODE *s = self->str;
3735 int status = 0;
3736
3737 while (len-- > 0) {
3738 register Py_UNICODE ch;
3739
3740 ch = Py_UNICODE_TOUPPER(*s);
3741 if (ch != *s) {
3742 status = 1;
3743 *s = ch;
3744 }
3745 s++;
3746 }
3747
3748 return status;
3749}
3750
3751static
3752int fixlower(PyUnicodeObject *self)
3753{
3754 int len = self->length;
3755 Py_UNICODE *s = self->str;
3756 int status = 0;
3757
3758 while (len-- > 0) {
3759 register Py_UNICODE ch;
3760
3761 ch = Py_UNICODE_TOLOWER(*s);
3762 if (ch != *s) {
3763 status = 1;
3764 *s = ch;
3765 }
3766 s++;
3767 }
3768
3769 return status;
3770}
3771
3772static
3773int fixswapcase(PyUnicodeObject *self)
3774{
3775 int len = self->length;
3776 Py_UNICODE *s = self->str;
3777 int status = 0;
3778
3779 while (len-- > 0) {
3780 if (Py_UNICODE_ISUPPER(*s)) {
3781 *s = Py_UNICODE_TOLOWER(*s);
3782 status = 1;
3783 } else if (Py_UNICODE_ISLOWER(*s)) {
3784 *s = Py_UNICODE_TOUPPER(*s);
3785 status = 1;
3786 }
3787 s++;
3788 }
3789
3790 return status;
3791}
3792
3793static
3794int fixcapitalize(PyUnicodeObject *self)
3795{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003796 int len = self->length;
3797 Py_UNICODE *s = self->str;
3798 int status = 0;
3799
3800 if (len == 0)
3801 return 0;
3802 if (Py_UNICODE_ISLOWER(*s)) {
3803 *s = Py_UNICODE_TOUPPER(*s);
3804 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003806 s++;
3807 while (--len > 0) {
3808 if (Py_UNICODE_ISUPPER(*s)) {
3809 *s = Py_UNICODE_TOLOWER(*s);
3810 status = 1;
3811 }
3812 s++;
3813 }
3814 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815}
3816
3817static
3818int fixtitle(PyUnicodeObject *self)
3819{
3820 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3821 register Py_UNICODE *e;
3822 int previous_is_cased;
3823
3824 /* Shortcut for single character strings */
3825 if (PyUnicode_GET_SIZE(self) == 1) {
3826 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3827 if (*p != ch) {
3828 *p = ch;
3829 return 1;
3830 }
3831 else
3832 return 0;
3833 }
3834
3835 e = p + PyUnicode_GET_SIZE(self);
3836 previous_is_cased = 0;
3837 for (; p < e; p++) {
3838 register const Py_UNICODE ch = *p;
3839
3840 if (previous_is_cased)
3841 *p = Py_UNICODE_TOLOWER(ch);
3842 else
3843 *p = Py_UNICODE_TOTITLE(ch);
3844
3845 if (Py_UNICODE_ISLOWER(ch) ||
3846 Py_UNICODE_ISUPPER(ch) ||
3847 Py_UNICODE_ISTITLE(ch))
3848 previous_is_cased = 1;
3849 else
3850 previous_is_cased = 0;
3851 }
3852 return 1;
3853}
3854
3855PyObject *PyUnicode_Join(PyObject *separator,
3856 PyObject *seq)
3857{
3858 Py_UNICODE *sep;
3859 int seplen;
3860 PyUnicodeObject *res = NULL;
3861 int reslen = 0;
3862 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 int sz = 100;
3864 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003865 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866
Tim Peters2cfe3682001-05-05 05:36:48 +00003867 it = PyObject_GetIter(seq);
3868 if (it == NULL)
3869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870
3871 if (separator == NULL) {
3872 Py_UNICODE blank = ' ';
3873 sep = &blank;
3874 seplen = 1;
3875 }
3876 else {
3877 separator = PyUnicode_FromObject(separator);
3878 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 sep = PyUnicode_AS_UNICODE(separator);
3881 seplen = PyUnicode_GET_SIZE(separator);
3882 }
3883
3884 res = _PyUnicode_New(sz);
3885 if (res == NULL)
3886 goto onError;
3887 p = PyUnicode_AS_UNICODE(res);
3888 reslen = 0;
3889
Tim Peters2cfe3682001-05-05 05:36:48 +00003890 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003892 PyObject *item = PyIter_Next(it);
3893 if (item == NULL) {
3894 if (PyErr_Occurred())
3895 goto onError;
3896 break;
3897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 if (!PyUnicode_Check(item)) {
3899 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003900 if (!PyString_Check(item)) {
3901 PyErr_Format(PyExc_TypeError,
3902 "sequence item %i: expected string or Unicode,"
3903 " %.80s found",
3904 i, item->ob_type->tp_name);
3905 Py_DECREF(item);
3906 goto onError;
3907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 v = PyUnicode_FromObject(item);
3909 Py_DECREF(item);
3910 item = v;
3911 if (item == NULL)
3912 goto onError;
3913 }
3914 itemlen = PyUnicode_GET_SIZE(item);
3915 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003916 if (_PyUnicode_Resize(&res, sz*2)) {
3917 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 sz *= 2;
3921 p = PyUnicode_AS_UNICODE(res) + reslen;
3922 }
3923 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003924 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 p += seplen;
3926 reslen += seplen;
3927 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003928 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 p += itemlen;
3930 reslen += itemlen;
3931 Py_DECREF(item);
3932 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003933 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 goto onError;
3935
3936 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003937 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 return (PyObject *)res;
3939
3940 onError:
3941 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003942 Py_XDECREF(res);
3943 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 return NULL;
3945}
3946
3947static
3948PyUnicodeObject *pad(PyUnicodeObject *self,
3949 int left,
3950 int right,
3951 Py_UNICODE fill)
3952{
3953 PyUnicodeObject *u;
3954
3955 if (left < 0)
3956 left = 0;
3957 if (right < 0)
3958 right = 0;
3959
Tim Peters7a29bd52001-09-12 03:03:31 +00003960 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 Py_INCREF(self);
3962 return self;
3963 }
3964
3965 u = _PyUnicode_New(left + self->length + right);
3966 if (u) {
3967 if (left)
3968 Py_UNICODE_FILL(u->str, fill, left);
3969 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3970 if (right)
3971 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3972 }
3973
3974 return u;
3975}
3976
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.

   The expansion site must provide a "PyObject *str" scratch variable,
   a "PyObject *list" to append to and an "onError" label, which is the
   jump target if creating or appending the object fails.  The
   temporary reference in str is always released.

   The body is wrapped in do { } while (0) so that an invocation
   followed by ';' is exactly one statement, and the arguments are
   parenthesized so that arbitrary expressions may be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3987
3988static
3989PyObject *split_whitespace(PyUnicodeObject *self,
3990 PyObject *list,
3991 int maxcount)
3992{
3993 register int i;
3994 register int j;
3995 int len = self->length;
3996 PyObject *str;
3997
3998 for (i = j = 0; i < len; ) {
3999 /* find a token */
4000 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4001 i++;
4002 j = i;
4003 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4004 i++;
4005 if (j < i) {
4006 if (maxcount-- <= 0)
4007 break;
4008 SPLIT_APPEND(self->str, j, i);
4009 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4010 i++;
4011 j = i;
4012 }
4013 }
4014 if (j < len) {
4015 SPLIT_APPEND(self->str, j, len);
4016 }
4017 return list;
4018
4019 onError:
4020 Py_DECREF(list);
4021 return NULL;
4022}
4023
4024PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004025 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026{
4027 register int i;
4028 register int j;
4029 int len;
4030 PyObject *list;
4031 PyObject *str;
4032 Py_UNICODE *data;
4033
4034 string = PyUnicode_FromObject(string);
4035 if (string == NULL)
4036 return NULL;
4037 data = PyUnicode_AS_UNICODE(string);
4038 len = PyUnicode_GET_SIZE(string);
4039
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 list = PyList_New(0);
4041 if (!list)
4042 goto onError;
4043
4044 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004045 int eol;
4046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 /* Find a line and append it */
4048 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4049 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050
4051 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004052 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (i < len) {
4054 if (data[i] == '\r' && i + 1 < len &&
4055 data[i+1] == '\n')
4056 i += 2;
4057 else
4058 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004059 if (keepends)
4060 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 }
Guido van Rossum86662912000-04-11 15:38:46 +00004062 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 j = i;
4064 }
4065 if (j < len) {
4066 SPLIT_APPEND(data, j, len);
4067 }
4068
4069 Py_DECREF(string);
4070 return list;
4071
4072 onError:
4073 Py_DECREF(list);
4074 Py_DECREF(string);
4075 return NULL;
4076}
4077
4078static
4079PyObject *split_char(PyUnicodeObject *self,
4080 PyObject *list,
4081 Py_UNICODE ch,
4082 int maxcount)
4083{
4084 register int i;
4085 register int j;
4086 int len = self->length;
4087 PyObject *str;
4088
4089 for (i = j = 0; i < len; ) {
4090 if (self->str[i] == ch) {
4091 if (maxcount-- <= 0)
4092 break;
4093 SPLIT_APPEND(self->str, j, i);
4094 i = j = i + 1;
4095 } else
4096 i++;
4097 }
4098 if (j <= len) {
4099 SPLIT_APPEND(self->str, j, len);
4100 }
4101 return list;
4102
4103 onError:
4104 Py_DECREF(list);
4105 return NULL;
4106}
4107
4108static
4109PyObject *split_substring(PyUnicodeObject *self,
4110 PyObject *list,
4111 PyUnicodeObject *substring,
4112 int maxcount)
4113{
4114 register int i;
4115 register int j;
4116 int len = self->length;
4117 int sublen = substring->length;
4118 PyObject *str;
4119
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004120 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 if (Py_UNICODE_MATCH(self, i, substring)) {
4122 if (maxcount-- <= 0)
4123 break;
4124 SPLIT_APPEND(self->str, j, i);
4125 i = j = i + sublen;
4126 } else
4127 i++;
4128 }
4129 if (j <= len) {
4130 SPLIT_APPEND(self->str, j, len);
4131 }
4132 return list;
4133
4134 onError:
4135 Py_DECREF(list);
4136 return NULL;
4137}
4138
4139#undef SPLIT_APPEND
4140
4141static
4142PyObject *split(PyUnicodeObject *self,
4143 PyUnicodeObject *substring,
4144 int maxcount)
4145{
4146 PyObject *list;
4147
4148 if (maxcount < 0)
4149 maxcount = INT_MAX;
4150
4151 list = PyList_New(0);
4152 if (!list)
4153 return NULL;
4154
4155 if (substring == NULL)
4156 return split_whitespace(self,list,maxcount);
4157
4158 else if (substring->length == 1)
4159 return split_char(self,list,substring->str[0],maxcount);
4160
4161 else if (substring->length == 0) {
4162 Py_DECREF(list);
4163 PyErr_SetString(PyExc_ValueError, "empty separator");
4164 return NULL;
4165 }
4166 else
4167 return split_substring(self,list,substring,maxcount);
4168}
4169
4170static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171PyObject *replace(PyUnicodeObject *self,
4172 PyUnicodeObject *str1,
4173 PyUnicodeObject *str2,
4174 int maxcount)
4175{
4176 PyUnicodeObject *u;
4177
4178 if (maxcount < 0)
4179 maxcount = INT_MAX;
4180
4181 if (str1->length == 1 && str2->length == 1) {
4182 int i;
4183
4184 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004185 if (!findchar(self->str, self->length, str1->str[0]) &&
4186 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 /* nothing to replace, return original string */
4188 Py_INCREF(self);
4189 u = self;
4190 } else {
4191 Py_UNICODE u1 = str1->str[0];
4192 Py_UNICODE u2 = str2->str[0];
4193
4194 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004195 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 self->length
4197 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004198 if (u != NULL) {
4199 Py_UNICODE_COPY(u->str, self->str,
4200 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 for (i = 0; i < u->length; i++)
4202 if (u->str[i] == u1) {
4203 if (--maxcount < 0)
4204 break;
4205 u->str[i] = u2;
4206 }
4207 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209
4210 } else {
4211 int n, i;
4212 Py_UNICODE *p;
4213
4214 /* replace strings */
4215 n = count(self, 0, self->length, str1);
4216 if (n > maxcount)
4217 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004218 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004220 if (PyUnicode_CheckExact(self)) {
4221 Py_INCREF(self);
4222 u = self;
4223 }
4224 else {
4225 u = (PyUnicodeObject *)
4226 PyUnicode_FromUnicode(self->str, self->length);
4227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 } else {
4229 u = _PyUnicode_New(
4230 self->length + n * (str2->length - str1->length));
4231 if (u) {
4232 i = 0;
4233 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004234 if (str1->length > 0) {
4235 while (i <= self->length - str1->length)
4236 if (Py_UNICODE_MATCH(self, i, str1)) {
4237 /* replace string segment */
4238 Py_UNICODE_COPY(p, str2->str, str2->length);
4239 p += str2->length;
4240 i += str1->length;
4241 if (--n <= 0) {
4242 /* copy remaining part */
4243 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4244 break;
4245 }
4246 } else
4247 *p++ = self->str[i++];
4248 } else {
4249 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250 Py_UNICODE_COPY(p, str2->str, str2->length);
4251 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004252 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004255 }
4256 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 }
4259 }
4260 }
4261
4262 return (PyObject *) u;
4263}
4264
4265/* --- Unicode Object Methods --------------------------------------------- */
4266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004267PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268"S.title() -> unicode\n\
4269\n\
4270Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004271characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272
4273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004274unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 return fixup(self, fixtitle);
4277}
4278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004279PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280"S.capitalize() -> unicode\n\
4281\n\
4282Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004283have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284
4285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004286unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 return fixup(self, fixcapitalize);
4289}
4290
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Method S.capwords() (currently compiled out): split self on
   whitespace, capitalize every word and join the words again with
   single spaces.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Swap the list slot over to the capitalized word. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004329PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330"S.center(width) -> unicode\n\
4331\n\
4332Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004333using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334
4335static PyObject *
4336unicode_center(PyUnicodeObject *self, PyObject *args)
4337{
4338 int marg, left;
4339 int width;
4340
4341 if (!PyArg_ParseTuple(args, "i:center", &width))
4342 return NULL;
4343
Tim Peters7a29bd52001-09-12 03:03:31 +00004344 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 Py_INCREF(self);
4346 return (PyObject*) self;
4347 }
4348
4349 marg = width - self->length;
4350 left = marg / 2 + (marg & width & 1);
4351
4352 return (PyObject*) pad(self, left, marg - left, ' ');
4353}
4354
Marc-André Lemburge5034372000-08-08 08:04:29 +00004355#if 0
4356
4357/* This code should go into some future Unicode collation support
4358 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004359 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004360
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004361/* speedy UTF-16 code point order comparison */
4362/* gleaned from: */
4363/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4364
/* Per-block adjustment used by the UTF-16 code point order comparison;
   the table is indexed with the top five bits of a 16-bit code unit
   (c >> 11).  Entries 0-26 are zero, entry 27 is 0x2000 and entries
   28-31 are -0x800. */
static short utf16Fixup[32] =
{
    /*  0 -  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 - 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 - 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 - 26 */ 0, 0, 0,
    /* 27      */ 0x2000,
    /* 28 - 31 */ -0x800, -0x800, -0x800, -0x800
};
4372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373static int
4374unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4375{
4376 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 Py_UNICODE *s1 = str1->str;
4379 Py_UNICODE *s2 = str2->str;
4380
4381 len1 = str1->length;
4382 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004383
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004385 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004386
4387 c1 = *s1++;
4388 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004389
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004390 if (c1 > (1<<11) * 26)
4391 c1 += utf16Fixup[c1>>11];
4392 if (c2 > (1<<11) * 26)
4393 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004394 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004395
4396 if (c1 != c2)
4397 return (c1 < c2) ? -1 : 1;
4398
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004399 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 }
4401
4402 return (len1 < len2) ? -1 : (len1 != len2);
4403}
4404
Marc-André Lemburge5034372000-08-08 08:04:29 +00004405#else
4406
4407static int
4408unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4409{
4410 register int len1, len2;
4411
4412 Py_UNICODE *s1 = str1->str;
4413 Py_UNICODE *s2 = str2->str;
4414
4415 len1 = str1->length;
4416 len2 = str2->length;
4417
4418 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004419 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004420
Fredrik Lundh45714e92001-06-26 16:39:36 +00004421 c1 = *s1++;
4422 c2 = *s2++;
4423
4424 if (c1 != c2)
4425 return (c1 < c2) ? -1 : 1;
4426
Marc-André Lemburge5034372000-08-08 08:04:29 +00004427 len1--; len2--;
4428 }
4429
4430 return (len1 < len2) ? -1 : (len1 != len2);
4431}
4432
4433#endif
4434
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435int PyUnicode_Compare(PyObject *left,
4436 PyObject *right)
4437{
4438 PyUnicodeObject *u = NULL, *v = NULL;
4439 int result;
4440
4441 /* Coerce the two arguments */
4442 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4443 if (u == NULL)
4444 goto onError;
4445 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4446 if (v == NULL)
4447 goto onError;
4448
Thomas Wouters7e474022000-07-16 12:04:32 +00004449 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 if (v == u) {
4451 Py_DECREF(u);
4452 Py_DECREF(v);
4453 return 0;
4454 }
4455
4456 result = unicode_compare(u, v);
4457
4458 Py_DECREF(u);
4459 Py_DECREF(v);
4460 return result;
4461
4462onError:
4463 Py_XDECREF(u);
4464 Py_XDECREF(v);
4465 return -1;
4466}
4467
Guido van Rossum403d68b2000-03-13 15:55:09 +00004468int PyUnicode_Contains(PyObject *container,
4469 PyObject *element)
4470{
4471 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004472 int result, size;
4473 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004474
4475 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004476 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004477 if (v == NULL) {
4478 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004479 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004480 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004481 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004482 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004483 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004484 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004485
Barry Warsaw817918c2002-08-06 16:58:21 +00004486 size = PyUnicode_GET_SIZE(v);
4487 rhs = PyUnicode_AS_UNICODE(v);
4488 lhs = PyUnicode_AS_UNICODE(u);
4489
Guido van Rossum403d68b2000-03-13 15:55:09 +00004490 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004491 if (size == 1) {
4492 end = lhs + PyUnicode_GET_SIZE(u);
4493 while (lhs < end) {
4494 if (*lhs++ == *rhs) {
4495 result = 1;
4496 break;
4497 }
4498 }
4499 }
4500 else {
4501 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4502 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004503 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004504 result = 1;
4505 break;
4506 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004507 }
4508 }
4509
4510 Py_DECREF(u);
4511 Py_DECREF(v);
4512 return result;
4513
4514onError:
4515 Py_XDECREF(u);
4516 Py_XDECREF(v);
4517 return -1;
4518}
4519
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520/* Concat to string or Unicode object giving a new Unicode object. */
4521
4522PyObject *PyUnicode_Concat(PyObject *left,
4523 PyObject *right)
4524{
4525 PyUnicodeObject *u = NULL, *v = NULL, *w;
4526
4527 /* Coerce the two arguments */
4528 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4529 if (u == NULL)
4530 goto onError;
4531 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4532 if (v == NULL)
4533 goto onError;
4534
4535 /* Shortcuts */
4536 if (v == unicode_empty) {
4537 Py_DECREF(v);
4538 return (PyObject *)u;
4539 }
4540 if (u == unicode_empty) {
4541 Py_DECREF(u);
4542 return (PyObject *)v;
4543 }
4544
4545 /* Concat the two Unicode strings */
4546 w = _PyUnicode_New(u->length + v->length);
4547 if (w == NULL)
4548 goto onError;
4549 Py_UNICODE_COPY(w->str, u->str, u->length);
4550 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4551
4552 Py_DECREF(u);
4553 Py_DECREF(v);
4554 return (PyObject *)w;
4555
4556onError:
4557 Py_XDECREF(u);
4558 Py_XDECREF(v);
4559 return NULL;
4560}
4561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004562PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563"S.count(sub[, start[, end]]) -> int\n\
4564\n\
4565Return the number of occurrences of substring sub in Unicode string\n\
4566S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004567interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568
4569static PyObject *
4570unicode_count(PyUnicodeObject *self, PyObject *args)
4571{
4572 PyUnicodeObject *substring;
4573 int start = 0;
4574 int end = INT_MAX;
4575 PyObject *result;
4576
Guido van Rossumb8872e62000-05-09 14:14:27 +00004577 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 return NULL;
4580
4581 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4582 (PyObject *)substring);
4583 if (substring == NULL)
4584 return NULL;
4585
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 if (start < 0)
4587 start += self->length;
4588 if (start < 0)
4589 start = 0;
4590 if (end > self->length)
4591 end = self->length;
4592 if (end < 0)
4593 end += self->length;
4594 if (end < 0)
4595 end = 0;
4596
4597 result = PyInt_FromLong((long) count(self, start, end, substring));
4598
4599 Py_DECREF(substring);
4600 return result;
4601}
4602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004603PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604"S.encode([encoding[,errors]]) -> string\n\
4605\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004606Return an encoded string version of S. Default encoding is the current\n\
4607default string encoding. errors may be given to set a different error\n\
4608handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4610'xmlcharrefreplace' as well as any other name registered with\n\
4611codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
4613static PyObject *
4614unicode_encode(PyUnicodeObject *self, PyObject *args)
4615{
4616 char *encoding = NULL;
4617 char *errors = NULL;
4618 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4619 return NULL;
4620 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4621}
4622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004623PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624"S.expandtabs([tabsize]) -> unicode\n\
4625\n\
4626Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004627If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
4629static PyObject*
4630unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4631{
4632 Py_UNICODE *e;
4633 Py_UNICODE *p;
4634 Py_UNICODE *q;
4635 int i, j;
4636 PyUnicodeObject *u;
4637 int tabsize = 8;
4638
4639 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4640 return NULL;
4641
Thomas Wouters7e474022000-07-16 12:04:32 +00004642 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 i = j = 0;
4644 e = self->str + self->length;
4645 for (p = self->str; p < e; p++)
4646 if (*p == '\t') {
4647 if (tabsize > 0)
4648 j += tabsize - (j % tabsize);
4649 }
4650 else {
4651 j++;
4652 if (*p == '\n' || *p == '\r') {
4653 i += j;
4654 j = 0;
4655 }
4656 }
4657
4658 /* Second pass: create output string and fill it */
4659 u = _PyUnicode_New(i + j);
4660 if (!u)
4661 return NULL;
4662
4663 j = 0;
4664 q = u->str;
4665
4666 for (p = self->str; p < e; p++)
4667 if (*p == '\t') {
4668 if (tabsize > 0) {
4669 i = tabsize - (j % tabsize);
4670 j += i;
4671 while (i--)
4672 *q++ = ' ';
4673 }
4674 }
4675 else {
4676 j++;
4677 *q++ = *p;
4678 if (*p == '\n' || *p == '\r')
4679 j = 0;
4680 }
4681
4682 return (PyObject*) u;
4683}
4684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004685PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686"S.find(sub [,start [,end]]) -> int\n\
4687\n\
4688Return the lowest index in S where substring sub is found,\n\
4689such that sub is contained within s[start,end]. Optional\n\
4690arguments start and end are interpreted as in slice notation.\n\
4691\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004692Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693
4694static PyObject *
4695unicode_find(PyUnicodeObject *self, PyObject *args)
4696{
4697 PyUnicodeObject *substring;
4698 int start = 0;
4699 int end = INT_MAX;
4700 PyObject *result;
4701
Guido van Rossumb8872e62000-05-09 14:14:27 +00004702 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4703 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 return NULL;
4705 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4706 (PyObject *)substring);
4707 if (substring == NULL)
4708 return NULL;
4709
4710 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4711
4712 Py_DECREF(substring);
4713 return result;
4714}
4715
4716static PyObject *
4717unicode_getitem(PyUnicodeObject *self, int index)
4718{
4719 if (index < 0 || index >= self->length) {
4720 PyErr_SetString(PyExc_IndexError, "string index out of range");
4721 return NULL;
4722 }
4723
4724 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4725}
4726
4727static long
4728unicode_hash(PyUnicodeObject *self)
4729{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004730 /* Since Unicode objects compare equal to their ASCII string
4731 counterparts, they should use the individual character values
4732 as basis for their hash value. This is needed to assure that
4733 strings and Unicode objects behave in the same way as
4734 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735
Fredrik Lundhdde61642000-07-10 18:27:47 +00004736 register int len;
4737 register Py_UNICODE *p;
4738 register long x;
4739
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (self->hash != -1)
4741 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004742 len = PyUnicode_GET_SIZE(self);
4743 p = PyUnicode_AS_UNICODE(self);
4744 x = *p << 7;
4745 while (--len >= 0)
4746 x = (1000003*x) ^ *p++;
4747 x ^= PyUnicode_GET_SIZE(self);
4748 if (x == -1)
4749 x = -2;
4750 self->hash = x;
4751 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752}
4753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004754PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755"S.index(sub [,start [,end]]) -> int\n\
4756\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004757Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759static PyObject *
4760unicode_index(PyUnicodeObject *self, PyObject *args)
4761{
4762 int result;
4763 PyUnicodeObject *substring;
4764 int start = 0;
4765 int end = INT_MAX;
4766
Guido van Rossumb8872e62000-05-09 14:14:27 +00004767 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4768 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 return NULL;
4770
4771 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4772 (PyObject *)substring);
4773 if (substring == NULL)
4774 return NULL;
4775
4776 result = findstring(self, substring, start, end, 1);
4777
4778 Py_DECREF(substring);
4779 if (result < 0) {
4780 PyErr_SetString(PyExc_ValueError, "substring not found");
4781 return NULL;
4782 }
4783 return PyInt_FromLong(result);
4784}
4785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004786PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004787"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004789Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004790at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791
4792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004793unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794{
4795 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4796 register const Py_UNICODE *e;
4797 int cased;
4798
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 /* Shortcut for single character strings */
4800 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004801 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004803 /* Special case for empty strings */
4804 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004805 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 e = p + PyUnicode_GET_SIZE(self);
4808 cased = 0;
4809 for (; p < e; p++) {
4810 register const Py_UNICODE ch = *p;
4811
4812 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004813 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 else if (!cased && Py_UNICODE_ISLOWER(ch))
4815 cased = 1;
4816 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004817 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818}
4819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004820PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004821"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004823Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004824at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
4826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004827unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828{
4829 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4830 register const Py_UNICODE *e;
4831 int cased;
4832
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 /* Shortcut for single character strings */
4834 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004835 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004837 /* Special case for empty strings */
4838 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004839 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 e = p + PyUnicode_GET_SIZE(self);
4842 cased = 0;
4843 for (; p < e; p++) {
4844 register const Py_UNICODE ch = *p;
4845
4846 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004847 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 else if (!cased && Py_UNICODE_ISUPPER(ch))
4849 cased = 1;
4850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004851 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852}
4853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004854PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004855"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004857Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4858characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004859only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860
4861static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004862unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863{
4864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4865 register const Py_UNICODE *e;
4866 int cased, previous_is_cased;
4867
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 /* Shortcut for single character strings */
4869 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004870 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4871 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004873 /* Special case for empty strings */
4874 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004875 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004876
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 e = p + PyUnicode_GET_SIZE(self);
4878 cased = 0;
4879 previous_is_cased = 0;
4880 for (; p < e; p++) {
4881 register const Py_UNICODE ch = *p;
4882
4883 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4884 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004885 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 previous_is_cased = 1;
4887 cased = 1;
4888 }
4889 else if (Py_UNICODE_ISLOWER(ch)) {
4890 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004891 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 previous_is_cased = 1;
4893 cased = 1;
4894 }
4895 else
4896 previous_is_cased = 0;
4897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004898 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899}
4900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004901PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004902"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004904Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004905False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906
4907static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004908unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909{
4910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4911 register const Py_UNICODE *e;
4912
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 /* Shortcut for single character strings */
4914 if (PyUnicode_GET_SIZE(self) == 1 &&
4915 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004916 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004918 /* Special case for empty strings */
4919 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004920 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004921
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 e = p + PyUnicode_GET_SIZE(self);
4923 for (; p < e; p++) {
4924 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004925 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004930PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004931"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004932\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004933Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004934and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004935
4936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004937unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004938{
4939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4940 register const Py_UNICODE *e;
4941
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004942 /* Shortcut for single character strings */
4943 if (PyUnicode_GET_SIZE(self) == 1 &&
4944 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004945 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004946
4947 /* Special case for empty strings */
4948 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004949 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004950
4951 e = p + PyUnicode_GET_SIZE(self);
4952 for (; p < e; p++) {
4953 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004954 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004956 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004957}
4958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004959PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004960"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004961\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004962Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004963and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004964
4965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004966unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004967{
4968 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4969 register const Py_UNICODE *e;
4970
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004971 /* Shortcut for single character strings */
4972 if (PyUnicode_GET_SIZE(self) == 1 &&
4973 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004974 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004975
4976 /* Special case for empty strings */
4977 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004978 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004979
4980 e = p + PyUnicode_GET_SIZE(self);
4981 for (; p < e; p++) {
4982 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004983 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004984 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004985 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004986}
4987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004988PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004989"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004991Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004992False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993
4994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004995unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996{
4997 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4998 register const Py_UNICODE *e;
4999
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 /* Shortcut for single character strings */
5001 if (PyUnicode_GET_SIZE(self) == 1 &&
5002 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005003 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005005 /* Special case for empty strings */
5006 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005007 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005008
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 e = p + PyUnicode_GET_SIZE(self);
5010 for (; p < e; p++) {
5011 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005012 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005014 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015}
5016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005017PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005018"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005020Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005021False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022
5023static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005024unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025{
5026 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5027 register const Py_UNICODE *e;
5028
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 /* Shortcut for single character strings */
5030 if (PyUnicode_GET_SIZE(self) == 1 &&
5031 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005034 /* Special case for empty strings */
5035 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005036 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005037
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 e = p + PyUnicode_GET_SIZE(self);
5039 for (; p < e; p++) {
5040 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005041 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005043 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044}
5045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005046PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005047"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005049Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005050False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051
5052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005053unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054{
5055 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5056 register const Py_UNICODE *e;
5057
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 /* Shortcut for single character strings */
5059 if (PyUnicode_GET_SIZE(self) == 1 &&
5060 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005061 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005063 /* Special case for empty strings */
5064 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005065 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005066
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 e = p + PyUnicode_GET_SIZE(self);
5068 for (; p < e; p++) {
5069 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005070 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005072 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073}
5074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005075PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076"S.join(sequence) -> unicode\n\
5077\n\
5078Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005079sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080
5081static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005082unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005084 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085}
5086
5087static int
5088unicode_length(PyUnicodeObject *self)
5089{
5090 return self->length;
5091}
5092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005093PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094"S.ljust(width) -> unicode\n\
5095\n\
5096Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005097done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
5099static PyObject *
5100unicode_ljust(PyUnicodeObject *self, PyObject *args)
5101{
5102 int width;
5103 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5104 return NULL;
5105
Tim Peters7a29bd52001-09-12 03:03:31 +00005106 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 Py_INCREF(self);
5108 return (PyObject*) self;
5109 }
5110
5111 return (PyObject*) pad(self, 0, width - self->length, ' ');
5112}
5113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005114PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115"S.lower() -> unicode\n\
5116\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005117Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118
5119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005120unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 return fixup(self, fixlower);
5123}
5124
/* Strip modes shared by the strip helpers; the values double as
   indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
5133
5134static const Py_UNICODE *
5135unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5136{
Tim Peters030a5ce2002-04-22 19:00:10 +00005137 size_t i;
5138 for (i = 0; i < n; ++i)
5139 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005140 return s+i;
5141 return NULL;
5142}
5143
5144/* externally visible for str.strip(unicode) */
5145PyObject *
5146_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5147{
5148 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5149 int len = PyUnicode_GET_SIZE(self);
5150 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5151 int seplen = PyUnicode_GET_SIZE(sepobj);
5152 int i, j;
5153
5154 i = 0;
5155 if (striptype != RIGHTSTRIP) {
5156 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5157 i++;
5158 }
5159 }
5160
5161 j = len;
5162 if (striptype != LEFTSTRIP) {
5163 do {
5164 j--;
5165 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5166 j++;
5167 }
5168
5169 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5170 Py_INCREF(self);
5171 return (PyObject*)self;
5172 }
5173 else
5174 return PyUnicode_FromUnicode(s+i, j-i);
5175}
5176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177
5178static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005179do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005181 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5182 int len = PyUnicode_GET_SIZE(self), i, j;
5183
5184 i = 0;
5185 if (striptype != RIGHTSTRIP) {
5186 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5187 i++;
5188 }
5189 }
5190
5191 j = len;
5192 if (striptype != LEFTSTRIP) {
5193 do {
5194 j--;
5195 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5196 j++;
5197 }
5198
5199 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5200 Py_INCREF(self);
5201 return (PyObject*)self;
5202 }
5203 else
5204 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205}
5206
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005207
5208static PyObject *
5209do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5210{
5211 PyObject *sep = NULL;
5212
5213 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5214 return NULL;
5215
5216 if (sep != NULL && sep != Py_None) {
5217 if (PyUnicode_Check(sep))
5218 return _PyUnicode_XStrip(self, striptype, sep);
5219 else if (PyString_Check(sep)) {
5220 PyObject *res;
5221 sep = PyUnicode_FromObject(sep);
5222 if (sep==NULL)
5223 return NULL;
5224 res = _PyUnicode_XStrip(self, striptype, sep);
5225 Py_DECREF(sep);
5226 return res;
5227 }
5228 else {
5229 PyErr_Format(PyExc_TypeError,
5230 "%s arg must be None, unicode or str",
5231 STRIPNAME(striptype));
5232 return NULL;
5233 }
5234 }
5235
5236 return do_strip(self, striptype);
5237}
5238
5239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005240PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005241"S.strip([sep]) -> unicode\n\
5242\n\
5243Return a copy of the string S with leading and trailing\n\
5244whitespace removed.\n\
5245If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005246If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005247
5248static PyObject *
5249unicode_strip(PyUnicodeObject *self, PyObject *args)
5250{
5251 if (PyTuple_GET_SIZE(args) == 0)
5252 return do_strip(self, BOTHSTRIP); /* Common case */
5253 else
5254 return do_argstrip(self, BOTHSTRIP, args);
5255}
5256
5257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005258PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005259"S.lstrip([sep]) -> unicode\n\
5260\n\
5261Return a copy of the string S with leading whitespace removed.\n\
5262If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005263If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005264
5265static PyObject *
5266unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5267{
5268 if (PyTuple_GET_SIZE(args) == 0)
5269 return do_strip(self, LEFTSTRIP); /* Common case */
5270 else
5271 return do_argstrip(self, LEFTSTRIP, args);
5272}
5273
5274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005275PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005276"S.rstrip([sep]) -> unicode\n\
5277\n\
5278Return a copy of the string S with trailing whitespace removed.\n\
5279If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005280If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005281
5282static PyObject *
5283unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5284{
5285 if (PyTuple_GET_SIZE(args) == 0)
5286 return do_strip(self, RIGHTSTRIP); /* Common case */
5287 else
5288 return do_argstrip(self, RIGHTSTRIP, args);
5289}
5290
5291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292static PyObject*
5293unicode_repeat(PyUnicodeObject *str, int len)
5294{
5295 PyUnicodeObject *u;
5296 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005297 int nchars;
5298 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
5300 if (len < 0)
5301 len = 0;
5302
Tim Peters7a29bd52001-09-12 03:03:31 +00005303 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 /* no repeat, return original string */
5305 Py_INCREF(str);
5306 return (PyObject*) str;
5307 }
Tim Peters8f422462000-09-09 06:13:41 +00005308
5309 /* ensure # of chars needed doesn't overflow int and # of bytes
5310 * needed doesn't overflow size_t
5311 */
5312 nchars = len * str->length;
5313 if (len && nchars / len != str->length) {
5314 PyErr_SetString(PyExc_OverflowError,
5315 "repeated string is too long");
5316 return NULL;
5317 }
5318 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5319 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5320 PyErr_SetString(PyExc_OverflowError,
5321 "repeated string is too long");
5322 return NULL;
5323 }
5324 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 if (!u)
5326 return NULL;
5327
5328 p = u->str;
5329
5330 while (len-- > 0) {
5331 Py_UNICODE_COPY(p, str->str, str->length);
5332 p += str->length;
5333 }
5334
5335 return (PyObject*) u;
5336}
5337
5338PyObject *PyUnicode_Replace(PyObject *obj,
5339 PyObject *subobj,
5340 PyObject *replobj,
5341 int maxcount)
5342{
5343 PyObject *self;
5344 PyObject *str1;
5345 PyObject *str2;
5346 PyObject *result;
5347
5348 self = PyUnicode_FromObject(obj);
5349 if (self == NULL)
5350 return NULL;
5351 str1 = PyUnicode_FromObject(subobj);
5352 if (str1 == NULL) {
5353 Py_DECREF(self);
5354 return NULL;
5355 }
5356 str2 = PyUnicode_FromObject(replobj);
5357 if (str2 == NULL) {
5358 Py_DECREF(self);
5359 Py_DECREF(str1);
5360 return NULL;
5361 }
5362 result = replace((PyUnicodeObject *)self,
5363 (PyUnicodeObject *)str1,
5364 (PyUnicodeObject *)str2,
5365 maxcount);
5366 Py_DECREF(self);
5367 Py_DECREF(str1);
5368 Py_DECREF(str2);
5369 return result;
5370}
5371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005372PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373"S.replace (old, new[, maxsplit]) -> unicode\n\
5374\n\
5375Return a copy of S with all occurrences of substring\n\
5376old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005377given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378
5379static PyObject*
5380unicode_replace(PyUnicodeObject *self, PyObject *args)
5381{
5382 PyUnicodeObject *str1;
5383 PyUnicodeObject *str2;
5384 int maxcount = -1;
5385 PyObject *result;
5386
5387 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5388 return NULL;
5389 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5390 if (str1 == NULL)
5391 return NULL;
5392 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5393 if (str2 == NULL)
5394 return NULL;
5395
5396 result = replace(self, str1, str2, maxcount);
5397
5398 Py_DECREF(str1);
5399 Py_DECREF(str2);
5400 return result;
5401}
5402
5403static
5404PyObject *unicode_repr(PyObject *unicode)
5405{
5406 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5407 PyUnicode_GET_SIZE(unicode),
5408 1);
5409}
5410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005411PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412"S.rfind(sub [,start [,end]]) -> int\n\
5413\n\
5414Return the highest index in S where substring sub is found,\n\
5415such that sub is contained within s[start,end]. Optional\n\
5416arguments start and end are interpreted as in slice notation.\n\
5417\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005418Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419
5420static PyObject *
5421unicode_rfind(PyUnicodeObject *self, PyObject *args)
5422{
5423 PyUnicodeObject *substring;
5424 int start = 0;
5425 int end = INT_MAX;
5426 PyObject *result;
5427
Guido van Rossumb8872e62000-05-09 14:14:27 +00005428 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5429 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return NULL;
5431 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5432 (PyObject *)substring);
5433 if (substring == NULL)
5434 return NULL;
5435
5436 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5437
5438 Py_DECREF(substring);
5439 return result;
5440}
5441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005442PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443"S.rindex(sub [,start [,end]]) -> int\n\
5444\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005445Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446
5447static PyObject *
5448unicode_rindex(PyUnicodeObject *self, PyObject *args)
5449{
5450 int result;
5451 PyUnicodeObject *substring;
5452 int start = 0;
5453 int end = INT_MAX;
5454
Guido van Rossumb8872e62000-05-09 14:14:27 +00005455 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5456 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 return NULL;
5458 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5459 (PyObject *)substring);
5460 if (substring == NULL)
5461 return NULL;
5462
5463 result = findstring(self, substring, start, end, -1);
5464
5465 Py_DECREF(substring);
5466 if (result < 0) {
5467 PyErr_SetString(PyExc_ValueError, "substring not found");
5468 return NULL;
5469 }
5470 return PyInt_FromLong(result);
5471}
5472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005473PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474"S.rjust(width) -> unicode\n\
5475\n\
5476Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005477done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
5479static PyObject *
5480unicode_rjust(PyUnicodeObject *self, PyObject *args)
5481{
5482 int width;
5483 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5484 return NULL;
5485
Tim Peters7a29bd52001-09-12 03:03:31 +00005486 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 Py_INCREF(self);
5488 return (PyObject*) self;
5489 }
5490
5491 return (PyObject*) pad(self, width - self->length, 0, ' ');
5492}
5493
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494static PyObject*
5495unicode_slice(PyUnicodeObject *self, int start, int end)
5496{
5497 /* standard clamping */
5498 if (start < 0)
5499 start = 0;
5500 if (end < 0)
5501 end = 0;
5502 if (end > self->length)
5503 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005504 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 /* full slice, return original string */
5506 Py_INCREF(self);
5507 return (PyObject*) self;
5508 }
5509 if (start > end)
5510 start = end;
5511 /* copy slice */
5512 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5513 end - start);
5514}
5515
5516PyObject *PyUnicode_Split(PyObject *s,
5517 PyObject *sep,
5518 int maxsplit)
5519{
5520 PyObject *result;
5521
5522 s = PyUnicode_FromObject(s);
5523 if (s == NULL)
5524 return NULL;
5525 if (sep != NULL) {
5526 sep = PyUnicode_FromObject(sep);
5527 if (sep == NULL) {
5528 Py_DECREF(s);
5529 return NULL;
5530 }
5531 }
5532
5533 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5534
5535 Py_DECREF(s);
5536 Py_XDECREF(sep);
5537 return result;
5538}
5539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005540PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541"S.split([sep [,maxsplit]]) -> list of strings\n\
5542\n\
5543Return a list of the words in S, using sep as the\n\
5544delimiter string. If maxsplit is given, at most maxsplit\n\
5545splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005546is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547
5548static PyObject*
5549unicode_split(PyUnicodeObject *self, PyObject *args)
5550{
5551 PyObject *substring = Py_None;
5552 int maxcount = -1;
5553
5554 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5555 return NULL;
5556
5557 if (substring == Py_None)
5558 return split(self, NULL, maxcount);
5559 else if (PyUnicode_Check(substring))
5560 return split(self, (PyUnicodeObject *)substring, maxcount);
5561 else
5562 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5563}
5564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005565PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005566"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567\n\
5568Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005569Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005570is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
5572static PyObject*
5573unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5574{
Guido van Rossum86662912000-04-11 15:38:46 +00005575 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Guido van Rossum86662912000-04-11 15:38:46 +00005577 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 return NULL;
5579
Guido van Rossum86662912000-04-11 15:38:46 +00005580 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581}
5582
5583static
5584PyObject *unicode_str(PyUnicodeObject *self)
5585{
Fred Drakee4315f52000-05-09 19:53:39 +00005586 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587}
5588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005589PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590"S.swapcase() -> unicode\n\
5591\n\
5592Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005593and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
5595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005596unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 return fixup(self, fixswapcase);
5599}
5600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005601PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602"S.translate(table) -> unicode\n\
5603\n\
5604Return a copy of the string S, where all characters have been mapped\n\
5605through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005606Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5607Unmapped characters are left untouched. Characters mapped to None\n\
5608are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609
5610static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005611unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 return PyUnicode_TranslateCharmap(self->str,
5614 self->length,
5615 table,
5616 "ignore");
5617}
5618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005619PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620"S.upper() -> unicode\n\
5621\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005622Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
5624static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005625unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return fixup(self, fixupper);
5628}
5629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005630PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631"S.zfill(width) -> unicode\n\
5632\n\
5633Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005634of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636static PyObject *
5637unicode_zfill(PyUnicodeObject *self, PyObject *args)
5638{
5639 int fill;
5640 PyUnicodeObject *u;
5641
5642 int width;
5643 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5644 return NULL;
5645
5646 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005647 if (PyUnicode_CheckExact(self)) {
5648 Py_INCREF(self);
5649 return (PyObject*) self;
5650 }
5651 else
5652 return PyUnicode_FromUnicode(
5653 PyUnicode_AS_UNICODE(self),
5654 PyUnicode_GET_SIZE(self)
5655 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 }
5657
5658 fill = width - self->length;
5659
5660 u = pad(self, fill, 0, '0');
5661
Walter Dörwald068325e2002-04-15 13:36:47 +00005662 if (u == NULL)
5663 return NULL;
5664
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 if (u->str[fill] == '+' || u->str[fill] == '-') {
5666 /* move sign to beginning of string */
5667 u->str[0] = u->str[fill];
5668 u->str[fill] = '0';
5669 }
5670
5671 return (PyObject*) u;
5672}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005683"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005685Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005687comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
5689static PyObject *
5690unicode_startswith(PyUnicodeObject *self,
5691 PyObject *args)
5692{
5693 PyUnicodeObject *substring;
5694 int start = 0;
5695 int end = INT_MAX;
5696 PyObject *result;
5697
Guido van Rossumb8872e62000-05-09 14:14:27 +00005698 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5699 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 return NULL;
5701 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5702 (PyObject *)substring);
5703 if (substring == NULL)
5704 return NULL;
5705
Guido van Rossum77f6a652002-04-03 22:41:51 +00005706 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
5708 Py_DECREF(substring);
5709 return result;
5710}
5711
5712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005713PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005714"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005716Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005718comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719
5720static PyObject *
5721unicode_endswith(PyUnicodeObject *self,
5722 PyObject *args)
5723{
5724 PyUnicodeObject *substring;
5725 int start = 0;
5726 int end = INT_MAX;
5727 PyObject *result;
5728
Guido van Rossumb8872e62000-05-09 14:14:27 +00005729 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5730 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 return NULL;
5732 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5733 (PyObject *)substring);
5734 if (substring == NULL)
5735 return NULL;
5736
Guido van Rossum77f6a652002-04-03 22:41:51 +00005737 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
5739 Py_DECREF(substring);
5740 return result;
5741}
5742
5743
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005744
5745static PyObject *
5746unicode_getnewargs(PyUnicodeObject *v)
5747{
5748 return Py_BuildValue("(u#)", v->str, v->length);
5749}
5750
5751
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752static PyMethodDef unicode_methods[] = {
5753
5754 /* Order is according to common usage: often used methods should
5755 appear first, since lookup is done sequentially. */
5756
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005757 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5758 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5759 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5760 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5761 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5762 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5763 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5764 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5765 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5766 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5767 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5768 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5769 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005770 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005771/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5772 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5773 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5774 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005775 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005776 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005777 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005778 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5779 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5780 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5781 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5782 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5783 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5784 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5785 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5786 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5787 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5788 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5789 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5790 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5791 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005792 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005793#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005794 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795#endif
5796
5797#if 0
5798 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005799 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800#endif
5801
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005802 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 {NULL, NULL}
5804};
5805
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005806static PyObject *
5807unicode_mod(PyObject *v, PyObject *w)
5808{
5809 if (!PyUnicode_Check(v)) {
5810 Py_INCREF(Py_NotImplemented);
5811 return Py_NotImplemented;
5812 }
5813 return PyUnicode_Format(v, w);
5814}
5815
5816static PyNumberMethods unicode_as_number = {
5817 0, /*nb_add*/
5818 0, /*nb_subtract*/
5819 0, /*nb_multiply*/
5820 0, /*nb_divide*/
5821 unicode_mod, /*nb_remainder*/
5822};
5823
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824static PySequenceMethods unicode_as_sequence = {
5825 (inquiry) unicode_length, /* sq_length */
5826 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5827 (intargfunc) unicode_repeat, /* sq_repeat */
5828 (intargfunc) unicode_getitem, /* sq_item */
5829 (intintargfunc) unicode_slice, /* sq_slice */
5830 0, /* sq_ass_item */
5831 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005832 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833};
5834
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005835static PyObject*
5836unicode_subscript(PyUnicodeObject* self, PyObject* item)
5837{
5838 if (PyInt_Check(item)) {
5839 long i = PyInt_AS_LONG(item);
5840 if (i < 0)
5841 i += PyString_GET_SIZE(self);
5842 return unicode_getitem(self, i);
5843 } else if (PyLong_Check(item)) {
5844 long i = PyLong_AsLong(item);
5845 if (i == -1 && PyErr_Occurred())
5846 return NULL;
5847 if (i < 0)
5848 i += PyString_GET_SIZE(self);
5849 return unicode_getitem(self, i);
5850 } else if (PySlice_Check(item)) {
5851 int start, stop, step, slicelength, cur, i;
5852 Py_UNICODE* source_buf;
5853 Py_UNICODE* result_buf;
5854 PyObject* result;
5855
5856 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5857 &start, &stop, &step, &slicelength) < 0) {
5858 return NULL;
5859 }
5860
5861 if (slicelength <= 0) {
5862 return PyUnicode_FromUnicode(NULL, 0);
5863 } else {
5864 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5865 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5866
5867 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5868 result_buf[i] = source_buf[cur];
5869 }
5870
5871 result = PyUnicode_FromUnicode(result_buf, slicelength);
5872 PyMem_FREE(result_buf);
5873 return result;
5874 }
5875 } else {
5876 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5877 return NULL;
5878 }
5879}
5880
5881static PyMappingMethods unicode_as_mapping = {
5882 (inquiry)unicode_length, /* mp_length */
5883 (binaryfunc)unicode_subscript, /* mp_subscript */
5884 (objobjargproc)0, /* mp_ass_subscript */
5885};
5886
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887static int
5888unicode_buffer_getreadbuf(PyUnicodeObject *self,
5889 int index,
5890 const void **ptr)
5891{
5892 if (index != 0) {
5893 PyErr_SetString(PyExc_SystemError,
5894 "accessing non-existent unicode segment");
5895 return -1;
5896 }
5897 *ptr = (void *) self->str;
5898 return PyUnicode_GET_DATA_SIZE(self);
5899}
5900
5901static int
5902unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5903 const void **ptr)
5904{
5905 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005906 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 return -1;
5908}
5909
5910static int
5911unicode_buffer_getsegcount(PyUnicodeObject *self,
5912 int *lenp)
5913{
5914 if (lenp)
5915 *lenp = PyUnicode_GET_DATA_SIZE(self);
5916 return 1;
5917}
5918
5919static int
5920unicode_buffer_getcharbuf(PyUnicodeObject *self,
5921 int index,
5922 const void **ptr)
5923{
5924 PyObject *str;
5925
5926 if (index != 0) {
5927 PyErr_SetString(PyExc_SystemError,
5928 "accessing non-existent unicode segment");
5929 return -1;
5930 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005931 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 if (str == NULL)
5933 return -1;
5934 *ptr = (void *) PyString_AS_STRING(str);
5935 return PyString_GET_SIZE(str);
5936}
5937
5938/* Helpers for PyUnicode_Format() */
5939
5940static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005941getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
5943 int argidx = *p_argidx;
5944 if (argidx < arglen) {
5945 (*p_argidx)++;
5946 if (arglen < 0)
5947 return args;
5948 else
5949 return PyTuple_GetItem(args, argidx);
5950 }
5951 PyErr_SetString(PyExc_TypeError,
5952 "not enough arguments for format string");
5953 return NULL;
5954}
5955
/* Bit flags carried in the `flags` argument of the format helpers.
   F_ALT is the one tested by formatfloat() and formatint() to select the
   '#' alternate form. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
5961
5962static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
5965 register int i;
5966 int len;
5967 va_list va;
5968 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
5971 /* First, format the string as char array, then expand to Py_UNICODE
5972 array. */
5973 charbuffer = (char *)buffer;
5974 len = vsprintf(charbuffer, format, va);
5975 for (i = len - 1; i >= 0; i--)
5976 buffer[i] = (Py_UNICODE) charbuffer[i];
5977
5978 va_end(va);
5979 return len;
5980}
5981
Guido van Rossum078151d2002-08-11 04:24:12 +00005982/* XXX To save some code duplication, formatfloat/long/int could have been
5983 shared with stringobject.c, converting from 8-bit to Unicode after the
5984 formatting is done. */
5985
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986static int
5987formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005988 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 int flags,
5990 int prec,
5991 int type,
5992 PyObject *v)
5993{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005994 /* fmt = '%#.' + `prec` + `type`
5995 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 char fmt[20];
5997 double x;
5998
5999 x = PyFloat_AsDouble(v);
6000 if (x == -1.0 && PyErr_Occurred())
6001 return -1;
6002 if (prec < 0)
6003 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6005 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006006 /* Worst case length calc to ensure no buffer overrun:
6007
6008 'g' formats:
6009 fmt = %#.<prec>g
6010 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6011 for any double rep.)
6012 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6013
6014 'f' formats:
6015 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6016 len = 1 + 50 + 1 + prec = 52 + prec
6017
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006018 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006019 always given), therefore increase the length by one.
6020
6021 */
6022 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6023 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006024 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006025 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006026 return -1;
6027 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006028 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6029 (flags&F_ALT) ? "#" : "",
6030 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return usprintf(buf, fmt, x);
6032}
6033
Tim Peters38fd5b62000-09-21 05:43:11 +00006034static PyObject*
6035formatlong(PyObject *val, int flags, int prec, int type)
6036{
6037 char *buf;
6038 int i, len;
6039 PyObject *str; /* temporary string object. */
6040 PyUnicodeObject *result;
6041
6042 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6043 if (!str)
6044 return NULL;
6045 result = _PyUnicode_New(len);
6046 for (i = 0; i < len; i++)
6047 result->str[i] = buf[i];
6048 result->str[len] = 0;
6049 Py_DECREF(str);
6050 return (PyObject*)result;
6051}
6052
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053static int
6054formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006055 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 int flags,
6057 int prec,
6058 int type,
6059 PyObject *v)
6060{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006061 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006062 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6063 * + 1 + 1
6064 * = 24
6065 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006066 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 long x;
6068
6069 x = PyInt_AsLong(v);
6070 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006071 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006072 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006073 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006074 "%u/%o/%x/%X of negative int will return "
6075 "a signed string in Python 2.4 and up") < 0)
6076 return -1;
6077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006079 prec = 1;
6080
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006081 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006082 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6083 */
6084 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006085 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006086 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006087 return -1;
6088 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006089
6090 if ((flags & F_ALT) &&
6091 (type == 'x' || type == 'X')) {
6092 /* When converting under %#x or %#X, there are a number
6093 * of issues that cause pain:
6094 * - when 0 is being converted, the C standard leaves off
6095 * the '0x' or '0X', which is inconsistent with other
6096 * %#x/%#X conversions and inconsistent with Python's
6097 * hex() function
6098 * - there are platforms that violate the standard and
6099 * convert 0 with the '0x' or '0X'
6100 * (Metrowerks, Compaq Tru64)
6101 * - there are platforms that give '0x' when converting
6102 * under %#X, but convert 0 in accordance with the
6103 * standard (OS/2 EMX)
6104 *
6105 * We can achieve the desired consistency by inserting our
6106 * own '0x' or '0X' prefix, and substituting %x/%X in place
6107 * of %#x/%#X.
6108 *
6109 * Note that this is the same approach as used in
6110 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006111 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006112 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6113 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006114 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006115 else {
6116 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6117 (flags&F_ALT) ? "#" : "",
6118 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 return usprintf(buf, fmt, x);
6121}
6122
6123static int
6124formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006125 size_t buflen,
6126 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006128 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006129 if (PyUnicode_Check(v)) {
6130 if (PyUnicode_GET_SIZE(v) != 1)
6131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006135 else if (PyString_Check(v)) {
6136 if (PyString_GET_SIZE(v) != 1)
6137 goto onError;
6138 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
6141 else {
6142 /* Integer input truncated to a character */
6143 long x;
6144 x = PyInt_AsLong(v);
6145 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006146 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006147#ifdef Py_UNICODE_WIDE
6148 if (x < 0 || x > 0x10ffff) {
6149 PyErr_SetString(PyExc_ValueError,
6150 "%c arg not in range(0x110000) "
6151 "(wide Python build)");
6152 return -1;
6153 }
6154#else
6155 if (x < 0 || x > 0xffff) {
6156 PyErr_SetString(PyExc_ValueError,
6157 "%c arg not in range(0x10000) "
6158 "(narrow Python build)");
6159 return -1;
6160 }
6161#endif
6162 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
6164 buf[1] = '\0';
6165 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006166
6167 onError:
6168 PyErr_SetString(PyExc_TypeError,
6169 "%c requires int or char");
6170 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171}
6172
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand in every context (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
6182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183PyObject *PyUnicode_Format(PyObject *format,
6184 PyObject *args)
6185{
6186 Py_UNICODE *fmt, *res;
6187 int fmtcnt, rescnt, reslen, arglen, argidx;
6188 int args_owned = 0;
6189 PyUnicodeObject *result = NULL;
6190 PyObject *dict = NULL;
6191 PyObject *uformat;
6192
6193 if (format == NULL || args == NULL) {
6194 PyErr_BadInternalCall();
6195 return NULL;
6196 }
6197 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006198 if (uformat == NULL)
6199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 fmt = PyUnicode_AS_UNICODE(uformat);
6201 fmtcnt = PyUnicode_GET_SIZE(uformat);
6202
6203 reslen = rescnt = fmtcnt + 100;
6204 result = _PyUnicode_New(reslen);
6205 if (result == NULL)
6206 goto onError;
6207 res = PyUnicode_AS_UNICODE(result);
6208
6209 if (PyTuple_Check(args)) {
6210 arglen = PyTuple_Size(args);
6211 argidx = 0;
6212 }
6213 else {
6214 arglen = -1;
6215 argidx = -2;
6216 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006217 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6218 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 dict = args;
6220
6221 while (--fmtcnt >= 0) {
6222 if (*fmt != '%') {
6223 if (--rescnt < 0) {
6224 rescnt = fmtcnt + 100;
6225 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006226 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return NULL;
6228 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6229 --rescnt;
6230 }
6231 *res++ = *fmt++;
6232 }
6233 else {
6234 /* Got a format specifier */
6235 int flags = 0;
6236 int width = -1;
6237 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 Py_UNICODE c = '\0';
6239 Py_UNICODE fill;
6240 PyObject *v = NULL;
6241 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006242 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 Py_UNICODE sign;
6244 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006245 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246
6247 fmt++;
6248 if (*fmt == '(') {
6249 Py_UNICODE *keystart;
6250 int keylen;
6251 PyObject *key;
6252 int pcount = 1;
6253
6254 if (dict == NULL) {
6255 PyErr_SetString(PyExc_TypeError,
6256 "format requires a mapping");
6257 goto onError;
6258 }
6259 ++fmt;
6260 --fmtcnt;
6261 keystart = fmt;
6262 /* Skip over balanced parentheses */
6263 while (pcount > 0 && --fmtcnt >= 0) {
6264 if (*fmt == ')')
6265 --pcount;
6266 else if (*fmt == '(')
6267 ++pcount;
6268 fmt++;
6269 }
6270 keylen = fmt - keystart - 1;
6271 if (fmtcnt < 0 || pcount > 0) {
6272 PyErr_SetString(PyExc_ValueError,
6273 "incomplete format key");
6274 goto onError;
6275 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006276#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006277 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 then looked up since Python uses strings to hold
6279 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006280 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 key = PyUnicode_EncodeUTF8(keystart,
6282 keylen,
6283 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006284#else
6285 key = PyUnicode_FromUnicode(keystart, keylen);
6286#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 if (key == NULL)
6288 goto onError;
6289 if (args_owned) {
6290 Py_DECREF(args);
6291 args_owned = 0;
6292 }
6293 args = PyObject_GetItem(dict, key);
6294 Py_DECREF(key);
6295 if (args == NULL) {
6296 goto onError;
6297 }
6298 args_owned = 1;
6299 arglen = -1;
6300 argidx = -2;
6301 }
6302 while (--fmtcnt >= 0) {
6303 switch (c = *fmt++) {
6304 case '-': flags |= F_LJUST; continue;
6305 case '+': flags |= F_SIGN; continue;
6306 case ' ': flags |= F_BLANK; continue;
6307 case '#': flags |= F_ALT; continue;
6308 case '0': flags |= F_ZERO; continue;
6309 }
6310 break;
6311 }
6312 if (c == '*') {
6313 v = getnextarg(args, arglen, &argidx);
6314 if (v == NULL)
6315 goto onError;
6316 if (!PyInt_Check(v)) {
6317 PyErr_SetString(PyExc_TypeError,
6318 "* wants int");
6319 goto onError;
6320 }
6321 width = PyInt_AsLong(v);
6322 if (width < 0) {
6323 flags |= F_LJUST;
6324 width = -width;
6325 }
6326 if (--fmtcnt >= 0)
6327 c = *fmt++;
6328 }
6329 else if (c >= '0' && c <= '9') {
6330 width = c - '0';
6331 while (--fmtcnt >= 0) {
6332 c = *fmt++;
6333 if (c < '0' || c > '9')
6334 break;
6335 if ((width*10) / 10 != width) {
6336 PyErr_SetString(PyExc_ValueError,
6337 "width too big");
6338 goto onError;
6339 }
6340 width = width*10 + (c - '0');
6341 }
6342 }
6343 if (c == '.') {
6344 prec = 0;
6345 if (--fmtcnt >= 0)
6346 c = *fmt++;
6347 if (c == '*') {
6348 v = getnextarg(args, arglen, &argidx);
6349 if (v == NULL)
6350 goto onError;
6351 if (!PyInt_Check(v)) {
6352 PyErr_SetString(PyExc_TypeError,
6353 "* wants int");
6354 goto onError;
6355 }
6356 prec = PyInt_AsLong(v);
6357 if (prec < 0)
6358 prec = 0;
6359 if (--fmtcnt >= 0)
6360 c = *fmt++;
6361 }
6362 else if (c >= '0' && c <= '9') {
6363 prec = c - '0';
6364 while (--fmtcnt >= 0) {
6365 c = Py_CHARMASK(*fmt++);
6366 if (c < '0' || c > '9')
6367 break;
6368 if ((prec*10) / 10 != prec) {
6369 PyErr_SetString(PyExc_ValueError,
6370 "prec too big");
6371 goto onError;
6372 }
6373 prec = prec*10 + (c - '0');
6374 }
6375 }
6376 } /* prec */
6377 if (fmtcnt >= 0) {
6378 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 if (--fmtcnt >= 0)
6380 c = *fmt++;
6381 }
6382 }
6383 if (fmtcnt < 0) {
6384 PyErr_SetString(PyExc_ValueError,
6385 "incomplete format");
6386 goto onError;
6387 }
6388 if (c != '%') {
6389 v = getnextarg(args, arglen, &argidx);
6390 if (v == NULL)
6391 goto onError;
6392 }
6393 sign = 0;
6394 fill = ' ';
6395 switch (c) {
6396
6397 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006398 pbuf = formatbuf;
6399 /* presume that buffer length is at least 1 */
6400 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 len = 1;
6402 break;
6403
6404 case 's':
6405 case 'r':
6406 if (PyUnicode_Check(v) && c == 's') {
6407 temp = v;
6408 Py_INCREF(temp);
6409 }
6410 else {
6411 PyObject *unicode;
6412 if (c == 's')
6413 temp = PyObject_Str(v);
6414 else
6415 temp = PyObject_Repr(v);
6416 if (temp == NULL)
6417 goto onError;
6418 if (!PyString_Check(temp)) {
6419 /* XXX Note: this should never happen, since
6420 PyObject_Repr() and PyObject_Str() assure
6421 this */
6422 Py_DECREF(temp);
6423 PyErr_SetString(PyExc_TypeError,
6424 "%s argument has non-string str()");
6425 goto onError;
6426 }
Fred Drakee4315f52000-05-09 19:53:39 +00006427 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006429 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 "strict");
6431 Py_DECREF(temp);
6432 temp = unicode;
6433 if (temp == NULL)
6434 goto onError;
6435 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006436 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 len = PyUnicode_GET_SIZE(temp);
6438 if (prec >= 0 && len > prec)
6439 len = prec;
6440 break;
6441
6442 case 'i':
6443 case 'd':
6444 case 'u':
6445 case 'o':
6446 case 'x':
6447 case 'X':
6448 if (c == 'i')
6449 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006450 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006451 temp = formatlong(v, flags, prec, c);
6452 if (!temp)
6453 goto onError;
6454 pbuf = PyUnicode_AS_UNICODE(temp);
6455 len = PyUnicode_GET_SIZE(temp);
6456 /* unbounded ints can always produce
6457 a sign character! */
6458 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006460 else {
6461 pbuf = formatbuf;
6462 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6463 flags, prec, c, v);
6464 if (len < 0)
6465 goto onError;
6466 /* only d conversion is signed */
6467 sign = c == 'd';
6468 }
6469 if (flags & F_ZERO)
6470 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 break;
6472
6473 case 'e':
6474 case 'E':
6475 case 'f':
6476 case 'g':
6477 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006478 pbuf = formatbuf;
6479 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6480 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 if (len < 0)
6482 goto onError;
6483 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006484 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 fill = '0';
6486 break;
6487
6488 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006489 pbuf = formatbuf;
6490 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 if (len < 0)
6492 goto onError;
6493 break;
6494
6495 default:
6496 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006497 "unsupported format character '%c' (0x%x) "
6498 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006499 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006500 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006501 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 goto onError;
6503 }
6504 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006505 if (*pbuf == '-' || *pbuf == '+') {
6506 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 len--;
6508 }
6509 else if (flags & F_SIGN)
6510 sign = '+';
6511 else if (flags & F_BLANK)
6512 sign = ' ';
6513 else
6514 sign = 0;
6515 }
6516 if (width < len)
6517 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006518 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 reslen -= rescnt;
6520 rescnt = width + fmtcnt + 100;
6521 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006522 if (reslen < 0) {
6523 Py_DECREF(result);
6524 return PyErr_NoMemory();
6525 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006526 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 return NULL;
6528 res = PyUnicode_AS_UNICODE(result)
6529 + reslen - rescnt;
6530 }
6531 if (sign) {
6532 if (fill != ' ')
6533 *res++ = sign;
6534 rescnt--;
6535 if (width > len)
6536 width--;
6537 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006538 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6539 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006540 assert(pbuf[1] == c);
6541 if (fill != ' ') {
6542 *res++ = *pbuf++;
6543 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006544 }
Tim Petersfff53252001-04-12 18:38:48 +00006545 rescnt -= 2;
6546 width -= 2;
6547 if (width < 0)
6548 width = 0;
6549 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 if (width > len && !(flags & F_LJUST)) {
6552 do {
6553 --rescnt;
6554 *res++ = fill;
6555 } while (--width > len);
6556 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006557 if (fill == ' ') {
6558 if (sign)
6559 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006560 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006561 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006562 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006563 *res++ = *pbuf++;
6564 *res++ = *pbuf++;
6565 }
6566 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006567 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 res += len;
6569 rescnt -= len;
6570 while (--width >= len) {
6571 --rescnt;
6572 *res++ = ' ';
6573 }
6574 if (dict && (argidx < arglen) && c != '%') {
6575 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006576 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 goto onError;
6578 }
6579 Py_XDECREF(temp);
6580 } /* '%' */
6581 } /* until end */
6582 if (argidx < arglen && !dict) {
6583 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006584 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 goto onError;
6586 }
6587
6588 if (args_owned) {
6589 Py_DECREF(args);
6590 }
6591 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006592 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006593 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 return (PyObject *)result;
6595
6596 onError:
6597 Py_XDECREF(result);
6598 Py_DECREF(uformat);
6599 if (args_owned) {
6600 Py_DECREF(args);
6601 }
6602 return NULL;
6603}
6604
6605static PyBufferProcs unicode_as_buffer = {
6606 (getreadbufferproc) unicode_buffer_getreadbuf,
6607 (getwritebufferproc) unicode_buffer_getwritebuf,
6608 (getsegcountproc) unicode_buffer_getsegcount,
6609 (getcharbufferproc) unicode_buffer_getcharbuf,
6610};
6611
Jeremy Hylton938ace62002-07-17 16:30:39 +00006612static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006613unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6614
Tim Peters6d6c1a32001-08-02 04:15:00 +00006615static PyObject *
6616unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6617{
6618 PyObject *x = NULL;
6619 static char *kwlist[] = {"string", "encoding", "errors", 0};
6620 char *encoding = NULL;
6621 char *errors = NULL;
6622
Guido van Rossume023fe02001-08-30 03:12:59 +00006623 if (type != &PyUnicode_Type)
6624 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006625 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6626 kwlist, &x, &encoding, &errors))
6627 return NULL;
6628 if (x == NULL)
6629 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006630 if (encoding == NULL && errors == NULL)
6631 return PyObject_Unicode(x);
6632 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006633 return PyUnicode_FromEncodedObject(x, encoding, errors);
6634}
6635
Guido van Rossume023fe02001-08-30 03:12:59 +00006636static PyObject *
6637unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6638{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006639 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006640 int n;
6641
6642 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6643 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6644 if (tmp == NULL)
6645 return NULL;
6646 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006647 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6648 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00006649 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00006650 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6651 if (pnew->str == NULL) {
6652 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006653 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00006654 return NULL;
6655 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006656 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6657 pnew->length = n;
6658 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006659 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006660 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006664"unicode(string [, encoding[, errors]]) -> object\n\
6665\n\
6666Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006667encoding defaults to the current default string encoding.\n\
6668errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670PyTypeObject PyUnicode_Type = {
6671 PyObject_HEAD_INIT(&PyType_Type)
6672 0, /* ob_size */
6673 "unicode", /* tp_name */
6674 sizeof(PyUnicodeObject), /* tp_size */
6675 0, /* tp_itemsize */
6676 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006677 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006679 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 0, /* tp_setattr */
6681 (cmpfunc) unicode_compare, /* tp_compare */
6682 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006683 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006685 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 (hashfunc) unicode_hash, /* tp_hash*/
6687 0, /* tp_call*/
6688 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006689 PyObject_GenericGetAttr, /* tp_getattro */
6690 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006692 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6693 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006694 unicode_doc, /* tp_doc */
6695 0, /* tp_traverse */
6696 0, /* tp_clear */
6697 0, /* tp_richcompare */
6698 0, /* tp_weaklistoffset */
6699 0, /* tp_iter */
6700 0, /* tp_iternext */
6701 unicode_methods, /* tp_methods */
6702 0, /* tp_members */
6703 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006704 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006705 0, /* tp_dict */
6706 0, /* tp_descr_get */
6707 0, /* tp_descr_set */
6708 0, /* tp_dictoffset */
6709 0, /* tp_init */
6710 0, /* tp_alloc */
6711 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006712 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713};
6714
6715/* Initialize the Unicode implementation */
6716
Thomas Wouters78890102000-07-22 19:25:51 +00006717void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006719 int i;
6720
Fred Drakee4315f52000-05-09 19:53:39 +00006721 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006722 unicode_freelist = NULL;
6723 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006725 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006726 for (i = 0; i < 256; i++)
6727 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006728 if (PyType_Ready(&PyUnicode_Type) < 0)
6729 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
6732/* Finalize the Unicode implementation */
6733
6734void
Thomas Wouters78890102000-07-22 19:25:51 +00006735_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006737 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006738 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006740 Py_XDECREF(unicode_empty);
6741 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006742
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006743 for (i = 0; i < 256; i++) {
6744 if (unicode_latin1[i]) {
6745 Py_DECREF(unicode_latin1[i]);
6746 unicode_latin1[i] = NULL;
6747 }
6748 }
6749
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006750 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 PyUnicodeObject *v = u;
6752 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006753 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006754 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006755 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006756 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006758 unicode_freelist = NULL;
6759 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760}