blob: 8565fb15454918d16fea3e07d23f31b5122f66f9 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Limit for the Unicode object free list: unicode_dealloc() stops
   adding objects to the list once it holds this many entries. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation keeps the allocated Unicode buffer intact for
   all objects on the free list whose length is less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects: a singly linked list threaded through
   the first pointer-sized word of each freed object, plus its current
   length. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well; entries are filled in on first use. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* File-internal convenience wrapper around PyUnicode_Resize(): casts
   the address of a Unicode object variable to PyObject **. For use in
   unicodeobject.c only! */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 int owned = 0;
456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Greg Steinaf36a3a2000-07-17 09:04:43 +0000513 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000515 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000519 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523}
524
525PyObject *PyUnicode_Decode(const char *s,
526 int size,
527 const char *encoding,
528 const char *errors)
529{
530 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000531
532 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000533 encoding = PyUnicode_GetDefaultEncoding();
534
535 /* Shortcuts for common default encodings */
536 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "latin-1") == 0)
539 return PyUnicode_DecodeLatin1(s, size, errors);
540 else if (strcmp(encoding, "ascii") == 0)
541 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000542
543 /* Decode via the codec registry */
544 buffer = PyBuffer_FromMemory((void *)s, size);
545 if (buffer == NULL)
546 goto onError;
547 unicode = PyCodec_Decode(buffer, encoding, errors);
548 if (unicode == NULL)
549 goto onError;
550 if (!PyUnicode_Check(unicode)) {
551 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000552 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 unicode->ob_type->tp_name);
554 Py_DECREF(unicode);
555 goto onError;
556 }
557 Py_DECREF(buffer);
558 return unicode;
559
560 onError:
561 Py_XDECREF(buffer);
562 return NULL;
563}
564
565PyObject *PyUnicode_Encode(const Py_UNICODE *s,
566 int size,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v, *unicode;
571
572 unicode = PyUnicode_FromUnicode(s, size);
573 if (unicode == NULL)
574 return NULL;
575 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
576 Py_DECREF(unicode);
577 return v;
578}
579
580PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
581 const char *encoding,
582 const char *errors)
583{
584 PyObject *v;
585
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
Fred Drakee4315f52000-05-09 19:53:39 +0000590
591 if (encoding == NULL)
592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (errors == NULL) {
596 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000597 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "latin-1") == 0)
599 return PyUnicode_AsLatin1String(unicode);
600 else if (strcmp(encoding, "ascii") == 0)
601 return PyUnicode_AsASCIIString(unicode);
602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603
604 /* Encode via the codec registry */
605 v = PyCodec_Encode(unicode, encoding, errors);
606 if (v == NULL)
607 goto onError;
608 /* XXX Should we really enforce this ? */
609 if (!PyString_Check(v)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 v->ob_type->tp_name);
613 Py_DECREF(v);
614 goto onError;
615 }
616 return v;
617
618 onError:
619 return NULL;
620}
621
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
623 const char *errors)
624{
625 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
626
627 if (v)
628 return v;
629 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
630 if (v && errors == NULL)
631 ((PyUnicodeObject *)unicode)->defenc = v;
632 return v;
633}
634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
636{
637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
641 return PyUnicode_AS_UNICODE(unicode);
642
643 onError:
644 return NULL;
645}
646
647int PyUnicode_GetSize(PyObject *unicode)
648{
649 if (!PyUnicode_Check(unicode)) {
650 PyErr_BadArgument();
651 goto onError;
652 }
653 return PyUnicode_GET_SIZE(unicode);
654
655 onError:
656 return -1;
657}
658
Thomas Wouters78890102000-07-22 19:25:51 +0000659const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000660{
661 return unicode_default_encoding;
662}
663
664int PyUnicode_SetDefaultEncoding(const char *encoding)
665{
666 PyObject *v;
667
668 /* Make sure the encoding is valid. As side effect, this also
669 loads the encoding into the codec registry cache. */
670 v = _PyCodec_Lookup(encoding);
671 if (v == NULL)
672 goto onError;
673 Py_DECREF(v);
674 strncpy(unicode_default_encoding,
675 encoding,
676 sizeof(unicode_default_encoding));
677 return 0;
678
679 onError:
680 return -1;
681}
682
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000683/* error handling callback helper:
684 build arguments, call the callback and check the arguments,
685 if no exception occured, copy the replacement to the output
686 and adjust various state variables.
687 return 0 on success, -1 on error
688*/
689
690static
691int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
692 const char *encoding, const char *reason,
693 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
694 PyObject **output, int *outpos, Py_UNICODE **outptr)
695{
696 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
697
698 PyObject *restuple = NULL;
699 PyObject *repunicode = NULL;
700 int outsize = PyUnicode_GET_SIZE(*output);
701 int requiredsize;
702 int newpos;
703 Py_UNICODE *repptr;
704 int repsize;
705 int res = -1;
706
707 if (*errorHandler == NULL) {
708 *errorHandler = PyCodec_LookupError(errors);
709 if (*errorHandler == NULL)
710 goto onError;
711 }
712
713 if (*exceptionObject == NULL) {
714 *exceptionObject = PyUnicodeDecodeError_Create(
715 encoding, input, insize, *startinpos, *endinpos, reason);
716 if (*exceptionObject == NULL)
717 goto onError;
718 }
719 else {
720 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
721 goto onError;
722 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
723 goto onError;
724 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
725 goto onError;
726 }
727
728 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
729 if (restuple == NULL)
730 goto onError;
731 if (!PyTuple_Check(restuple)) {
732 PyErr_Format(PyExc_TypeError, &argparse[4]);
733 goto onError;
734 }
735 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
736 goto onError;
737 if (newpos<0)
738 newpos = 0;
739 else if (newpos>insize)
740 newpos = insize;
741
742 /* need more space? (at least enough for what we
743 have+the replacement+the rest of the string (starting
744 at the new input position), so we won't have to check space
745 when there are no errors in the rest of the string) */
746 repptr = PyUnicode_AS_UNICODE(repunicode);
747 repsize = PyUnicode_GET_SIZE(repunicode);
748 requiredsize = *outpos + repsize + insize-newpos;
749 if (requiredsize > outsize) {
750 if (requiredsize<2*outsize)
751 requiredsize = 2*outsize;
752 if (PyUnicode_Resize(output, requiredsize))
753 goto onError;
754 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
755 }
756 *endinpos = newpos;
757 *inptr = input + newpos;
758 Py_UNICODE_COPY(*outptr, repptr, repsize);
759 *outptr += repsize;
760 *outpos += repsize;
761 /* we made it! */
762 res = 0;
763
764 onError:
765 Py_XDECREF(restuple);
766 return res;
767}
768
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000769/* --- UTF-7 Codec -------------------------------------------------------- */
770
771/* see RFC2152 for details */
772
/* Classification of the 128 ASCII characters for UTF-7, indexed by
   character code. Indicates whether a character is special, i.e.
   cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
791
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be
   written directly. That is the case for c > 127 and for class 1
   characters always, for class 2 (whitespace) when encodeWS is
   non-zero, and for class 3 (Set O) when encodeO is non-zero. Classes
   come from utf7_special. Note that c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n.
   B64CHAR(c): true if c is a base64 digit (alphanumeric, '+' or '/').
   NOTE(review): isalnum() is applied to c directly; confirm callers
   only pass values that are valid <ctype.h> arguments.
   UB64(c): 6-bit value of base64 digit c ('A'-'Z' -> 0..25,
   'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63); c is
   not validated here. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch,
   write the base64 digit for the topmost pending 6 bits through out
   (advancing it) and reduce bits by 6. Expands to a bare while
   statement that modifies out and bits in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, take the topmost pending 16 bits as one code unit and reduce
   bits by 16. A unit in 0xDC00..0xDFFF sets surrogate, stores a
   message in errmsg and jumps to utf7Error; the unit that follows
   such an error is dropped and surrogate is cleared; any other unit
   is written through out. The expansion site must provide a variable
   `errmsg` and a label `utf7Error`. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \
825 } \
826
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000827PyObject *PyUnicode_DecodeUTF7(const char *s,
828 int size,
829 const char *errors)
830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000831 const char *starts = s;
832 int startinpos;
833 int endinpos;
834 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000835 const char *e;
836 PyUnicodeObject *unicode;
837 Py_UNICODE *p;
838 const char *errmsg = "";
839 int inShift = 0;
840 unsigned int bitsleft = 0;
841 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000842 int surrogate = 0;
843 PyObject *errorHandler = NULL;
844 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000845
846 unicode = _PyUnicode_New(size);
847 if (!unicode)
848 return NULL;
849 if (size == 0)
850 return (PyObject *)unicode;
851
852 p = unicode->str;
853 e = s + size;
854
855 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000856 Py_UNICODE ch;
857 restart:
858 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000859
860 if (inShift) {
861 if ((ch == '-') || !B64CHAR(ch)) {
862 inShift = 0;
863 s++;
864
865 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
866 if (bitsleft >= 6) {
867 /* The shift sequence has a partial character in it. If
868 bitsleft < 6 then we could just classify it as padding
869 but that is not the case here */
870
871 errmsg = "partial character in shift sequence";
872 goto utf7Error;
873 }
874 /* According to RFC2152 the remaining bits should be zero. We
875 choose to signal an error/insert a replacement character
876 here so indicate the potential of a misencoded character. */
877
878 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
879 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
880 errmsg = "non-zero padding bits in shift sequence";
881 goto utf7Error;
882 }
883
884 if (ch == '-') {
885 if ((s < e) && (*(s) == '-')) {
886 *p++ = '-';
887 inShift = 1;
888 }
889 } else if (SPECIAL(ch,0,0)) {
890 errmsg = "unexpected special character";
891 goto utf7Error;
892 } else {
893 *p++ = ch;
894 }
895 } else {
896 charsleft = (charsleft << 6) | UB64(ch);
897 bitsleft += 6;
898 s++;
899 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
900 }
901 }
902 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000903 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000904 s++;
905 if (s < e && *s == '-') {
906 s++;
907 *p++ = '+';
908 } else
909 {
910 inShift = 1;
911 bitsleft = 0;
912 }
913 }
914 else if (SPECIAL(ch,0,0)) {
915 errmsg = "unexpected special character";
916 s++;
917 goto utf7Error;
918 }
919 else {
920 *p++ = ch;
921 s++;
922 }
923 continue;
924 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000925 outpos = p-PyUnicode_AS_UNICODE(unicode);
926 endinpos = s-starts;
927 if (unicode_decode_call_errorhandler(
928 errors, &errorHandler,
929 "utf7", errmsg,
930 starts, size, &startinpos, &endinpos, &exc, &s,
931 (PyObject **)&unicode, &outpos, &p))
932 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000933 }
934
935 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000936 outpos = p-PyUnicode_AS_UNICODE(unicode);
937 endinpos = size;
938 if (unicode_decode_call_errorhandler(
939 errors, &errorHandler,
940 "utf7", "unterminated shift sequence",
941 starts, size, &startinpos, &endinpos, &exc, &s,
942 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000943 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000944 if (s < e)
945 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 }
947
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000948 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 goto onError;
950
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 Py_XDECREF(errorHandler);
952 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 return (PyObject *)unicode;
954
955onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000956 Py_XDECREF(errorHandler);
957 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000958 Py_DECREF(unicode);
959 return NULL;
960}
961
962
963PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
964 int size,
965 int encodeSetO,
966 int encodeWhiteSpace,
967 const char *errors)
968{
969 PyObject *v;
970 /* It might be possible to tighten this worst case */
971 unsigned int cbAllocated = 5 * size;
972 int inShift = 0;
973 int i = 0;
974 unsigned int bitsleft = 0;
975 unsigned long charsleft = 0;
976 char * out;
977 char * start;
978
979 if (size == 0)
980 return PyString_FromStringAndSize(NULL, 0);
981
982 v = PyString_FromStringAndSize(NULL, cbAllocated);
983 if (v == NULL)
984 return NULL;
985
986 start = out = PyString_AS_STRING(v);
987 for (;i < size; ++i) {
988 Py_UNICODE ch = s[i];
989
990 if (!inShift) {
991 if (ch == '+') {
992 *out++ = '+';
993 *out++ = '-';
994 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
995 charsleft = ch;
996 bitsleft = 16;
997 *out++ = '+';
998 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
999 inShift = bitsleft > 0;
1000 } else {
1001 *out++ = (char) ch;
1002 }
1003 } else {
1004 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1005 *out++ = B64(charsleft << (6-bitsleft));
1006 charsleft = 0;
1007 bitsleft = 0;
1008 /* Characters not in the BASE64 set implicitly unshift the sequence
1009 so no '-' is required, except if the character is itself a '-' */
1010 if (B64CHAR(ch) || ch == '-') {
1011 *out++ = '-';
1012 }
1013 inShift = 0;
1014 *out++ = (char) ch;
1015 } else {
1016 bitsleft += 16;
1017 charsleft = (charsleft << 16) | ch;
1018 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1019
1020 /* If the next character is special then we dont' need to terminate
1021 the shift sequence. If the next character is not a BASE64 character
1022 or '-' then the shift sequence will be terminated implicitly and we
1023 don't have to insert a '-'. */
1024
1025 if (bitsleft == 0) {
1026 if (i + 1 < size) {
1027 Py_UNICODE ch2 = s[i+1];
1028
1029 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1030
1031 } else if (B64CHAR(ch2) || ch2 == '-') {
1032 *out++ = '-';
1033 inShift = 0;
1034 } else {
1035 inShift = 0;
1036 }
1037
1038 }
1039 else {
1040 *out++ = '-';
1041 inShift = 0;
1042 }
1043 }
1044 }
1045 }
1046 }
1047 if (bitsleft) {
1048 *out++= B64(charsleft << (6-bitsleft) );
1049 *out++ = '-';
1050 }
1051
Tim Peters5de98422002-04-27 18:44:32 +00001052 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 return v;
1054}
1055
1056#undef SPECIAL
1057#undef B64
1058#undef B64CHAR
1059#undef UB64
1060#undef ENCODE
1061#undef DECODE
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063/* --- UTF-8 Codec -------------------------------------------------------- */
1064
/* Map a UTF-8 lead byte to the length in bytes of the sequence it
   introduces.  Zero marks a byte that cannot start a sequence.  See
   RFC 2279 for details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, not valid as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1086
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087PyObject *PyUnicode_DecodeUTF8(const char *s,
1088 int size,
1089 const char *errors)
1090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001091 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 int startinpos;
1094 int endinpos;
1095 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 const char *e;
1097 PyUnicodeObject *unicode;
1098 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001099 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001100 PyObject *errorHandler = NULL;
1101 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102
1103 /* Note: size will always be longer than the resulting Unicode
1104 character count */
1105 unicode = _PyUnicode_New(size);
1106 if (!unicode)
1107 return NULL;
1108 if (size == 0)
1109 return (PyObject *)unicode;
1110
1111 /* Unpack UTF-8 encoded data */
1112 p = unicode->str;
1113 e = s + size;
1114
1115 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001116 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117
1118 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001119 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 s++;
1121 continue;
1122 }
1123
1124 n = utf8_code_length[ch];
1125
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001126 if (s + n > e) {
1127 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001128 startinpos = s-starts;
1129 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001130 goto utf8Error;
1131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132
1133 switch (n) {
1134
1135 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001136 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001137 startinpos = s-starts;
1138 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001139 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
1141 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001143 startinpos = s-starts;
1144 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001145 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
1147 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001148 if ((s[1] & 0xc0) != 0x80) {
1149 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001150 startinpos = s-starts;
1151 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 goto utf8Error;
1153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001156 startinpos = s-starts;
1157 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "illegal encoding";
1159 goto utf8Error;
1160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 break;
1164
1165 case 3:
1166 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001167 (s[2] & 0xc0) != 0x80) {
1168 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001169 startinpos = s-starts;
1170 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 goto utf8Error;
1172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001174 if (ch < 0x0800) {
1175 /* Note: UTF-8 encodings of surrogates are considered
1176 legal UTF-8 sequences;
1177
1178 XXX For wide builds (UCS-4) we should probably try
1179 to recombine the surrogates into a single code
1180 unit.
1181 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001182 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001183 startinpos = s-starts;
1184 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001185 goto utf8Error;
1186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001188 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 break;
1190
1191 case 4:
1192 if ((s[1] & 0xc0) != 0x80 ||
1193 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001194 (s[3] & 0xc0) != 0x80) {
1195 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001196 startinpos = s-starts;
1197 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 goto utf8Error;
1199 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001200 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1201 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1202 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001203 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001205 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001206 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001207 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001209 startinpos = s-starts;
1210 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 goto utf8Error;
1212 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001213#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001214 *p++ = (Py_UNICODE)ch;
1215#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 /* compute and append the two surrogates: */
1217
1218 /* translate from 10000..10FFFF to 0..FFFF */
1219 ch -= 0x10000;
1220
1221 /* high surrogate = top 10 bits added to D800 */
1222 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1223
1224 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001225 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001226#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 break;
1228
1229 default:
1230 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001232 startinpos = s-starts;
1233 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 }
1236 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001237 continue;
1238
1239 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001240 outpos = p-PyUnicode_AS_UNICODE(unicode);
1241 if (unicode_decode_call_errorhandler(
1242 errors, &errorHandler,
1243 "utf8", errmsg,
1244 starts, size, &startinpos, &endinpos, &exc, &s,
1245 (PyObject **)&unicode, &outpos, &p))
1246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 }
1248
1249 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001250 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 goto onError;
1252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253 Py_XDECREF(errorHandler);
1254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return (PyObject *)unicode;
1256
1257onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001258 Py_XDECREF(errorHandler);
1259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 Py_DECREF(unicode);
1261 return NULL;
1262}
1263
Tim Peters602f7402002-04-27 18:03:26 +00001264/* Allocation strategy: if the string is short, convert into a stack buffer
1265 and allocate exactly as much space needed at the end. Else allocate the
1266 maximum possible needed (4 result bytes per Unicode character), and return
1267 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001268*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001269PyObject *
1270PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1271 int size,
1272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273{
Tim Peters602f7402002-04-27 18:03:26 +00001274#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001275
Tim Peters602f7402002-04-27 18:03:26 +00001276 int i; /* index into s of next input byte */
1277 PyObject *v; /* result string object */
1278 char *p; /* next free byte in output buffer */
1279 int nallocated; /* number of result bytes allocated */
1280 int nneeded; /* number of result bytes needed */
1281 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001282
Tim Peters602f7402002-04-27 18:03:26 +00001283 assert(s != NULL);
1284 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285
Tim Peters602f7402002-04-27 18:03:26 +00001286 if (size <= MAX_SHORT_UNICHARS) {
1287 /* Write into the stack buffer; nallocated can't overflow.
1288 * At the end, we'll allocate exactly as much heap space as it
1289 * turns out we need.
1290 */
1291 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1292 v = NULL; /* will allocate after we're done */
1293 p = stackbuf;
1294 }
1295 else {
1296 /* Overallocate on the heap, and give the excess back at the end. */
1297 nallocated = size * 4;
1298 if (nallocated / 4 != size) /* overflow! */
1299 return PyErr_NoMemory();
1300 v = PyString_FromStringAndSize(NULL, nallocated);
1301 if (v == NULL)
1302 return NULL;
1303 p = PyString_AS_STRING(v);
1304 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001305
Tim Peters602f7402002-04-27 18:03:26 +00001306 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001307 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001308
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001309 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001310 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001312
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001314 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001315 *p++ = (char)(0xc0 | (ch >> 6));
1316 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001317 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001318 else {
Tim Peters602f7402002-04-27 18:03:26 +00001319 /* Encode UCS2 Unicode ordinals */
1320 if (ch < 0x10000) {
1321 /* Special case: check for high surrogate */
1322 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1323 Py_UCS4 ch2 = s[i];
1324 /* Check for low surrogate and combine the two to
1325 form a UCS4 value */
1326 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001327 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001328 i++;
1329 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 }
Tim Peters602f7402002-04-27 18:03:26 +00001331 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001334 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1335 *p++ = (char)(0x80 | (ch & 0x3f));
1336 continue;
1337 }
1338encodeUCS4:
1339 /* Encode UCS4 Unicode ordinals */
1340 *p++ = (char)(0xf0 | (ch >> 18));
1341 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1343 *p++ = (char)(0x80 | (ch & 0x3f));
1344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001346
Tim Peters602f7402002-04-27 18:03:26 +00001347 if (v == NULL) {
1348 /* This was stack allocated. */
1349 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1350 assert(nneeded <= nallocated);
1351 v = PyString_FromStringAndSize(stackbuf, nneeded);
1352 }
1353 else {
1354 /* Cut back to size actually needed. */
1355 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1356 assert(nneeded <= nallocated);
1357 _PyString_Resize(&v, nneeded);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001360
Tim Peters602f7402002-04-27 18:03:26 +00001361#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362}
1363
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1365{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 if (!PyUnicode_Check(unicode)) {
1367 PyErr_BadArgument();
1368 return NULL;
1369 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001370 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1371 PyUnicode_GET_SIZE(unicode),
1372 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373}
1374
1375/* --- UTF-16 Codec ------------------------------------------------------- */
1376
Tim Peters772747b2001-08-09 22:21:55 +00001377PyObject *
1378PyUnicode_DecodeUTF16(const char *s,
1379 int size,
1380 const char *errors,
1381 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 const char *starts = s;
1384 int startinpos;
1385 int endinpos;
1386 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 PyUnicodeObject *unicode;
1388 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001389 const unsigned char *q, *e;
1390 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001391 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001392 /* Offsets from q for retrieving byte pairs in the right order. */
1393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1394 int ihi = 1, ilo = 0;
1395#else
1396 int ihi = 0, ilo = 1;
1397#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001398 PyObject *errorHandler = NULL;
1399 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400
1401 /* Note: size will always be longer than the resulting Unicode
1402 character count */
1403 unicode = _PyUnicode_New(size);
1404 if (!unicode)
1405 return NULL;
1406 if (size == 0)
1407 return (PyObject *)unicode;
1408
1409 /* Unpack UTF-16 encoded data */
1410 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001411 q = (unsigned char *)s;
1412 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413
1414 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001415 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001417 /* Check for BOM marks (U+FEFF) in the input and adjust current
1418 byte order setting accordingly. In native mode, the leading BOM
1419 mark is skipped, in all other modes, it is copied to the output
1420 stream as-is (giving a ZWNBSP character). */
1421 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001422 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001424 if (bom == 0xFEFF) {
1425 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001426 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001427 }
1428 else if (bom == 0xFFFE) {
1429 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001430 bo = 1;
1431 }
1432#else
Tim Peters772747b2001-08-09 22:21:55 +00001433 if (bom == 0xFEFF) {
1434 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001435 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001436 }
1437 else if (bom == 0xFFFE) {
1438 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439 bo = -1;
1440 }
1441#endif
1442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443
Tim Peters772747b2001-08-09 22:21:55 +00001444 if (bo == -1) {
1445 /* force LE */
1446 ihi = 1;
1447 ilo = 0;
1448 }
1449 else if (bo == 1) {
1450 /* force BE */
1451 ihi = 0;
1452 ilo = 1;
1453 }
1454
1455 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 Py_UNICODE ch;
1457 /* remaing bytes at the end? (size should be even) */
1458 if (e-q<2) {
1459 errmsg = "truncated data";
1460 startinpos = ((const char *)q)-starts;
1461 endinpos = ((const char *)e)-starts;
1462 goto utf16Error;
1463 /* The remaining input chars are ignored if the callback
1464 chooses to skip the input */
1465 }
1466 ch = (q[ihi] << 8) | q[ilo];
1467
Tim Peters772747b2001-08-09 22:21:55 +00001468 q += 2;
1469
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (ch < 0xD800 || ch > 0xDFFF) {
1471 *p++ = ch;
1472 continue;
1473 }
1474
1475 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001476 if (q >= e) {
1477 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 startinpos = (((const char *)q)-2)-starts;
1479 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 goto utf16Error;
1481 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001482 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001483 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1484 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001485 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001486#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001487 *p++ = ch;
1488 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489#else
1490 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001491#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001492 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493 }
1494 else {
1495 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 startinpos = (((const char *)q)-4)-starts;
1497 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001498 goto utf16Error;
1499 }
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001502 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 startinpos = (((const char *)q)-2)-starts;
1504 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001505 /* Fall through to report the error */
1506
1507 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001508 outpos = p-PyUnicode_AS_UNICODE(unicode);
1509 if (unicode_decode_call_errorhandler(
1510 errors, &errorHandler,
1511 "utf16", errmsg,
1512 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1513 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001514 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 }
1516
1517 if (byteorder)
1518 *byteorder = bo;
1519
1520 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001521 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 goto onError;
1523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 Py_XDECREF(errorHandler);
1525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return (PyObject *)unicode;
1527
1528onError:
1529 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 Py_XDECREF(errorHandler);
1531 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 return NULL;
1533}
1534
Tim Peters772747b2001-08-09 22:21:55 +00001535PyObject *
1536PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1537 int size,
1538 const char *errors,
1539 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540{
1541 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001542 unsigned char *p;
1543 int i, pairs;
1544 /* Offsets from p for storing byte pairs in the right order. */
1545#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1546 int ihi = 1, ilo = 0;
1547#else
1548 int ihi = 0, ilo = 1;
1549#endif
1550
1551#define STORECHAR(CH) \
1552 do { \
1553 p[ihi] = ((CH) >> 8) & 0xff; \
1554 p[ilo] = (CH) & 0xff; \
1555 p += 2; \
1556 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001558 for (i = pairs = 0; i < size; i++)
1559 if (s[i] >= 0x10000)
1560 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001562 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (v == NULL)
1564 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
Tim Peters772747b2001-08-09 22:21:55 +00001566 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001568 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001569 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001570 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001571
1572 if (byteorder == -1) {
1573 /* force LE */
1574 ihi = 1;
1575 ilo = 0;
1576 }
1577 else if (byteorder == 1) {
1578 /* force BE */
1579 ihi = 0;
1580 ilo = 1;
1581 }
1582
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583 while (size-- > 0) {
1584 Py_UNICODE ch = *s++;
1585 Py_UNICODE ch2 = 0;
1586 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001587 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1588 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 }
Tim Peters772747b2001-08-09 22:21:55 +00001590 STORECHAR(ch);
1591 if (ch2)
1592 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001595#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596}
1597
1598PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1599{
1600 if (!PyUnicode_Check(unicode)) {
1601 PyErr_BadArgument();
1602 return NULL;
1603 }
1604 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1605 PyUnicode_GET_SIZE(unicode),
1606 NULL,
1607 0);
1608}
1609
1610/* --- Unicode Escape Codec ----------------------------------------------- */
1611
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the named-character
   escape handling in the decoder that follows -- confirm where it is set. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001613
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1615 int size,
1616 const char *errors)
1617{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 const char *starts = s;
1619 int startinpos;
1620 int endinpos;
1621 int outpos;
1622 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001625 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626 char* message;
1627 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 PyObject *errorHandler = NULL;
1629 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001630
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631 /* Escaped strings will always be longer than the resulting
1632 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 length after conversion to the true value.
1634 (but if the error callback returns a long replacement string
1635 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636 v = _PyUnicode_New(size);
1637 if (v == NULL)
1638 goto onError;
1639 if (size == 0)
1640 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001644
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645 while (s < end) {
1646 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001647 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649
1650 /* Non-escape characters are interpreted as Unicode ordinals */
1651 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001652 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 continue;
1654 }
1655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 /* \ - Escapes */
1658 s++;
1659 switch (*s++) {
1660
1661 /* \x escapes */
1662 case '\n': break;
1663 case '\\': *p++ = '\\'; break;
1664 case '\'': *p++ = '\''; break;
1665 case '\"': *p++ = '\"'; break;
1666 case 'b': *p++ = '\b'; break;
1667 case 'f': *p++ = '\014'; break; /* FF */
1668 case 't': *p++ = '\t'; break;
1669 case 'n': *p++ = '\n'; break;
1670 case 'r': *p++ = '\r'; break;
1671 case 'v': *p++ = '\013'; break; /* VT */
1672 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1673
1674 /* \OOO (octal) escapes */
1675 case '0': case '1': case '2': case '3':
1676 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001677 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001679 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001681 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001683 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 break;
1685
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686 /* hex escapes */
1687 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001689 digits = 2;
1690 message = "truncated \\xXX escape";
1691 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692
Fredrik Lundhccc74732001-02-18 22:13:49 +00001693 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001695 digits = 4;
1696 message = "truncated \\uXXXX escape";
1697 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698
Fredrik Lundhccc74732001-02-18 22:13:49 +00001699 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001700 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001701 digits = 8;
1702 message = "truncated \\UXXXXXXXX escape";
1703 hexescape:
1704 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001705 outpos = p-PyUnicode_AS_UNICODE(v);
1706 if (s+digits>end) {
1707 endinpos = size;
1708 if (unicode_decode_call_errorhandler(
1709 errors, &errorHandler,
1710 "unicodeescape", "end of string in escape sequence",
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 (PyObject **)&v, &outpos, &p))
1713 goto onError;
1714 goto nextByte;
1715 }
1716 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001718 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 endinpos = (s+i+1)-starts;
1720 if (unicode_decode_call_errorhandler(
1721 errors, &errorHandler,
1722 "unicodeescape", message,
1723 starts, size, &startinpos, &endinpos, &exc, &s,
1724 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001725 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001726 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001727 }
1728 chr = (chr<<4) & ~0xF;
1729 if (c >= '0' && c <= '9')
1730 chr += c - '0';
1731 else if (c >= 'a' && c <= 'f')
1732 chr += 10 + c - 'a';
1733 else
1734 chr += 10 + c - 'A';
1735 }
1736 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001737 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 /* _decoding_error will have already written into the
1739 target buffer. */
1740 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001741 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001742 /* when we get here, chr is a 32-bit unicode character */
1743 if (chr <= 0xffff)
1744 /* UCS-2 character */
1745 *p++ = (Py_UNICODE) chr;
1746 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001747 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001748 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001749#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 *p++ = chr;
1751#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001752 chr -= 0x10000L;
1753 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001754 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 endinpos = s-starts;
1758 outpos = p-PyUnicode_AS_UNICODE(v);
1759 if (unicode_decode_call_errorhandler(
1760 errors, &errorHandler,
1761 "unicodeescape", "illegal Unicode character",
1762 starts, size, &startinpos, &endinpos, &exc, &s,
1763 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001764 goto onError;
1765 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001766 break;
1767
1768 /* \N{name} */
1769 case 'N':
1770 message = "malformed \\N character escape";
1771 if (ucnhash_CAPI == NULL) {
1772 /* load the unicode data module */
1773 PyObject *m, *v;
1774 m = PyImport_ImportModule("unicodedata");
1775 if (m == NULL)
1776 goto ucnhashError;
1777 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1778 Py_DECREF(m);
1779 if (v == NULL)
1780 goto ucnhashError;
1781 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1782 Py_DECREF(v);
1783 if (ucnhash_CAPI == NULL)
1784 goto ucnhashError;
1785 }
1786 if (*s == '{') {
1787 const char *start = s+1;
1788 /* look for the closing brace */
1789 while (*s != '}' && s < end)
1790 s++;
1791 if (s > start && s < end && *s == '}') {
1792 /* found a name. look it up in the unicode database */
1793 message = "unknown Unicode character name";
1794 s++;
1795 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1796 goto store;
1797 }
1798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 endinpos = s-starts;
1800 outpos = p-PyUnicode_AS_UNICODE(v);
1801 if (unicode_decode_call_errorhandler(
1802 errors, &errorHandler,
1803 "unicodeescape", message,
1804 starts, size, &startinpos, &endinpos, &exc, &s,
1805 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001807 break;
1808
1809 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001810 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 message = "\\ at end of string";
1812 s--;
1813 endinpos = s-starts;
1814 outpos = p-PyUnicode_AS_UNICODE(v);
1815 if (unicode_decode_call_errorhandler(
1816 errors, &errorHandler,
1817 "unicodeescape", message,
1818 starts, size, &startinpos, &endinpos, &exc, &s,
1819 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001820 goto onError;
1821 }
1822 else {
1823 *p++ = '\\';
1824 *p++ = (unsigned char)s[-1];
1825 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001826 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 nextByte:
1829 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1832 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001834
Fredrik Lundhccc74732001-02-18 22:13:49 +00001835ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001836 PyErr_SetString(
1837 PyExc_UnicodeError,
1838 "\\N escapes not supported (can't load unicodedata module)"
1839 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001840 Py_XDECREF(errorHandler);
1841 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001842 return NULL;
1843
Fredrik Lundhccc74732001-02-18 22:13:49 +00001844onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 Py_XDECREF(errorHandler);
1847 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 return NULL;
1849}
1850
1851/* Return a Unicode-Escape string version of the Unicode object.
1852
1853 If quotes is true, the string is enclosed in u"" or u'' quotes as
1854 appropriate.
1855
1856*/
1857
Barry Warsaw51ac5802000-03-20 16:36:48 +00001858static const Py_UNICODE *findchar(const Py_UNICODE *s,
1859 int size,
1860 Py_UNICODE ch);
1861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862static
1863PyObject *unicodeescape_string(const Py_UNICODE *s,
1864 int size,
1865 int quotes)
1866{
1867 PyObject *repr;
1868 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001870 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871
1872 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1873 if (repr == NULL)
1874 return NULL;
1875
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001876 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877
1878 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 *p++ = 'u';
1880 *p++ = (findchar(s, size, '\'') &&
1881 !findchar(s, size, '"')) ? '"' : '\'';
1882 }
1883 while (size-- > 0) {
1884 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001887 if (quotes &&
1888 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 *p++ = '\\';
1890 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001891 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001893
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001894#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001895 /* Map 21-bit characters to '\U00xxxxxx' */
1896 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001897 int offset = p - PyString_AS_STRING(repr);
1898
1899 /* Resize the string if necessary */
1900 if (offset + 12 > PyString_GET_SIZE(repr)) {
1901 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001902 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903 p = PyString_AS_STRING(repr) + offset;
1904 }
1905
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001906 *p++ = '\\';
1907 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001908 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1909 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1910 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1911 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1912 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1913 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1914 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 *p++ = hexdigit[ch & 0x0000000F];
1916 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001918#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001919 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1920 else if (ch >= 0xD800 && ch < 0xDC00) {
1921 Py_UNICODE ch2;
1922 Py_UCS4 ucs;
1923
1924 ch2 = *s++;
1925 size--;
1926 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1927 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1928 *p++ = '\\';
1929 *p++ = 'U';
1930 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1931 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1932 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1933 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1934 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1935 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1936 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1937 *p++ = hexdigit[ucs & 0x0000000F];
1938 continue;
1939 }
1940 /* Fall through: isolated surrogates are copied as-is */
1941 s--;
1942 size++;
1943 }
1944
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001946 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 *p++ = '\\';
1948 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001949 *p++ = hexdigit[(ch >> 12) & 0x000F];
1950 *p++ = hexdigit[(ch >> 8) & 0x000F];
1951 *p++ = hexdigit[(ch >> 4) & 0x000F];
1952 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001954
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001955 /* Map special whitespace to '\t', \n', '\r' */
1956 else if (ch == '\t') {
1957 *p++ = '\\';
1958 *p++ = 't';
1959 }
1960 else if (ch == '\n') {
1961 *p++ = '\\';
1962 *p++ = 'n';
1963 }
1964 else if (ch == '\r') {
1965 *p++ = '\\';
1966 *p++ = 'r';
1967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001968
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001969 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001970 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001972 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001973 *p++ = hexdigit[(ch >> 4) & 0x000F];
1974 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001976
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 /* Copy everything else as-is */
1978 else
1979 *p++ = (char) ch;
1980 }
1981 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001985 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 return repr;
1987}
1988
1989PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1990 int size)
1991{
1992 return unicodeescape_string(s, size, 0);
1993}
1994
1995PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1996{
1997 if (!PyUnicode_Check(unicode)) {
1998 PyErr_BadArgument();
1999 return NULL;
2000 }
2001 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2002 PyUnicode_GET_SIZE(unicode));
2003}
2004
2005/* --- Raw Unicode Escape Codec ------------------------------------------- */
2006
2007PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2008 int size,
2009 const char *errors)
2010{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 const char *starts = s;
2012 int startinpos;
2013 int endinpos;
2014 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 const char *end;
2018 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 /* Escaped strings will always be longer than the resulting
2023 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 length after conversion to the true value. (But decoding error
2025 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 v = _PyUnicode_New(size);
2027 if (v == NULL)
2028 goto onError;
2029 if (size == 0)
2030 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 end = s + size;
2033 while (s < end) {
2034 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002035 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int i;
2037
2038 /* Non-escape characters are interpreted as Unicode ordinals */
2039 if (*s != '\\') {
2040 *p++ = (unsigned char)*s++;
2041 continue;
2042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* \u-escapes are only interpreted iff the number of leading
2046 backslashes if odd */
2047 bs = s;
2048 for (;s < end;) {
2049 if (*s != '\\')
2050 break;
2051 *p++ = (unsigned char)*s++;
2052 }
2053 if (((s - bs) & 1) == 0 ||
2054 s >= end ||
2055 *s != 'u') {
2056 continue;
2057 }
2058 p--;
2059 s++;
2060
2061 /* \uXXXX with 4 hex digits */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 outpos = p-PyUnicode_AS_UNICODE(v);
2063 for (x = 0, i = 0; i < 4; ++i, ++s) {
2064 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 endinpos = s-starts;
2067 if (unicode_decode_call_errorhandler(
2068 errors, &errorHandler,
2069 "rawunicodeescape", "truncated \\uXXXX",
2070 starts, size, &startinpos, &endinpos, &exc, &s,
2071 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002073 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 }
2075 x = (x<<4) & ~0xF;
2076 if (c >= '0' && c <= '9')
2077 x += c - '0';
2078 else if (c >= 'a' && c <= 'f')
2079 x += 10 + c - 'a';
2080 else
2081 x += 10 + c - 'A';
2082 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 *p++ = x;
2084 nextByte:
2085 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002087 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 Py_XDECREF(errorHandler);
2090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 return (PyObject *)v;
2092
2093 onError:
2094 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 Py_XDECREF(errorHandler);
2096 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 return NULL;
2098}
2099
2100PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2101 int size)
2102{
2103 PyObject *repr;
2104 char *p;
2105 char *q;
2106
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002107 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
2109 repr = PyString_FromStringAndSize(NULL, 6 * size);
2110 if (repr == NULL)
2111 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002112 if (size == 0)
2113 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114
2115 p = q = PyString_AS_STRING(repr);
2116 while (size-- > 0) {
2117 Py_UNICODE ch = *s++;
2118 /* Map 16-bit characters to '\uxxxx' */
2119 if (ch >= 256) {
2120 *p++ = '\\';
2121 *p++ = 'u';
2122 *p++ = hexdigit[(ch >> 12) & 0xf];
2123 *p++ = hexdigit[(ch >> 8) & 0xf];
2124 *p++ = hexdigit[(ch >> 4) & 0xf];
2125 *p++ = hexdigit[ch & 15];
2126 }
2127 /* Copy everything else as-is */
2128 else
2129 *p++ = (char) ch;
2130 }
2131 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002132 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 return repr;
2134}
2135
2136PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2137{
2138 if (!PyUnicode_Check(unicode)) {
2139 PyErr_BadArgument();
2140 return NULL;
2141 }
2142 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2143 PyUnicode_GET_SIZE(unicode));
2144}
2145
2146/* --- Latin-1 Codec ------------------------------------------------------ */
2147
2148PyObject *PyUnicode_DecodeLatin1(const char *s,
2149 int size,
2150 const char *errors)
2151{
2152 PyUnicodeObject *v;
2153 Py_UNICODE *p;
2154
2155 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 if (size == 1 && *(unsigned char*)s < 256) {
2157 Py_UNICODE r = *(unsigned char*)s;
2158 return PyUnicode_FromUnicode(&r, 1);
2159 }
2160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 v = _PyUnicode_New(size);
2162 if (v == NULL)
2163 goto onError;
2164 if (size == 0)
2165 return (PyObject *)v;
2166 p = PyUnicode_AS_UNICODE(v);
2167 while (size-- > 0)
2168 *p++ = (unsigned char)*s++;
2169 return (PyObject *)v;
2170
2171 onError:
2172 Py_XDECREF(v);
2173 return NULL;
2174}
2175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176/* create or adjust a UnicodeEncodeError */
2177static void make_encode_exception(PyObject **exceptionObject,
2178 const char *encoding,
2179 const Py_UNICODE *unicode, int size,
2180 int startpos, int endpos,
2181 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 if (*exceptionObject == NULL) {
2184 *exceptionObject = PyUnicodeEncodeError_Create(
2185 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 }
2187 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002188 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2189 goto onError;
2190 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2191 goto onError;
2192 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2193 goto onError;
2194 return;
2195 onError:
2196 Py_DECREF(*exceptionObject);
2197 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 }
2199}
2200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002201/* raises a UnicodeEncodeError */
2202static void raise_encode_exception(PyObject **exceptionObject,
2203 const char *encoding,
2204 const Py_UNICODE *unicode, int size,
2205 int startpos, int endpos,
2206 const char *reason)
2207{
2208 make_encode_exception(exceptionObject,
2209 encoding, unicode, size, startpos, endpos, reason);
2210 if (*exceptionObject != NULL)
2211 PyCodec_StrictErrors(*exceptionObject);
2212}
2213
2214/* error handling callback helper:
2215 build arguments, call the callback and check the arguments,
2216 put the result into newpos and return the replacement string, which
2217 has to be freed by the caller */
2218static PyObject *unicode_encode_call_errorhandler(const char *errors,
2219 PyObject **errorHandler,
2220 const char *encoding, const char *reason,
2221 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2222 int startpos, int endpos,
2223 int *newpos)
2224{
2225 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2226
2227 PyObject *restuple;
2228 PyObject *resunicode;
2229
2230 if (*errorHandler == NULL) {
2231 *errorHandler = PyCodec_LookupError(errors);
2232 if (*errorHandler == NULL)
2233 return NULL;
2234 }
2235
2236 make_encode_exception(exceptionObject,
2237 encoding, unicode, size, startpos, endpos, reason);
2238 if (*exceptionObject == NULL)
2239 return NULL;
2240
2241 restuple = PyObject_CallFunctionObjArgs(
2242 *errorHandler, *exceptionObject, NULL);
2243 if (restuple == NULL)
2244 return NULL;
2245 if (!PyTuple_Check(restuple)) {
2246 PyErr_Format(PyExc_TypeError, &argparse[4]);
2247 Py_DECREF(restuple);
2248 return NULL;
2249 }
2250 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2251 &resunicode, newpos)) {
2252 Py_DECREF(restuple);
2253 return NULL;
2254 }
2255 if (*newpos<0)
2256 *newpos = 0;
2257 else if (*newpos>size)
2258 *newpos = size;
2259 Py_INCREF(resunicode);
2260 Py_DECREF(restuple);
2261 return resunicode;
2262}
2263
2264static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2265 int size,
2266 const char *errors,
2267 int limit)
2268{
2269 /* output object */
2270 PyObject *res;
2271 /* pointers to the beginning and end+1 of input */
2272 const Py_UNICODE *startp = p;
2273 const Py_UNICODE *endp = p + size;
2274 /* pointer to the beginning of the unencodable characters */
2275 /* const Py_UNICODE *badp = NULL; */
2276 /* pointer into the output */
2277 char *str;
2278 /* current output position */
2279 int respos = 0;
2280 int ressize;
2281 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2282 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2283 PyObject *errorHandler = NULL;
2284 PyObject *exc = NULL;
2285 /* the following variable is used for caching string comparisons
2286 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2287 int known_errorHandler = -1;
2288
2289 /* allocate enough for a simple encoding without
2290 replacements, if we need more, we'll resize */
2291 res = PyString_FromStringAndSize(NULL, size);
2292 if (res == NULL)
2293 goto onError;
2294 if (size == 0)
2295 return res;
2296 str = PyString_AS_STRING(res);
2297 ressize = size;
2298
2299 while (p<endp) {
2300 Py_UNICODE c = *p;
2301
2302 /* can we encode this? */
2303 if (c<limit) {
2304 /* no overflow check, because we know that the space is enough */
2305 *str++ = (char)c;
2306 ++p;
2307 }
2308 else {
2309 int unicodepos = p-startp;
2310 int requiredsize;
2311 PyObject *repunicode;
2312 int repsize;
2313 int newpos;
2314 int respos;
2315 Py_UNICODE *uni2;
2316 /* startpos for collecting unencodable chars */
2317 const Py_UNICODE *collstart = p;
2318 const Py_UNICODE *collend = p;
2319 /* find all unecodable characters */
2320 while ((collend < endp) && ((*collend)>=limit))
2321 ++collend;
2322 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2323 if (known_errorHandler==-1) {
2324 if ((errors==NULL) || (!strcmp(errors, "strict")))
2325 known_errorHandler = 1;
2326 else if (!strcmp(errors, "replace"))
2327 known_errorHandler = 2;
2328 else if (!strcmp(errors, "ignore"))
2329 known_errorHandler = 3;
2330 else if (!strcmp(errors, "xmlcharrefreplace"))
2331 known_errorHandler = 4;
2332 else
2333 known_errorHandler = 0;
2334 }
2335 switch (known_errorHandler) {
2336 case 1: /* strict */
2337 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2338 goto onError;
2339 case 2: /* replace */
2340 while (collstart++<collend)
2341 *str++ = '?'; /* fall through */
2342 case 3: /* ignore */
2343 p = collend;
2344 break;
2345 case 4: /* xmlcharrefreplace */
2346 respos = str-PyString_AS_STRING(res);
2347 /* determine replacement size (temporarily (mis)uses p) */
2348 for (p = collstart, repsize = 0; p < collend; ++p) {
2349 if (*p<10)
2350 repsize += 2+1+1;
2351 else if (*p<100)
2352 repsize += 2+2+1;
2353 else if (*p<1000)
2354 repsize += 2+3+1;
2355 else if (*p<10000)
2356 repsize += 2+4+1;
2357 else if (*p<100000)
2358 repsize += 2+5+1;
2359 else if (*p<1000000)
2360 repsize += 2+6+1;
2361 else
2362 repsize += 2+7+1;
2363 }
2364 requiredsize = respos+repsize+(endp-collend);
2365 if (requiredsize > ressize) {
2366 if (requiredsize<2*ressize)
2367 requiredsize = 2*ressize;
2368 if (_PyString_Resize(&res, requiredsize))
2369 goto onError;
2370 str = PyString_AS_STRING(res) + respos;
2371 ressize = requiredsize;
2372 }
2373 /* generate replacement (temporarily (mis)uses p) */
2374 for (p = collstart; p < collend; ++p) {
2375 str += sprintf(str, "&#%d;", (int)*p);
2376 }
2377 p = collend;
2378 break;
2379 default:
2380 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2381 encoding, reason, startp, size, &exc,
2382 collstart-startp, collend-startp, &newpos);
2383 if (repunicode == NULL)
2384 goto onError;
2385 /* need more space? (at least enough for what we
2386 have+the replacement+the rest of the string, so
2387 we won't have to check space for encodable characters) */
2388 respos = str-PyString_AS_STRING(res);
2389 repsize = PyUnicode_GET_SIZE(repunicode);
2390 requiredsize = respos+repsize+(endp-collend);
2391 if (requiredsize > ressize) {
2392 if (requiredsize<2*ressize)
2393 requiredsize = 2*ressize;
2394 if (_PyString_Resize(&res, requiredsize)) {
2395 Py_DECREF(repunicode);
2396 goto onError;
2397 }
2398 str = PyString_AS_STRING(res) + respos;
2399 ressize = requiredsize;
2400 }
2401 /* check if there is anything unencodable in the replacement
2402 and copy it to the output */
2403 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2404 c = *uni2;
2405 if (c >= limit) {
2406 raise_encode_exception(&exc, encoding, startp, size,
2407 unicodepos, unicodepos+1, reason);
2408 Py_DECREF(repunicode);
2409 goto onError;
2410 }
2411 *str = (char)c;
2412 }
2413 p = startp + newpos;
2414 Py_DECREF(repunicode);
2415 }
2416 }
2417 }
2418 /* Resize if we allocated to much */
2419 respos = str-PyString_AS_STRING(res);
2420 if (respos<ressize)
2421 /* If this falls res will be NULL */
2422 _PyString_Resize(&res, respos);
2423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
2425 return res;
2426
2427 onError:
2428 Py_XDECREF(res);
2429 Py_XDECREF(errorHandler);
2430 Py_XDECREF(exc);
2431 return NULL;
2432}
2433
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2435 int size,
2436 const char *errors)
2437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002438 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439}
2440
2441PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2442{
2443 if (!PyUnicode_Check(unicode)) {
2444 PyErr_BadArgument();
2445 return NULL;
2446 }
2447 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2448 PyUnicode_GET_SIZE(unicode),
2449 NULL);
2450}
2451
2452/* --- 7-bit ASCII Codec -------------------------------------------------- */
2453
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454PyObject *PyUnicode_DecodeASCII(const char *s,
2455 int size,
2456 const char *errors)
2457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002458 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 PyUnicodeObject *v;
2460 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 int startinpos;
2462 int endinpos;
2463 int outpos;
2464 const char *e;
2465 PyObject *errorHandler = NULL;
2466 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467
2468 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002469 if (size == 1 && *(unsigned char*)s < 128) {
2470 Py_UNICODE r = *(unsigned char*)s;
2471 return PyUnicode_FromUnicode(&r, 1);
2472 }
2473
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 v = _PyUnicode_New(size);
2475 if (v == NULL)
2476 goto onError;
2477 if (size == 0)
2478 return (PyObject *)v;
2479 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 e = s + size;
2481 while (s < e) {
2482 register unsigned char c = (unsigned char)*s;
2483 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 ++s;
2486 }
2487 else {
2488 startinpos = s-starts;
2489 endinpos = startinpos + 1;
2490 outpos = p-PyUnicode_AS_UNICODE(v);
2491 if (unicode_decode_call_errorhandler(
2492 errors, &errorHandler,
2493 "ascii", "ordinal not in range(128)",
2494 starts, size, &startinpos, &endinpos, &exc, &s,
2495 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002499 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002500 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002501 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 Py_XDECREF(errorHandler);
2503 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 return (PyObject *)v;
2505
2506 onError:
2507 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002508 Py_XDECREF(errorHandler);
2509 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return NULL;
2511}
2512
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2514 int size,
2515 const char *errors)
2516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002517 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518}
2519
2520PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2521{
2522 if (!PyUnicode_Check(unicode)) {
2523 PyErr_BadArgument();
2524 return NULL;
2525 }
2526 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2527 PyUnicode_GET_SIZE(unicode),
2528 NULL);
2529}
2530
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002532
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002533/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002534
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002535PyObject *PyUnicode_DecodeMBCS(const char *s,
2536 int size,
2537 const char *errors)
2538{
2539 PyUnicodeObject *v;
2540 Py_UNICODE *p;
2541
2542 /* First get the size of the result */
2543 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002544 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002545 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2546
2547 v = _PyUnicode_New(usize);
2548 if (v == NULL)
2549 return NULL;
2550 if (usize == 0)
2551 return (PyObject *)v;
2552 p = PyUnicode_AS_UNICODE(v);
2553 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2554 Py_DECREF(v);
2555 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2556 }
2557
2558 return (PyObject *)v;
2559}
2560
2561PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2562 int size,
2563 const char *errors)
2564{
2565 PyObject *repr;
2566 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002567 DWORD mbcssize;
2568
2569 /* If there are no characters, bail now! */
2570 if (size==0)
2571 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002572
2573 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002574 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002575 if (mbcssize==0)
2576 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2577
2578 repr = PyString_FromStringAndSize(NULL, mbcssize);
2579 if (repr == NULL)
2580 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002581 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002582 return repr;
2583
2584 /* Do the conversion */
2585 s = PyString_AS_STRING(repr);
2586 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2587 Py_DECREF(repr);
2588 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2589 }
2590 return repr;
2591}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002592
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002593#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002594
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595/* --- Character Mapping Codec -------------------------------------------- */
2596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597PyObject *PyUnicode_DecodeCharmap(const char *s,
2598 int size,
2599 PyObject *mapping,
2600 const char *errors)
2601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002602 const char *starts = s;
2603 int startinpos;
2604 int endinpos;
2605 int outpos;
2606 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 PyUnicodeObject *v;
2608 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002609 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 PyObject *errorHandler = NULL;
2611 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612
2613 /* Default to Latin-1 */
2614 if (mapping == NULL)
2615 return PyUnicode_DecodeLatin1(s, size, errors);
2616
2617 v = _PyUnicode_New(size);
2618 if (v == NULL)
2619 goto onError;
2620 if (size == 0)
2621 return (PyObject *)v;
2622 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 e = s + size;
2624 while (s < e) {
2625 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 PyObject *w, *x;
2627
2628 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2629 w = PyInt_FromLong((long)ch);
2630 if (w == NULL)
2631 goto onError;
2632 x = PyObject_GetItem(mapping, w);
2633 Py_DECREF(w);
2634 if (x == NULL) {
2635 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002636 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002638 x = Py_None;
2639 Py_INCREF(x);
2640 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002641 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 }
2643
2644 /* Apply mapping */
2645 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002646 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 if (value < 0 || value > 65535) {
2648 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002649 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 Py_DECREF(x);
2651 goto onError;
2652 }
2653 *p++ = (Py_UNICODE)value;
2654 }
2655 else if (x == Py_None) {
2656 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 outpos = p-PyUnicode_AS_UNICODE(v);
2658 startinpos = s-starts;
2659 endinpos = startinpos+1;
2660 if (unicode_decode_call_errorhandler(
2661 errors, &errorHandler,
2662 "charmap", "character maps to <undefined>",
2663 starts, size, &startinpos, &endinpos, &exc, &s,
2664 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 Py_DECREF(x);
2666 goto onError;
2667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 }
2670 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002671 int targetsize = PyUnicode_GET_SIZE(x);
2672
2673 if (targetsize == 1)
2674 /* 1-1 mapping */
2675 *p++ = *PyUnicode_AS_UNICODE(x);
2676
2677 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002679 if (targetsize > extrachars) {
2680 /* resize first */
2681 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2682 int needed = (targetsize - extrachars) + \
2683 (targetsize << 2);
2684 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002685 if (_PyUnicode_Resize(&v,
2686 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002687 Py_DECREF(x);
2688 goto onError;
2689 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002690 p = PyUnicode_AS_UNICODE(v) + oldpos;
2691 }
2692 Py_UNICODE_COPY(p,
2693 PyUnicode_AS_UNICODE(x),
2694 targetsize);
2695 p += targetsize;
2696 extrachars -= targetsize;
2697 }
2698 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
2700 else {
2701 /* wrong return value */
2702 PyErr_SetString(PyExc_TypeError,
2703 "character mapping must return integer, None or unicode");
2704 Py_DECREF(x);
2705 goto onError;
2706 }
2707 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 }
2710 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002711 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_XDECREF(errorHandler);
2714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 return (PyObject *)v;
2716
2717 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 Py_XDECREF(errorHandler);
2719 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 Py_XDECREF(v);
2721 return NULL;
2722}
2723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724/* Lookup the character ch in the mapping. If the character
2725 can't be found, Py_None is returned (or NULL, if another
2726 error occured). */
2727static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 PyObject *w = PyInt_FromLong((long)c);
2730 PyObject *x;
2731
2732 if (w == NULL)
2733 return NULL;
2734 x = PyObject_GetItem(mapping, w);
2735 Py_DECREF(w);
2736 if (x == NULL) {
2737 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2738 /* No mapping found means: mapping is undefined. */
2739 PyErr_Clear();
2740 x = Py_None;
2741 Py_INCREF(x);
2742 return x;
2743 } else
2744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 else if (PyInt_Check(x)) {
2747 long value = PyInt_AS_LONG(x);
2748 if (value < 0 || value > 255) {
2749 PyErr_SetString(PyExc_TypeError,
2750 "character mapping must be in range(256)");
2751 Py_DECREF(x);
2752 return NULL;
2753 }
2754 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 else if (PyString_Check(x))
2757 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 /* wrong return value */
2760 PyErr_SetString(PyExc_TypeError,
2761 "character mapping must return integer, None or str");
2762 Py_DECREF(x);
2763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
2765}
2766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767/* lookup the character, put the result in the output string and adjust
2768 various state variables. Reallocate the output string if not enough
2769 space is available. Return a new reference to the object that
2770 was put in the output buffer, or Py_None, if the mapping was undefined
2771 (in which case no character was written) or NULL, if a
2772 reallocation error ocurred. The called must decref the result */
2773static
2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2775 PyObject **outobj, int *outpos)
2776{
2777 PyObject *rep = charmapencode_lookup(c, mapping);
2778
2779 if (rep==NULL)
2780 return NULL;
2781 else if (rep==Py_None)
2782 return rep;
2783 else {
2784 char *outstart = PyString_AS_STRING(*outobj);
2785 int outsize = PyString_GET_SIZE(*outobj);
2786 if (PyInt_Check(rep)) {
2787 int requiredsize = *outpos+1;
2788 if (outsize<requiredsize) {
2789 /* exponentially overallocate to minimize reallocations */
2790 if (requiredsize < 2*outsize)
2791 requiredsize = 2*outsize;
2792 if (_PyString_Resize(outobj, requiredsize)) {
2793 Py_DECREF(rep);
2794 return NULL;
2795 }
2796 outstart = PyString_AS_STRING(*outobj);
2797 }
2798 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2799 }
2800 else {
2801 const char *repchars = PyString_AS_STRING(rep);
2802 int repsize = PyString_GET_SIZE(rep);
2803 int requiredsize = *outpos+repsize;
2804 if (outsize<requiredsize) {
2805 /* exponentially overallocate to minimize reallocations */
2806 if (requiredsize < 2*outsize)
2807 requiredsize = 2*outsize;
2808 if (_PyString_Resize(outobj, requiredsize)) {
2809 Py_DECREF(rep);
2810 return NULL;
2811 }
2812 outstart = PyString_AS_STRING(*outobj);
2813 }
2814 memcpy(outstart + *outpos, repchars, repsize);
2815 *outpos += repsize;
2816 }
2817 }
2818 return rep;
2819}
2820
2821/* handle an error in PyUnicode_EncodeCharmap
2822 Return 0 on success, -1 on error */
2823static
2824int charmap_encoding_error(
2825 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2826 PyObject **exceptionObject,
2827 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2828 PyObject **res, int *respos)
2829{
2830 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2831 int repsize;
2832 int newpos;
2833 Py_UNICODE *uni2;
2834 /* startpos for collecting unencodable chars */
2835 int collstartpos = *inpos;
2836 int collendpos = *inpos+1;
2837 int collpos;
2838 char *encoding = "charmap";
2839 char *reason = "character maps to <undefined>";
2840
2841 PyObject *x;
2842 /* find all unencodable characters */
2843 while (collendpos < size) {
2844 x = charmapencode_lookup(p[collendpos], mapping);
2845 if (x==NULL)
2846 return -1;
2847 else if (x!=Py_None) {
2848 Py_DECREF(x);
2849 break;
2850 }
2851 Py_DECREF(x);
2852 ++collendpos;
2853 }
2854 /* cache callback name lookup
2855 * (if not done yet, i.e. it's the first error) */
2856 if (*known_errorHandler==-1) {
2857 if ((errors==NULL) || (!strcmp(errors, "strict")))
2858 *known_errorHandler = 1;
2859 else if (!strcmp(errors, "replace"))
2860 *known_errorHandler = 2;
2861 else if (!strcmp(errors, "ignore"))
2862 *known_errorHandler = 3;
2863 else if (!strcmp(errors, "xmlcharrefreplace"))
2864 *known_errorHandler = 4;
2865 else
2866 *known_errorHandler = 0;
2867 }
2868 switch (*known_errorHandler) {
2869 case 1: /* strict */
2870 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2871 return -1;
2872 case 2: /* replace */
2873 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2874 x = charmapencode_output('?', mapping, res, respos);
2875 if (x==NULL) {
2876 return -1;
2877 }
2878 else if (x==Py_None) {
2879 Py_DECREF(x);
2880 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2881 return -1;
2882 }
2883 Py_DECREF(x);
2884 }
2885 /* fall through */
2886 case 3: /* ignore */
2887 *inpos = collendpos;
2888 break;
2889 case 4: /* xmlcharrefreplace */
2890 /* generate replacement (temporarily (mis)uses p) */
2891 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2892 char buffer[2+29+1+1];
2893 char *cp;
2894 sprintf(buffer, "&#%d;", (int)p[collpos]);
2895 for (cp = buffer; *cp; ++cp) {
2896 x = charmapencode_output(*cp, mapping, res, respos);
2897 if (x==NULL)
2898 return -1;
2899 else if (x==Py_None) {
2900 Py_DECREF(x);
2901 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2902 return -1;
2903 }
2904 Py_DECREF(x);
2905 }
2906 }
2907 *inpos = collendpos;
2908 break;
2909 default:
2910 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2911 encoding, reason, p, size, exceptionObject,
2912 collstartpos, collendpos, &newpos);
2913 if (repunicode == NULL)
2914 return -1;
2915 /* generate replacement */
2916 repsize = PyUnicode_GET_SIZE(repunicode);
2917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2918 x = charmapencode_output(*uni2, mapping, res, respos);
2919 if (x==NULL) {
2920 Py_DECREF(repunicode);
2921 return -1;
2922 }
2923 else if (x==Py_None) {
2924 Py_DECREF(repunicode);
2925 Py_DECREF(x);
2926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2927 return -1;
2928 }
2929 Py_DECREF(x);
2930 }
2931 *inpos = newpos;
2932 Py_DECREF(repunicode);
2933 }
2934 return 0;
2935}
2936
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2938 int size,
2939 PyObject *mapping,
2940 const char *errors)
2941{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 /* output object */
2943 PyObject *res = NULL;
2944 /* current input position */
2945 int inpos = 0;
2946 /* current output position */
2947 int respos = 0;
2948 PyObject *errorHandler = NULL;
2949 PyObject *exc = NULL;
2950 /* the following variable is used for caching string comparisons
2951 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2952 * 3=ignore, 4=xmlcharrefreplace */
2953 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
2955 /* Default to Latin-1 */
2956 if (mapping == NULL)
2957 return PyUnicode_EncodeLatin1(p, size, errors);
2958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 /* allocate enough for a simple encoding without
2960 replacements, if we need more, we'll resize */
2961 res = PyString_FromStringAndSize(NULL, size);
2962 if (res == NULL)
2963 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002964 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967 while (inpos<size) {
2968 /* try to encode it */
2969 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2970 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 if (x==Py_None) { /* unencodable character */
2973 if (charmap_encoding_error(p, size, &inpos, mapping,
2974 &exc,
2975 &known_errorHandler, errorHandler, errors,
2976 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 else
2980 /* done with this character => adjust input position */
2981 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 Py_DECREF(x);
2983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 /* Resize if we allocated to much */
2986 if (respos<PyString_GET_SIZE(res)) {
2987 if (_PyString_Resize(&res, respos))
2988 goto onError;
2989 }
2990 Py_XDECREF(exc);
2991 Py_XDECREF(errorHandler);
2992 return res;
2993
2994 onError:
2995 Py_XDECREF(res);
2996 Py_XDECREF(exc);
2997 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 return NULL;
2999}
3000
3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3002 PyObject *mapping)
3003{
3004 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3005 PyErr_BadArgument();
3006 return NULL;
3007 }
3008 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3009 PyUnicode_GET_SIZE(unicode),
3010 mapping,
3011 NULL);
3012}
3013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014/* create or adjust a UnicodeTranslateError */
3015static void make_translate_exception(PyObject **exceptionObject,
3016 const Py_UNICODE *unicode, int size,
3017 int startpos, int endpos,
3018 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 if (*exceptionObject == NULL) {
3021 *exceptionObject = PyUnicodeTranslateError_Create(
3022 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 }
3024 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3026 goto onError;
3027 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3028 goto onError;
3029 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3030 goto onError;
3031 return;
3032 onError:
3033 Py_DECREF(*exceptionObject);
3034 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 }
3036}
3037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038/* raises a UnicodeTranslateError */
3039static void raise_translate_exception(PyObject **exceptionObject,
3040 const Py_UNICODE *unicode, int size,
3041 int startpos, int endpos,
3042 const char *reason)
3043{
3044 make_translate_exception(exceptionObject,
3045 unicode, size, startpos, endpos, reason);
3046 if (*exceptionObject != NULL)
3047 PyCodec_StrictErrors(*exceptionObject);
3048}
3049
3050/* error handling callback helper:
3051 build arguments, call the callback and check the arguments,
3052 put the result into newpos and return the replacement string, which
3053 has to be freed by the caller */
3054static PyObject *unicode_translate_call_errorhandler(const char *errors,
3055 PyObject **errorHandler,
3056 const char *reason,
3057 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3058 int startpos, int endpos,
3059 int *newpos)
3060{
3061 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3062
3063 PyObject *restuple;
3064 PyObject *resunicode;
3065
3066 if (*errorHandler == NULL) {
3067 *errorHandler = PyCodec_LookupError(errors);
3068 if (*errorHandler == NULL)
3069 return NULL;
3070 }
3071
3072 make_translate_exception(exceptionObject,
3073 unicode, size, startpos, endpos, reason);
3074 if (*exceptionObject == NULL)
3075 return NULL;
3076
3077 restuple = PyObject_CallFunctionObjArgs(
3078 *errorHandler, *exceptionObject, NULL);
3079 if (restuple == NULL)
3080 return NULL;
3081 if (!PyTuple_Check(restuple)) {
3082 PyErr_Format(PyExc_TypeError, &argparse[4]);
3083 Py_DECREF(restuple);
3084 return NULL;
3085 }
3086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3087 &resunicode, newpos)) {
3088 Py_DECREF(restuple);
3089 return NULL;
3090 }
3091 if (*newpos<0)
3092 *newpos = 0;
3093 else if (*newpos>size)
3094 *newpos = size;
3095 Py_INCREF(resunicode);
3096 Py_DECREF(restuple);
3097 return resunicode;
3098}
3099
3100/* Lookup the character ch in the mapping and put the result in result,
3101 which must be decrefed by the caller.
3102 Return 0 on success, -1 on error */
3103static
3104int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3105{
3106 PyObject *w = PyInt_FromLong((long)c);
3107 PyObject *x;
3108
3109 if (w == NULL)
3110 return -1;
3111 x = PyObject_GetItem(mapping, w);
3112 Py_DECREF(w);
3113 if (x == NULL) {
3114 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3115 /* No mapping found means: use 1:1 mapping. */
3116 PyErr_Clear();
3117 *result = NULL;
3118 return 0;
3119 } else
3120 return -1;
3121 }
3122 else if (x == Py_None) {
3123 *result = x;
3124 return 0;
3125 }
3126 else if (PyInt_Check(x)) {
3127 long value = PyInt_AS_LONG(x);
3128 long max = PyUnicode_GetMax();
3129 if (value < 0 || value > max) {
3130 PyErr_Format(PyExc_TypeError,
3131 "character mapping must be in range(0x%lx)", max+1);
3132 Py_DECREF(x);
3133 return -1;
3134 }
3135 *result = x;
3136 return 0;
3137 }
3138 else if (PyUnicode_Check(x)) {
3139 *result = x;
3140 return 0;
3141 }
3142 else {
3143 /* wrong return value */
3144 PyErr_SetString(PyExc_TypeError,
3145 "character mapping must return integer, None or unicode");
3146 return -1;
3147 }
3148}
3149/* ensure that *outobj is at least requiredsize characters long,
3150if not reallocate and adjust various state variables.
3151Return 0 on success, -1 on error */
3152static
3153int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3154 int requiredsize)
3155{
3156 if (requiredsize > *outsize) {
3157 /* remember old output position */
3158 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3159 /* exponentially overallocate to minimize reallocations */
3160 if (requiredsize < 2 * *outsize)
3161 requiredsize = 2 * *outsize;
3162 if (_PyUnicode_Resize(outobj, requiredsize))
3163 return -1;
3164 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3165 *outsize = requiredsize;
3166 }
3167 return 0;
3168}
3169/* lookup the character, put the result in the output string and adjust
3170 various state variables. Return a new reference to the object that
3171 was put in the output buffer in *result, or Py_None, if the mapping was
3172 undefined (in which case no character was written).
3173 The called must decref result.
3174 Return 0 on success, -1 on error. */
3175static
3176int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3177 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3178{
3179 if (charmaptranslate_lookup(c, mapping, res))
3180 return -1;
3181 if (*res==NULL) {
3182 /* not found => default to 1:1 mapping */
3183 *(*outp)++ = (Py_UNICODE)c;
3184 }
3185 else if (*res==Py_None)
3186 ;
3187 else if (PyInt_Check(*res)) {
3188 /* no overflow check, because we know that the space is enough */
3189 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3190 }
3191 else if (PyUnicode_Check(*res)) {
3192 int repsize = PyUnicode_GET_SIZE(*res);
3193 if (repsize==1) {
3194 /* no overflow check, because we know that the space is enough */
3195 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3196 }
3197 else if (repsize!=0) {
3198 /* more than one character */
3199 int requiredsize = *outsize + repsize - 1;
3200 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3201 return -1;
3202 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3203 *outp += repsize;
3204 }
3205 }
3206 else
3207 return -1;
3208 return 0;
3209}
3210
3211PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 int size,
3213 PyObject *mapping,
3214 const char *errors)
3215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 /* output object */
3217 PyObject *res = NULL;
3218 /* pointers to the beginning and end+1 of input */
3219 const Py_UNICODE *startp = p;
3220 const Py_UNICODE *endp = p + size;
3221 /* pointer into the output */
3222 Py_UNICODE *str;
3223 /* current output position */
3224 int respos = 0;
3225 int ressize;
3226 char *reason = "character maps to <undefined>";
3227 PyObject *errorHandler = NULL;
3228 PyObject *exc = NULL;
3229 /* the following variable is used for caching string comparisons
3230 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3231 * 3=ignore, 4=xmlcharrefreplace */
3232 int known_errorHandler = -1;
3233
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 if (mapping == NULL) {
3235 PyErr_BadArgument();
3236 return NULL;
3237 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238
3239 /* allocate enough for a simple 1:1 translation without
3240 replacements, if we need more, we'll resize */
3241 res = PyUnicode_FromUnicode(NULL, size);
3242 if (res == NULL)
3243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 return res;
3246 str = PyUnicode_AS_UNICODE(res);
3247 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 while (p<endp) {
3250 /* try to encode it */
3251 PyObject *x = NULL;
3252 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3253 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 goto onError;
3255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 if (x!=Py_None) /* it worked => adjust input pointer */
3257 ++p;
3258 else { /* untranslatable character */
3259 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3260 int repsize;
3261 int newpos;
3262 Py_UNICODE *uni2;
3263 /* startpos for collecting untranslatable chars */
3264 const Py_UNICODE *collstart = p;
3265 const Py_UNICODE *collend = p+1;
3266 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 Py_XDECREF(x);
3269 /* find all untranslatable characters */
3270 while (collend < endp) {
3271 if (charmaptranslate_lookup(*collend, mapping, &x))
3272 goto onError;
3273 Py_XDECREF(x);
3274 if (x!=Py_None)
3275 break;
3276 ++collend;
3277 }
3278 /* cache callback name lookup
3279 * (if not done yet, i.e. it's the first error) */
3280 if (known_errorHandler==-1) {
3281 if ((errors==NULL) || (!strcmp(errors, "strict")))
3282 known_errorHandler = 1;
3283 else if (!strcmp(errors, "replace"))
3284 known_errorHandler = 2;
3285 else if (!strcmp(errors, "ignore"))
3286 known_errorHandler = 3;
3287 else if (!strcmp(errors, "xmlcharrefreplace"))
3288 known_errorHandler = 4;
3289 else
3290 known_errorHandler = 0;
3291 }
3292 switch (known_errorHandler) {
3293 case 1: /* strict */
3294 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3295 goto onError;
3296 case 2: /* replace */
3297 /* No need to check for space, this is a 1:1 replacement */
3298 for (coll = collstart; coll<collend; ++coll)
3299 *str++ = '?';
3300 /* fall through */
3301 case 3: /* ignore */
3302 p = collend;
3303 break;
3304 case 4: /* xmlcharrefreplace */
3305 /* generate replacement (temporarily (mis)uses p) */
3306 for (p = collstart; p < collend; ++p) {
3307 char buffer[2+29+1+1];
3308 char *cp;
3309 sprintf(buffer, "&#%d;", (int)*p);
3310 if (charmaptranslate_makespace(&res, &str, &ressize,
3311 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3312 goto onError;
3313 for (cp = buffer; *cp; ++cp)
3314 *str++ = *cp;
3315 }
3316 p = collend;
3317 break;
3318 default:
3319 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3320 reason, startp, size, &exc,
3321 collstart-startp, collend-startp, &newpos);
3322 if (repunicode == NULL)
3323 goto onError;
3324 /* generate replacement */
3325 repsize = PyUnicode_GET_SIZE(repunicode);
3326 if (charmaptranslate_makespace(&res, &str, &ressize,
3327 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3328 Py_DECREF(repunicode);
3329 goto onError;
3330 }
3331 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3332 *str++ = *uni2;
3333 p = startp + newpos;
3334 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
3336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 /* Resize if we allocated to much */
3339 respos = str-PyUnicode_AS_UNICODE(res);
3340 if (respos<ressize) {
3341 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003342 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 }
3344 Py_XDECREF(exc);
3345 Py_XDECREF(errorHandler);
3346 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 onError:
3349 Py_XDECREF(res);
3350 Py_XDECREF(exc);
3351 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return NULL;
3353}
3354
3355PyObject *PyUnicode_Translate(PyObject *str,
3356 PyObject *mapping,
3357 const char *errors)
3358{
3359 PyObject *result;
3360
3361 str = PyUnicode_FromObject(str);
3362 if (str == NULL)
3363 goto onError;
3364 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3365 PyUnicode_GET_SIZE(str),
3366 mapping,
3367 errors);
3368 Py_DECREF(str);
3369 return result;
3370
3371 onError:
3372 Py_XDECREF(str);
3373 return NULL;
3374}
3375
Guido van Rossum9e896b32000-04-05 20:11:21 +00003376/* --- Decimal Encoder ---------------------------------------------------- */
3377
3378int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3379 int length,
3380 char *output,
3381 const char *errors)
3382{
3383 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 PyObject *errorHandler = NULL;
3385 PyObject *exc = NULL;
3386 const char *encoding = "decimal";
3387 const char *reason = "invalid decimal Unicode string";
3388 /* the following variable is used for caching string comparisons
3389 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3390 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003391
3392 if (output == NULL) {
3393 PyErr_BadArgument();
3394 return -1;
3395 }
3396
3397 p = s;
3398 end = s + length;
3399 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003401 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 PyObject *repunicode;
3403 int repsize;
3404 int newpos;
3405 Py_UNICODE *uni2;
3406 Py_UNICODE *collstart;
3407 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003408
3409 if (Py_UNICODE_ISSPACE(ch)) {
3410 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003412 continue;
3413 }
3414 decimal = Py_UNICODE_TODECIMAL(ch);
3415 if (decimal >= 0) {
3416 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003418 continue;
3419 }
Guido van Rossumba477042000-04-06 18:18:10 +00003420 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003421 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003423 continue;
3424 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 /* All other characters are considered unencodable */
3426 collstart = p;
3427 collend = p+1;
3428 while (collend < end) {
3429 if ((0 < *collend && *collend < 256) ||
3430 !Py_UNICODE_ISSPACE(*collend) ||
3431 Py_UNICODE_TODECIMAL(*collend))
3432 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 /* cache callback name lookup
3435 * (if not done yet, i.e. it's the first error) */
3436 if (known_errorHandler==-1) {
3437 if ((errors==NULL) || (!strcmp(errors, "strict")))
3438 known_errorHandler = 1;
3439 else if (!strcmp(errors, "replace"))
3440 known_errorHandler = 2;
3441 else if (!strcmp(errors, "ignore"))
3442 known_errorHandler = 3;
3443 else if (!strcmp(errors, "xmlcharrefreplace"))
3444 known_errorHandler = 4;
3445 else
3446 known_errorHandler = 0;
3447 }
3448 switch (known_errorHandler) {
3449 case 1: /* strict */
3450 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3451 goto onError;
3452 case 2: /* replace */
3453 for (p = collstart; p < collend; ++p)
3454 *output++ = '?';
3455 /* fall through */
3456 case 3: /* ignore */
3457 p = collend;
3458 break;
3459 case 4: /* xmlcharrefreplace */
3460 /* generate replacement (temporarily (mis)uses p) */
3461 for (p = collstart; p < collend; ++p)
3462 output += sprintf(output, "&#%d;", (int)*p);
3463 p = collend;
3464 break;
3465 default:
3466 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3467 encoding, reason, s, length, &exc,
3468 collstart-s, collend-s, &newpos);
3469 if (repunicode == NULL)
3470 goto onError;
3471 /* generate replacement */
3472 repsize = PyUnicode_GET_SIZE(repunicode);
3473 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3474 Py_UNICODE ch = *uni2;
3475 if (Py_UNICODE_ISSPACE(ch))
3476 *output++ = ' ';
3477 else {
3478 decimal = Py_UNICODE_TODECIMAL(ch);
3479 if (decimal >= 0)
3480 *output++ = '0' + decimal;
3481 else if (0 < ch && ch < 256)
3482 *output++ = (char)ch;
3483 else {
3484 Py_DECREF(repunicode);
3485 raise_encode_exception(&exc, encoding,
3486 s, length, collstart-s, collend-s, reason);
3487 goto onError;
3488 }
3489 }
3490 }
3491 p = s + newpos;
3492 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003493 }
3494 }
3495 /* 0-terminate the output string */
3496 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 Py_XDECREF(exc);
3498 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003499 return 0;
3500
3501 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 Py_XDECREF(exc);
3503 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003504 return -1;
3505}
3506
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507/* --- Helpers ------------------------------------------------------------ */
3508
3509static
3510int count(PyUnicodeObject *self,
3511 int start,
3512 int end,
3513 PyUnicodeObject *substring)
3514{
3515 int count = 0;
3516
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003517 if (start < 0)
3518 start += self->length;
3519 if (start < 0)
3520 start = 0;
3521 if (end > self->length)
3522 end = self->length;
3523 if (end < 0)
3524 end += self->length;
3525 if (end < 0)
3526 end = 0;
3527
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003528 if (substring->length == 0)
3529 return (end - start + 1);
3530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 end -= substring->length;
3532
3533 while (start <= end)
3534 if (Py_UNICODE_MATCH(self, start, substring)) {
3535 count++;
3536 start += substring->length;
3537 } else
3538 start++;
3539
3540 return count;
3541}
3542
3543int PyUnicode_Count(PyObject *str,
3544 PyObject *substr,
3545 int start,
3546 int end)
3547{
3548 int result;
3549
3550 str = PyUnicode_FromObject(str);
3551 if (str == NULL)
3552 return -1;
3553 substr = PyUnicode_FromObject(substr);
3554 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003555 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return -1;
3557 }
3558
3559 result = count((PyUnicodeObject *)str,
3560 start, end,
3561 (PyUnicodeObject *)substr);
3562
3563 Py_DECREF(str);
3564 Py_DECREF(substr);
3565 return result;
3566}
3567
3568static
3569int findstring(PyUnicodeObject *self,
3570 PyUnicodeObject *substring,
3571 int start,
3572 int end,
3573 int direction)
3574{
3575 if (start < 0)
3576 start += self->length;
3577 if (start < 0)
3578 start = 0;
3579
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 if (end > self->length)
3581 end = self->length;
3582 if (end < 0)
3583 end += self->length;
3584 if (end < 0)
3585 end = 0;
3586
Guido van Rossum76afbd92002-08-20 17:29:29 +00003587 if (substring->length == 0)
3588 return (direction > 0) ? start : end;
3589
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 end -= substring->length;
3591
3592 if (direction < 0) {
3593 for (; end >= start; end--)
3594 if (Py_UNICODE_MATCH(self, end, substring))
3595 return end;
3596 } else {
3597 for (; start <= end; start++)
3598 if (Py_UNICODE_MATCH(self, start, substring))
3599 return start;
3600 }
3601
3602 return -1;
3603}
3604
3605int PyUnicode_Find(PyObject *str,
3606 PyObject *substr,
3607 int start,
3608 int end,
3609 int direction)
3610{
3611 int result;
3612
3613 str = PyUnicode_FromObject(str);
3614 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003615 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 substr = PyUnicode_FromObject(substr);
3617 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003618 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003619 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621
3622 result = findstring((PyUnicodeObject *)str,
3623 (PyUnicodeObject *)substr,
3624 start, end, direction);
3625 Py_DECREF(str);
3626 Py_DECREF(substr);
3627 return result;
3628}
3629
3630static
3631int tailmatch(PyUnicodeObject *self,
3632 PyUnicodeObject *substring,
3633 int start,
3634 int end,
3635 int direction)
3636{
3637 if (start < 0)
3638 start += self->length;
3639 if (start < 0)
3640 start = 0;
3641
3642 if (substring->length == 0)
3643 return 1;
3644
3645 if (end > self->length)
3646 end = self->length;
3647 if (end < 0)
3648 end += self->length;
3649 if (end < 0)
3650 end = 0;
3651
3652 end -= substring->length;
3653 if (end < start)
3654 return 0;
3655
3656 if (direction > 0) {
3657 if (Py_UNICODE_MATCH(self, end, substring))
3658 return 1;
3659 } else {
3660 if (Py_UNICODE_MATCH(self, start, substring))
3661 return 1;
3662 }
3663
3664 return 0;
3665}
3666
3667int PyUnicode_Tailmatch(PyObject *str,
3668 PyObject *substr,
3669 int start,
3670 int end,
3671 int direction)
3672{
3673 int result;
3674
3675 str = PyUnicode_FromObject(str);
3676 if (str == NULL)
3677 return -1;
3678 substr = PyUnicode_FromObject(substr);
3679 if (substr == NULL) {
3680 Py_DECREF(substr);
3681 return -1;
3682 }
3683
3684 result = tailmatch((PyUnicodeObject *)str,
3685 (PyUnicodeObject *)substr,
3686 start, end, direction);
3687 Py_DECREF(str);
3688 Py_DECREF(substr);
3689 return result;
3690}
3691
3692static
3693const Py_UNICODE *findchar(const Py_UNICODE *s,
3694 int size,
3695 Py_UNICODE ch)
3696{
3697 /* like wcschr, but doesn't stop at NULL characters */
3698
3699 while (size-- > 0) {
3700 if (*s == ch)
3701 return s;
3702 s++;
3703 }
3704
3705 return NULL;
3706}
3707
3708/* Apply fixfct filter to the Unicode object self and return a
3709 reference to the modified object */
3710
3711static
3712PyObject *fixup(PyUnicodeObject *self,
3713 int (*fixfct)(PyUnicodeObject *s))
3714{
3715
3716 PyUnicodeObject *u;
3717
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003718 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 if (u == NULL)
3720 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003721
3722 Py_UNICODE_COPY(u->str, self->str, self->length);
3723
Tim Peters7a29bd52001-09-12 03:03:31 +00003724 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 /* fixfct should return TRUE if it modified the buffer. If
3726 FALSE, return a reference to the original buffer instead
3727 (to save space, not time) */
3728 Py_INCREF(self);
3729 Py_DECREF(u);
3730 return (PyObject*) self;
3731 }
3732 return (PyObject*) u;
3733}
3734
3735static
3736int fixupper(PyUnicodeObject *self)
3737{
3738 int len = self->length;
3739 Py_UNICODE *s = self->str;
3740 int status = 0;
3741
3742 while (len-- > 0) {
3743 register Py_UNICODE ch;
3744
3745 ch = Py_UNICODE_TOUPPER(*s);
3746 if (ch != *s) {
3747 status = 1;
3748 *s = ch;
3749 }
3750 s++;
3751 }
3752
3753 return status;
3754}
3755
3756static
3757int fixlower(PyUnicodeObject *self)
3758{
3759 int len = self->length;
3760 Py_UNICODE *s = self->str;
3761 int status = 0;
3762
3763 while (len-- > 0) {
3764 register Py_UNICODE ch;
3765
3766 ch = Py_UNICODE_TOLOWER(*s);
3767 if (ch != *s) {
3768 status = 1;
3769 *s = ch;
3770 }
3771 s++;
3772 }
3773
3774 return status;
3775}
3776
3777static
3778int fixswapcase(PyUnicodeObject *self)
3779{
3780 int len = self->length;
3781 Py_UNICODE *s = self->str;
3782 int status = 0;
3783
3784 while (len-- > 0) {
3785 if (Py_UNICODE_ISUPPER(*s)) {
3786 *s = Py_UNICODE_TOLOWER(*s);
3787 status = 1;
3788 } else if (Py_UNICODE_ISLOWER(*s)) {
3789 *s = Py_UNICODE_TOUPPER(*s);
3790 status = 1;
3791 }
3792 s++;
3793 }
3794
3795 return status;
3796}
3797
3798static
3799int fixcapitalize(PyUnicodeObject *self)
3800{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003801 int len = self->length;
3802 Py_UNICODE *s = self->str;
3803 int status = 0;
3804
3805 if (len == 0)
3806 return 0;
3807 if (Py_UNICODE_ISLOWER(*s)) {
3808 *s = Py_UNICODE_TOUPPER(*s);
3809 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003811 s++;
3812 while (--len > 0) {
3813 if (Py_UNICODE_ISUPPER(*s)) {
3814 *s = Py_UNICODE_TOLOWER(*s);
3815 status = 1;
3816 }
3817 s++;
3818 }
3819 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820}
3821
3822static
3823int fixtitle(PyUnicodeObject *self)
3824{
3825 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3826 register Py_UNICODE *e;
3827 int previous_is_cased;
3828
3829 /* Shortcut for single character strings */
3830 if (PyUnicode_GET_SIZE(self) == 1) {
3831 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3832 if (*p != ch) {
3833 *p = ch;
3834 return 1;
3835 }
3836 else
3837 return 0;
3838 }
3839
3840 e = p + PyUnicode_GET_SIZE(self);
3841 previous_is_cased = 0;
3842 for (; p < e; p++) {
3843 register const Py_UNICODE ch = *p;
3844
3845 if (previous_is_cased)
3846 *p = Py_UNICODE_TOLOWER(ch);
3847 else
3848 *p = Py_UNICODE_TOTITLE(ch);
3849
3850 if (Py_UNICODE_ISLOWER(ch) ||
3851 Py_UNICODE_ISUPPER(ch) ||
3852 Py_UNICODE_ISTITLE(ch))
3853 previous_is_cased = 1;
3854 else
3855 previous_is_cased = 0;
3856 }
3857 return 1;
3858}
3859
3860PyObject *PyUnicode_Join(PyObject *separator,
3861 PyObject *seq)
3862{
3863 Py_UNICODE *sep;
3864 int seplen;
3865 PyUnicodeObject *res = NULL;
3866 int reslen = 0;
3867 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 int sz = 100;
3869 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003870 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871
Tim Peters2cfe3682001-05-05 05:36:48 +00003872 it = PyObject_GetIter(seq);
3873 if (it == NULL)
3874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
3876 if (separator == NULL) {
3877 Py_UNICODE blank = ' ';
3878 sep = &blank;
3879 seplen = 1;
3880 }
3881 else {
3882 separator = PyUnicode_FromObject(separator);
3883 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 sep = PyUnicode_AS_UNICODE(separator);
3886 seplen = PyUnicode_GET_SIZE(separator);
3887 }
3888
3889 res = _PyUnicode_New(sz);
3890 if (res == NULL)
3891 goto onError;
3892 p = PyUnicode_AS_UNICODE(res);
3893 reslen = 0;
3894
Tim Peters2cfe3682001-05-05 05:36:48 +00003895 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003897 PyObject *item = PyIter_Next(it);
3898 if (item == NULL) {
3899 if (PyErr_Occurred())
3900 goto onError;
3901 break;
3902 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 if (!PyUnicode_Check(item)) {
3904 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003905 if (!PyString_Check(item)) {
3906 PyErr_Format(PyExc_TypeError,
3907 "sequence item %i: expected string or Unicode,"
3908 " %.80s found",
3909 i, item->ob_type->tp_name);
3910 Py_DECREF(item);
3911 goto onError;
3912 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 v = PyUnicode_FromObject(item);
3914 Py_DECREF(item);
3915 item = v;
3916 if (item == NULL)
3917 goto onError;
3918 }
3919 itemlen = PyUnicode_GET_SIZE(item);
3920 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003921 if (_PyUnicode_Resize(&res, sz*2)) {
3922 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 sz *= 2;
3926 p = PyUnicode_AS_UNICODE(res) + reslen;
3927 }
3928 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003929 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 p += seplen;
3931 reslen += seplen;
3932 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003933 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 p += itemlen;
3935 reslen += itemlen;
3936 Py_DECREF(item);
3937 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003938 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 goto onError;
3940
3941 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003942 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 return (PyObject *)res;
3944
3945 onError:
3946 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003947 Py_XDECREF(res);
3948 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 return NULL;
3950}
3951
3952static
3953PyUnicodeObject *pad(PyUnicodeObject *self,
3954 int left,
3955 int right,
3956 Py_UNICODE fill)
3957{
3958 PyUnicodeObject *u;
3959
3960 if (left < 0)
3961 left = 0;
3962 if (right < 0)
3963 right = 0;
3964
Tim Peters7a29bd52001-09-12 03:03:31 +00003965 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 Py_INCREF(self);
3967 return self;
3968 }
3969
3970 u = _PyUnicode_New(left + self->length + right);
3971 if (u) {
3972 if (left)
3973 Py_UNICODE_FILL(u->str, fill, left);
3974 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3975 if (right)
3976 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3977 }
3978
3979 return u;
3980}
3981
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch
   variable, a `list` object and an `onError` label; control jumps to
   onError if the slice cannot be created or appended.  The body is
   wrapped in do { } while (0) and the arguments are parenthesized so
   the macro behaves like a single statement for any argument
   expression. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3992
3993static
3994PyObject *split_whitespace(PyUnicodeObject *self,
3995 PyObject *list,
3996 int maxcount)
3997{
3998 register int i;
3999 register int j;
4000 int len = self->length;
4001 PyObject *str;
4002
4003 for (i = j = 0; i < len; ) {
4004 /* find a token */
4005 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4006 i++;
4007 j = i;
4008 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4009 i++;
4010 if (j < i) {
4011 if (maxcount-- <= 0)
4012 break;
4013 SPLIT_APPEND(self->str, j, i);
4014 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4015 i++;
4016 j = i;
4017 }
4018 }
4019 if (j < len) {
4020 SPLIT_APPEND(self->str, j, len);
4021 }
4022 return list;
4023
4024 onError:
4025 Py_DECREF(list);
4026 return NULL;
4027}
4028
4029PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004030 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031{
4032 register int i;
4033 register int j;
4034 int len;
4035 PyObject *list;
4036 PyObject *str;
4037 Py_UNICODE *data;
4038
4039 string = PyUnicode_FromObject(string);
4040 if (string == NULL)
4041 return NULL;
4042 data = PyUnicode_AS_UNICODE(string);
4043 len = PyUnicode_GET_SIZE(string);
4044
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 list = PyList_New(0);
4046 if (!list)
4047 goto onError;
4048
4049 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004050 int eol;
4051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 /* Find a line and append it */
4053 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4054 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055
4056 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004057 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 if (i < len) {
4059 if (data[i] == '\r' && i + 1 < len &&
4060 data[i+1] == '\n')
4061 i += 2;
4062 else
4063 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004064 if (keepends)
4065 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 }
Guido van Rossum86662912000-04-11 15:38:46 +00004067 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 j = i;
4069 }
4070 if (j < len) {
4071 SPLIT_APPEND(data, j, len);
4072 }
4073
4074 Py_DECREF(string);
4075 return list;
4076
4077 onError:
4078 Py_DECREF(list);
4079 Py_DECREF(string);
4080 return NULL;
4081}
4082
4083static
4084PyObject *split_char(PyUnicodeObject *self,
4085 PyObject *list,
4086 Py_UNICODE ch,
4087 int maxcount)
4088{
4089 register int i;
4090 register int j;
4091 int len = self->length;
4092 PyObject *str;
4093
4094 for (i = j = 0; i < len; ) {
4095 if (self->str[i] == ch) {
4096 if (maxcount-- <= 0)
4097 break;
4098 SPLIT_APPEND(self->str, j, i);
4099 i = j = i + 1;
4100 } else
4101 i++;
4102 }
4103 if (j <= len) {
4104 SPLIT_APPEND(self->str, j, len);
4105 }
4106 return list;
4107
4108 onError:
4109 Py_DECREF(list);
4110 return NULL;
4111}
4112
4113static
4114PyObject *split_substring(PyUnicodeObject *self,
4115 PyObject *list,
4116 PyUnicodeObject *substring,
4117 int maxcount)
4118{
4119 register int i;
4120 register int j;
4121 int len = self->length;
4122 int sublen = substring->length;
4123 PyObject *str;
4124
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004125 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 if (Py_UNICODE_MATCH(self, i, substring)) {
4127 if (maxcount-- <= 0)
4128 break;
4129 SPLIT_APPEND(self->str, j, i);
4130 i = j = i + sublen;
4131 } else
4132 i++;
4133 }
4134 if (j <= len) {
4135 SPLIT_APPEND(self->str, j, len);
4136 }
4137 return list;
4138
4139 onError:
4140 Py_DECREF(list);
4141 return NULL;
4142}
4143
4144#undef SPLIT_APPEND
4145
4146static
4147PyObject *split(PyUnicodeObject *self,
4148 PyUnicodeObject *substring,
4149 int maxcount)
4150{
4151 PyObject *list;
4152
4153 if (maxcount < 0)
4154 maxcount = INT_MAX;
4155
4156 list = PyList_New(0);
4157 if (!list)
4158 return NULL;
4159
4160 if (substring == NULL)
4161 return split_whitespace(self,list,maxcount);
4162
4163 else if (substring->length == 1)
4164 return split_char(self,list,substring->str[0],maxcount);
4165
4166 else if (substring->length == 0) {
4167 Py_DECREF(list);
4168 PyErr_SetString(PyExc_ValueError, "empty separator");
4169 return NULL;
4170 }
4171 else
4172 return split_substring(self,list,substring,maxcount);
4173}
4174
4175static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176PyObject *replace(PyUnicodeObject *self,
4177 PyUnicodeObject *str1,
4178 PyUnicodeObject *str2,
4179 int maxcount)
4180{
4181 PyUnicodeObject *u;
4182
4183 if (maxcount < 0)
4184 maxcount = INT_MAX;
4185
4186 if (str1->length == 1 && str2->length == 1) {
4187 int i;
4188
4189 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004190 if (!findchar(self->str, self->length, str1->str[0]) &&
4191 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 /* nothing to replace, return original string */
4193 Py_INCREF(self);
4194 u = self;
4195 } else {
4196 Py_UNICODE u1 = str1->str[0];
4197 Py_UNICODE u2 = str2->str[0];
4198
4199 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004200 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 self->length
4202 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004203 if (u != NULL) {
4204 Py_UNICODE_COPY(u->str, self->str,
4205 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 for (i = 0; i < u->length; i++)
4207 if (u->str[i] == u1) {
4208 if (--maxcount < 0)
4209 break;
4210 u->str[i] = u2;
4211 }
4212 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214
4215 } else {
4216 int n, i;
4217 Py_UNICODE *p;
4218
4219 /* replace strings */
4220 n = count(self, 0, self->length, str1);
4221 if (n > maxcount)
4222 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004223 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004225 if (PyUnicode_CheckExact(self)) {
4226 Py_INCREF(self);
4227 u = self;
4228 }
4229 else {
4230 u = (PyUnicodeObject *)
4231 PyUnicode_FromUnicode(self->str, self->length);
4232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 } else {
4234 u = _PyUnicode_New(
4235 self->length + n * (str2->length - str1->length));
4236 if (u) {
4237 i = 0;
4238 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004239 if (str1->length > 0) {
4240 while (i <= self->length - str1->length)
4241 if (Py_UNICODE_MATCH(self, i, str1)) {
4242 /* replace string segment */
4243 Py_UNICODE_COPY(p, str2->str, str2->length);
4244 p += str2->length;
4245 i += str1->length;
4246 if (--n <= 0) {
4247 /* copy remaining part */
4248 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4249 break;
4250 }
4251 } else
4252 *p++ = self->str[i++];
4253 } else {
4254 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 Py_UNICODE_COPY(p, str2->str, str2->length);
4256 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004257 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004260 }
4261 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 }
4264 }
4265 }
4266
4267 return (PyObject *) u;
4268}
4269
4270/* --- Unicode Object Methods --------------------------------------------- */
4271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004272PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273"S.title() -> unicode\n\
4274\n\
4275Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004276characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277
4278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004279unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 return fixup(self, fixtitle);
4282}
4283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004284PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285"S.capitalize() -> unicode\n\
4286\n\
4287Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004288have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289
4290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004291unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 return fixup(self, fixcapitalize);
4294}
4295
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation of S.capwords(): split on whitespace,
   capitalize every word, and join the words with single blanks.
   Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004334PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335"S.center(width) -> unicode\n\
4336\n\
4337Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004338using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339
4340static PyObject *
4341unicode_center(PyUnicodeObject *self, PyObject *args)
4342{
4343 int marg, left;
4344 int width;
4345
4346 if (!PyArg_ParseTuple(args, "i:center", &width))
4347 return NULL;
4348
Tim Peters7a29bd52001-09-12 03:03:31 +00004349 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 Py_INCREF(self);
4351 return (PyObject*) self;
4352 }
4353
4354 marg = width - self->length;
4355 left = marg / 2 + (marg & width & 1);
4356
4357 return (PyObject*) pad(self, left, marg - left, ' ');
4358}
4359
Marc-André Lemburge5034372000-08-08 08:04:29 +00004360#if 0
4361
4362/* This code should go into some future Unicode collation support
4363 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004364 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004365
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004366/* speedy UTF-16 code point order comparison */
4367/* gleaned from: */
4368/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4369
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004370static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004371{
4372 0, 0, 0, 0, 0, 0, 0, 0,
4373 0, 0, 0, 0, 0, 0, 0, 0,
4374 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004375 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004376};
4377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378static int
4379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4380{
4381 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 Py_UNICODE *s1 = str1->str;
4384 Py_UNICODE *s2 = str2->str;
4385
4386 len1 = str1->length;
4387 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004388
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004390 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004391
4392 c1 = *s1++;
4393 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004394
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004395 if (c1 > (1<<11) * 26)
4396 c1 += utf16Fixup[c1>>11];
4397 if (c2 > (1<<11) * 26)
4398 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004399 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004400
4401 if (c1 != c2)
4402 return (c1 < c2) ? -1 : 1;
4403
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004404 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 }
4406
4407 return (len1 < len2) ? -1 : (len1 != len2);
4408}
4409
Marc-André Lemburge5034372000-08-08 08:04:29 +00004410#else
4411
4412static int
4413unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4414{
4415 register int len1, len2;
4416
4417 Py_UNICODE *s1 = str1->str;
4418 Py_UNICODE *s2 = str2->str;
4419
4420 len1 = str1->length;
4421 len2 = str2->length;
4422
4423 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004424 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004425
Fredrik Lundh45714e92001-06-26 16:39:36 +00004426 c1 = *s1++;
4427 c2 = *s2++;
4428
4429 if (c1 != c2)
4430 return (c1 < c2) ? -1 : 1;
4431
Marc-André Lemburge5034372000-08-08 08:04:29 +00004432 len1--; len2--;
4433 }
4434
4435 return (len1 < len2) ? -1 : (len1 != len2);
4436}
4437
4438#endif
4439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440int PyUnicode_Compare(PyObject *left,
4441 PyObject *right)
4442{
4443 PyUnicodeObject *u = NULL, *v = NULL;
4444 int result;
4445
4446 /* Coerce the two arguments */
4447 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4448 if (u == NULL)
4449 goto onError;
4450 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4451 if (v == NULL)
4452 goto onError;
4453
Thomas Wouters7e474022000-07-16 12:04:32 +00004454 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 if (v == u) {
4456 Py_DECREF(u);
4457 Py_DECREF(v);
4458 return 0;
4459 }
4460
4461 result = unicode_compare(u, v);
4462
4463 Py_DECREF(u);
4464 Py_DECREF(v);
4465 return result;
4466
4467onError:
4468 Py_XDECREF(u);
4469 Py_XDECREF(v);
4470 return -1;
4471}
4472
Guido van Rossum403d68b2000-03-13 15:55:09 +00004473int PyUnicode_Contains(PyObject *container,
4474 PyObject *element)
4475{
4476 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004477 int result, size;
4478 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004479
4480 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004481 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004482 if (v == NULL) {
4483 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004484 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004485 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004486 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004487 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004488 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004489 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004490
Barry Warsaw817918c2002-08-06 16:58:21 +00004491 size = PyUnicode_GET_SIZE(v);
4492 rhs = PyUnicode_AS_UNICODE(v);
4493 lhs = PyUnicode_AS_UNICODE(u);
4494
Guido van Rossum403d68b2000-03-13 15:55:09 +00004495 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004496 if (size == 1) {
4497 end = lhs + PyUnicode_GET_SIZE(u);
4498 while (lhs < end) {
4499 if (*lhs++ == *rhs) {
4500 result = 1;
4501 break;
4502 }
4503 }
4504 }
4505 else {
4506 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4507 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004508 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004509 result = 1;
4510 break;
4511 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004512 }
4513 }
4514
4515 Py_DECREF(u);
4516 Py_DECREF(v);
4517 return result;
4518
4519onError:
4520 Py_XDECREF(u);
4521 Py_XDECREF(v);
4522 return -1;
4523}
4524
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525/* Concat to string or Unicode object giving a new Unicode object. */
4526
4527PyObject *PyUnicode_Concat(PyObject *left,
4528 PyObject *right)
4529{
4530 PyUnicodeObject *u = NULL, *v = NULL, *w;
4531
4532 /* Coerce the two arguments */
4533 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4534 if (u == NULL)
4535 goto onError;
4536 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4537 if (v == NULL)
4538 goto onError;
4539
4540 /* Shortcuts */
4541 if (v == unicode_empty) {
4542 Py_DECREF(v);
4543 return (PyObject *)u;
4544 }
4545 if (u == unicode_empty) {
4546 Py_DECREF(u);
4547 return (PyObject *)v;
4548 }
4549
4550 /* Concat the two Unicode strings */
4551 w = _PyUnicode_New(u->length + v->length);
4552 if (w == NULL)
4553 goto onError;
4554 Py_UNICODE_COPY(w->str, u->str, u->length);
4555 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4556
4557 Py_DECREF(u);
4558 Py_DECREF(v);
4559 return (PyObject *)w;
4560
4561onError:
4562 Py_XDECREF(u);
4563 Py_XDECREF(v);
4564 return NULL;
4565}
4566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004567PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568"S.count(sub[, start[, end]]) -> int\n\
4569\n\
4570Return the number of occurrences of substring sub in Unicode string\n\
4571S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004572interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573
4574static PyObject *
4575unicode_count(PyUnicodeObject *self, PyObject *args)
4576{
4577 PyUnicodeObject *substring;
4578 int start = 0;
4579 int end = INT_MAX;
4580 PyObject *result;
4581
Guido van Rossumb8872e62000-05-09 14:14:27 +00004582 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 return NULL;
4585
4586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4587 (PyObject *)substring);
4588 if (substring == NULL)
4589 return NULL;
4590
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 if (start < 0)
4592 start += self->length;
4593 if (start < 0)
4594 start = 0;
4595 if (end > self->length)
4596 end = self->length;
4597 if (end < 0)
4598 end += self->length;
4599 if (end < 0)
4600 end = 0;
4601
4602 result = PyInt_FromLong((long) count(self, start, end, substring));
4603
4604 Py_DECREF(substring);
4605 return result;
4606}
4607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004608PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609"S.encode([encoding[,errors]]) -> string\n\
4610\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004611Return an encoded string version of S. Default encoding is the current\n\
4612default string encoding. errors may be given to set a different error\n\
4613handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4615'xmlcharrefreplace' as well as any other name registered with\n\
4616codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617
4618static PyObject *
4619unicode_encode(PyUnicodeObject *self, PyObject *args)
4620{
4621 char *encoding = NULL;
4622 char *errors = NULL;
4623 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4624 return NULL;
4625 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4626}
4627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004628PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629"S.expandtabs([tabsize]) -> unicode\n\
4630\n\
4631Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004632If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
4634static PyObject*
4635unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4636{
4637 Py_UNICODE *e;
4638 Py_UNICODE *p;
4639 Py_UNICODE *q;
4640 int i, j;
4641 PyUnicodeObject *u;
4642 int tabsize = 8;
4643
4644 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4645 return NULL;
4646
Thomas Wouters7e474022000-07-16 12:04:32 +00004647 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 i = j = 0;
4649 e = self->str + self->length;
4650 for (p = self->str; p < e; p++)
4651 if (*p == '\t') {
4652 if (tabsize > 0)
4653 j += tabsize - (j % tabsize);
4654 }
4655 else {
4656 j++;
4657 if (*p == '\n' || *p == '\r') {
4658 i += j;
4659 j = 0;
4660 }
4661 }
4662
4663 /* Second pass: create output string and fill it */
4664 u = _PyUnicode_New(i + j);
4665 if (!u)
4666 return NULL;
4667
4668 j = 0;
4669 q = u->str;
4670
4671 for (p = self->str; p < e; p++)
4672 if (*p == '\t') {
4673 if (tabsize > 0) {
4674 i = tabsize - (j % tabsize);
4675 j += i;
4676 while (i--)
4677 *q++ = ' ';
4678 }
4679 }
4680 else {
4681 j++;
4682 *q++ = *p;
4683 if (*p == '\n' || *p == '\r')
4684 j = 0;
4685 }
4686
4687 return (PyObject*) u;
4688}
4689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004690PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691"S.find(sub [,start [,end]]) -> int\n\
4692\n\
4693Return the lowest index in S where substring sub is found,\n\
4694such that sub is contained within s[start,end]. Optional\n\
4695arguments start and end are interpreted as in slice notation.\n\
4696\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004697Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698
4699static PyObject *
4700unicode_find(PyUnicodeObject *self, PyObject *args)
4701{
4702 PyUnicodeObject *substring;
4703 int start = 0;
4704 int end = INT_MAX;
4705 PyObject *result;
4706
Guido van Rossumb8872e62000-05-09 14:14:27 +00004707 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4708 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 return NULL;
4710 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4711 (PyObject *)substring);
4712 if (substring == NULL)
4713 return NULL;
4714
4715 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4716
4717 Py_DECREF(substring);
4718 return result;
4719}
4720
4721static PyObject *
4722unicode_getitem(PyUnicodeObject *self, int index)
4723{
4724 if (index < 0 || index >= self->length) {
4725 PyErr_SetString(PyExc_IndexError, "string index out of range");
4726 return NULL;
4727 }
4728
4729 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4730}
4731
4732static long
4733unicode_hash(PyUnicodeObject *self)
4734{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004735 /* Since Unicode objects compare equal to their ASCII string
4736 counterparts, they should use the individual character values
4737 as basis for their hash value. This is needed to assure that
4738 strings and Unicode objects behave in the same way as
4739 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740
Fredrik Lundhdde61642000-07-10 18:27:47 +00004741 register int len;
4742 register Py_UNICODE *p;
4743 register long x;
4744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 if (self->hash != -1)
4746 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004747 len = PyUnicode_GET_SIZE(self);
4748 p = PyUnicode_AS_UNICODE(self);
4749 x = *p << 7;
4750 while (--len >= 0)
4751 x = (1000003*x) ^ *p++;
4752 x ^= PyUnicode_GET_SIZE(self);
4753 if (x == -1)
4754 x = -2;
4755 self->hash = x;
4756 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757}
4758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004759PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760"S.index(sub [,start [,end]]) -> int\n\
4761\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004762Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
4764static PyObject *
4765unicode_index(PyUnicodeObject *self, PyObject *args)
4766{
4767 int result;
4768 PyUnicodeObject *substring;
4769 int start = 0;
4770 int end = INT_MAX;
4771
Guido van Rossumb8872e62000-05-09 14:14:27 +00004772 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4773 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return NULL;
4775
4776 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4777 (PyObject *)substring);
4778 if (substring == NULL)
4779 return NULL;
4780
4781 result = findstring(self, substring, start, end, 1);
4782
4783 Py_DECREF(substring);
4784 if (result < 0) {
4785 PyErr_SetString(PyExc_ValueError, "substring not found");
4786 return NULL;
4787 }
4788 return PyInt_FromLong(result);
4789}
4790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004791PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004792"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004794Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004795at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796
4797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004798unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799{
4800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4801 register const Py_UNICODE *e;
4802 int cased;
4803
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 /* Shortcut for single character strings */
4805 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004806 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004808 /* Special case for empty strings */
4809 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004810 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004811
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 e = p + PyUnicode_GET_SIZE(self);
4813 cased = 0;
4814 for (; p < e; p++) {
4815 register const Py_UNICODE ch = *p;
4816
4817 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004818 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 else if (!cased && Py_UNICODE_ISLOWER(ch))
4820 cased = 1;
4821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823}
4824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004825PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004826"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004828Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004829at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830
4831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004832unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833{
4834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4835 register const Py_UNICODE *e;
4836 int cased;
4837
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 /* Shortcut for single character strings */
4839 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004840 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004842 /* Special case for empty strings */
4843 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004844 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 e = p + PyUnicode_GET_SIZE(self);
4847 cased = 0;
4848 for (; p < e; p++) {
4849 register const Py_UNICODE ch = *p;
4850
4851 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004852 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 else if (!cased && Py_UNICODE_ISUPPER(ch))
4854 cased = 1;
4855 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004856 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857}
4858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004859PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004860"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004862Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4863characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004864only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
4866static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004867unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868{
4869 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4870 register const Py_UNICODE *e;
4871 int cased, previous_is_cased;
4872
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 /* Shortcut for single character strings */
4874 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004875 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4876 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004878 /* Special case for empty strings */
4879 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004880 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004881
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 e = p + PyUnicode_GET_SIZE(self);
4883 cased = 0;
4884 previous_is_cased = 0;
4885 for (; p < e; p++) {
4886 register const Py_UNICODE ch = *p;
4887
4888 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4889 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004890 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 previous_is_cased = 1;
4892 cased = 1;
4893 }
4894 else if (Py_UNICODE_ISLOWER(ch)) {
4895 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004896 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 previous_is_cased = 1;
4898 cased = 1;
4899 }
4900 else
4901 previous_is_cased = 0;
4902 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004903 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904}
4905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004906PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004907"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004909Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004910False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911
4912static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004913unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
4915 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4916 register const Py_UNICODE *e;
4917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 /* Shortcut for single character strings */
4919 if (PyUnicode_GET_SIZE(self) == 1 &&
4920 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004921 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004923 /* Special case for empty strings */
4924 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004925 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004926
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 e = p + PyUnicode_GET_SIZE(self);
4928 for (; p < e; p++) {
4929 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004930 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004935PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004936"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004937\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004938Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004939and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004940
4941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004942unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004943{
4944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4945 register const Py_UNICODE *e;
4946
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004947 /* Shortcut for single character strings */
4948 if (PyUnicode_GET_SIZE(self) == 1 &&
4949 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004950 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004951
4952 /* Special case for empty strings */
4953 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004954 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004955
4956 e = p + PyUnicode_GET_SIZE(self);
4957 for (; p < e; p++) {
4958 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004959 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004960 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004961 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004962}
4963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004964PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004965"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004966\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004967Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004968and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004969
4970static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004971unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004972{
4973 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4974 register const Py_UNICODE *e;
4975
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004976 /* Shortcut for single character strings */
4977 if (PyUnicode_GET_SIZE(self) == 1 &&
4978 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004979 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004980
4981 /* Special case for empty strings */
4982 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004983 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004984
4985 e = p + PyUnicode_GET_SIZE(self);
4986 for (; p < e; p++) {
4987 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004988 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004989 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004990 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004991}
4992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004993PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004994"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004996Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004997False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998
4999static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005000unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001{
5002 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5003 register const Py_UNICODE *e;
5004
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 /* Shortcut for single character strings */
5006 if (PyUnicode_GET_SIZE(self) == 1 &&
5007 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005008 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005010 /* Special case for empty strings */
5011 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005013
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 e = p + PyUnicode_GET_SIZE(self);
5015 for (; p < e; p++) {
5016 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005017 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005019 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020}
5021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005022PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005023"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005025Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005026False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027
5028static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005029unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030{
5031 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5032 register const Py_UNICODE *e;
5033
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 /* Shortcut for single character strings */
5035 if (PyUnicode_GET_SIZE(self) == 1 &&
5036 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005037 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005039 /* Special case for empty strings */
5040 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005041 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005042
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 e = p + PyUnicode_GET_SIZE(self);
5044 for (; p < e; p++) {
5045 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005046 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005048 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049}
5050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005051PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005052"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005054Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005055False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056
5057static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005058unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059{
5060 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5061 register const Py_UNICODE *e;
5062
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 /* Shortcut for single character strings */
5064 if (PyUnicode_GET_SIZE(self) == 1 &&
5065 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005066 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005068 /* Special case for empty strings */
5069 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005070 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005071
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 e = p + PyUnicode_GET_SIZE(self);
5073 for (; p < e; p++) {
5074 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005075 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005077 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078}
5079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005080PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081"S.join(sequence) -> unicode\n\
5082\n\
5083Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085
5086static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005087unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005089 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090}
5091
5092static int
5093unicode_length(PyUnicodeObject *self)
5094{
5095 return self->length;
5096}
5097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005098PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099"S.ljust(width) -> unicode\n\
5100\n\
5101Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005102done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103
5104static PyObject *
5105unicode_ljust(PyUnicodeObject *self, PyObject *args)
5106{
5107 int width;
5108 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5109 return NULL;
5110
Tim Peters7a29bd52001-09-12 03:03:31 +00005111 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 Py_INCREF(self);
5113 return (PyObject*) self;
5114 }
5115
5116 return (PyObject*) pad(self, 0, width - self->length, ' ');
5117}
5118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005119PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120"S.lower() -> unicode\n\
5121\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005122Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123
5124static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005125unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 return fixup(self, fixlower);
5128}
5129
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5138
5139static const Py_UNICODE *
5140unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5141{
Tim Peters030a5ce2002-04-22 19:00:10 +00005142 size_t i;
5143 for (i = 0; i < n; ++i)
5144 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005145 return s+i;
5146 return NULL;
5147}
5148
5149/* externally visible for str.strip(unicode) */
5150PyObject *
5151_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5152{
5153 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5154 int len = PyUnicode_GET_SIZE(self);
5155 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5156 int seplen = PyUnicode_GET_SIZE(sepobj);
5157 int i, j;
5158
5159 i = 0;
5160 if (striptype != RIGHTSTRIP) {
5161 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5162 i++;
5163 }
5164 }
5165
5166 j = len;
5167 if (striptype != LEFTSTRIP) {
5168 do {
5169 j--;
5170 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5171 j++;
5172 }
5173
5174 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5175 Py_INCREF(self);
5176 return (PyObject*)self;
5177 }
5178 else
5179 return PyUnicode_FromUnicode(s+i, j-i);
5180}
5181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182
5183static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005184do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005186 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5187 int len = PyUnicode_GET_SIZE(self), i, j;
5188
5189 i = 0;
5190 if (striptype != RIGHTSTRIP) {
5191 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5192 i++;
5193 }
5194 }
5195
5196 j = len;
5197 if (striptype != LEFTSTRIP) {
5198 do {
5199 j--;
5200 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5201 j++;
5202 }
5203
5204 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5205 Py_INCREF(self);
5206 return (PyObject*)self;
5207 }
5208 else
5209 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210}
5211
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005212
5213static PyObject *
5214do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5215{
5216 PyObject *sep = NULL;
5217
5218 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5219 return NULL;
5220
5221 if (sep != NULL && sep != Py_None) {
5222 if (PyUnicode_Check(sep))
5223 return _PyUnicode_XStrip(self, striptype, sep);
5224 else if (PyString_Check(sep)) {
5225 PyObject *res;
5226 sep = PyUnicode_FromObject(sep);
5227 if (sep==NULL)
5228 return NULL;
5229 res = _PyUnicode_XStrip(self, striptype, sep);
5230 Py_DECREF(sep);
5231 return res;
5232 }
5233 else {
5234 PyErr_Format(PyExc_TypeError,
5235 "%s arg must be None, unicode or str",
5236 STRIPNAME(striptype));
5237 return NULL;
5238 }
5239 }
5240
5241 return do_strip(self, striptype);
5242}
5243
5244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005245PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005246"S.strip([sep]) -> unicode\n\
5247\n\
5248Return a copy of the string S with leading and trailing\n\
5249whitespace removed.\n\
5250If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005251If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005252
5253static PyObject *
5254unicode_strip(PyUnicodeObject *self, PyObject *args)
5255{
5256 if (PyTuple_GET_SIZE(args) == 0)
5257 return do_strip(self, BOTHSTRIP); /* Common case */
5258 else
5259 return do_argstrip(self, BOTHSTRIP, args);
5260}
5261
5262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005263PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005264"S.lstrip([sep]) -> unicode\n\
5265\n\
5266Return a copy of the string S with leading whitespace removed.\n\
5267If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005268If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005269
5270static PyObject *
5271unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5272{
5273 if (PyTuple_GET_SIZE(args) == 0)
5274 return do_strip(self, LEFTSTRIP); /* Common case */
5275 else
5276 return do_argstrip(self, LEFTSTRIP, args);
5277}
5278
5279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005280PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005281"S.rstrip([sep]) -> unicode\n\
5282\n\
5283Return a copy of the string S with trailing whitespace removed.\n\
5284If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005285If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005286
5287static PyObject *
5288unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5289{
5290 if (PyTuple_GET_SIZE(args) == 0)
5291 return do_strip(self, RIGHTSTRIP); /* Common case */
5292 else
5293 return do_argstrip(self, RIGHTSTRIP, args);
5294}
5295
5296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297static PyObject*
5298unicode_repeat(PyUnicodeObject *str, int len)
5299{
5300 PyUnicodeObject *u;
5301 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005302 int nchars;
5303 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305 if (len < 0)
5306 len = 0;
5307
Tim Peters7a29bd52001-09-12 03:03:31 +00005308 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 /* no repeat, return original string */
5310 Py_INCREF(str);
5311 return (PyObject*) str;
5312 }
Tim Peters8f422462000-09-09 06:13:41 +00005313
5314 /* ensure # of chars needed doesn't overflow int and # of bytes
5315 * needed doesn't overflow size_t
5316 */
5317 nchars = len * str->length;
5318 if (len && nchars / len != str->length) {
5319 PyErr_SetString(PyExc_OverflowError,
5320 "repeated string is too long");
5321 return NULL;
5322 }
5323 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5324 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5325 PyErr_SetString(PyExc_OverflowError,
5326 "repeated string is too long");
5327 return NULL;
5328 }
5329 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 if (!u)
5331 return NULL;
5332
5333 p = u->str;
5334
5335 while (len-- > 0) {
5336 Py_UNICODE_COPY(p, str->str, str->length);
5337 p += str->length;
5338 }
5339
5340 return (PyObject*) u;
5341}
5342
5343PyObject *PyUnicode_Replace(PyObject *obj,
5344 PyObject *subobj,
5345 PyObject *replobj,
5346 int maxcount)
5347{
5348 PyObject *self;
5349 PyObject *str1;
5350 PyObject *str2;
5351 PyObject *result;
5352
5353 self = PyUnicode_FromObject(obj);
5354 if (self == NULL)
5355 return NULL;
5356 str1 = PyUnicode_FromObject(subobj);
5357 if (str1 == NULL) {
5358 Py_DECREF(self);
5359 return NULL;
5360 }
5361 str2 = PyUnicode_FromObject(replobj);
5362 if (str2 == NULL) {
5363 Py_DECREF(self);
5364 Py_DECREF(str1);
5365 return NULL;
5366 }
5367 result = replace((PyUnicodeObject *)self,
5368 (PyUnicodeObject *)str1,
5369 (PyUnicodeObject *)str2,
5370 maxcount);
5371 Py_DECREF(self);
5372 Py_DECREF(str1);
5373 Py_DECREF(str2);
5374 return result;
5375}
5376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005377PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378"S.replace (old, new[, maxsplit]) -> unicode\n\
5379\n\
5380Return a copy of S with all occurrences of substring\n\
5381old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005382given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
5384static PyObject*
5385unicode_replace(PyUnicodeObject *self, PyObject *args)
5386{
5387 PyUnicodeObject *str1;
5388 PyUnicodeObject *str2;
5389 int maxcount = -1;
5390 PyObject *result;
5391
5392 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5393 return NULL;
5394 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5395 if (str1 == NULL)
5396 return NULL;
5397 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5398 if (str2 == NULL)
5399 return NULL;
5400
5401 result = replace(self, str1, str2, maxcount);
5402
5403 Py_DECREF(str1);
5404 Py_DECREF(str2);
5405 return result;
5406}
5407
5408static
5409PyObject *unicode_repr(PyObject *unicode)
5410{
5411 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5412 PyUnicode_GET_SIZE(unicode),
5413 1);
5414}
5415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005416PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417"S.rfind(sub [,start [,end]]) -> int\n\
5418\n\
5419Return the highest index in S where substring sub is found,\n\
5420such that sub is contained within s[start,end]. Optional\n\
5421arguments start and end are interpreted as in slice notation.\n\
5422\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005423Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424
5425static PyObject *
5426unicode_rfind(PyUnicodeObject *self, PyObject *args)
5427{
5428 PyUnicodeObject *substring;
5429 int start = 0;
5430 int end = INT_MAX;
5431 PyObject *result;
5432
Guido van Rossumb8872e62000-05-09 14:14:27 +00005433 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5434 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 return NULL;
5436 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5437 (PyObject *)substring);
5438 if (substring == NULL)
5439 return NULL;
5440
5441 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5442
5443 Py_DECREF(substring);
5444 return result;
5445}
5446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005447PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448"S.rindex(sub [,start [,end]]) -> int\n\
5449\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005450Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
5452static PyObject *
5453unicode_rindex(PyUnicodeObject *self, PyObject *args)
5454{
5455 int result;
5456 PyUnicodeObject *substring;
5457 int start = 0;
5458 int end = INT_MAX;
5459
Guido van Rossumb8872e62000-05-09 14:14:27 +00005460 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5461 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 return NULL;
5463 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5464 (PyObject *)substring);
5465 if (substring == NULL)
5466 return NULL;
5467
5468 result = findstring(self, substring, start, end, -1);
5469
5470 Py_DECREF(substring);
5471 if (result < 0) {
5472 PyErr_SetString(PyExc_ValueError, "substring not found");
5473 return NULL;
5474 }
5475 return PyInt_FromLong(result);
5476}
5477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005478PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479"S.rjust(width) -> unicode\n\
5480\n\
5481Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005482done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483
5484static PyObject *
5485unicode_rjust(PyUnicodeObject *self, PyObject *args)
5486{
5487 int width;
5488 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5489 return NULL;
5490
Tim Peters7a29bd52001-09-12 03:03:31 +00005491 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 Py_INCREF(self);
5493 return (PyObject*) self;
5494 }
5495
5496 return (PyObject*) pad(self, width - self->length, 0, ' ');
5497}
5498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499static PyObject*
5500unicode_slice(PyUnicodeObject *self, int start, int end)
5501{
5502 /* standard clamping */
5503 if (start < 0)
5504 start = 0;
5505 if (end < 0)
5506 end = 0;
5507 if (end > self->length)
5508 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005509 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 /* full slice, return original string */
5511 Py_INCREF(self);
5512 return (PyObject*) self;
5513 }
5514 if (start > end)
5515 start = end;
5516 /* copy slice */
5517 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5518 end - start);
5519}
5520
5521PyObject *PyUnicode_Split(PyObject *s,
5522 PyObject *sep,
5523 int maxsplit)
5524{
5525 PyObject *result;
5526
5527 s = PyUnicode_FromObject(s);
5528 if (s == NULL)
5529 return NULL;
5530 if (sep != NULL) {
5531 sep = PyUnicode_FromObject(sep);
5532 if (sep == NULL) {
5533 Py_DECREF(s);
5534 return NULL;
5535 }
5536 }
5537
5538 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5539
5540 Py_DECREF(s);
5541 Py_XDECREF(sep);
5542 return result;
5543}
5544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005545PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546"S.split([sep [,maxsplit]]) -> list of strings\n\
5547\n\
5548Return a list of the words in S, using sep as the\n\
5549delimiter string. If maxsplit is given, at most maxsplit\n\
5550splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005551is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552
5553static PyObject*
5554unicode_split(PyUnicodeObject *self, PyObject *args)
5555{
5556 PyObject *substring = Py_None;
5557 int maxcount = -1;
5558
5559 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5560 return NULL;
5561
5562 if (substring == Py_None)
5563 return split(self, NULL, maxcount);
5564 else if (PyUnicode_Check(substring))
5565 return split(self, (PyUnicodeObject *)substring, maxcount);
5566 else
5567 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5568}
5569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005570PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005571"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572\n\
5573Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005574Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005575is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
5577static PyObject*
5578unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5579{
Guido van Rossum86662912000-04-11 15:38:46 +00005580 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
Guido van Rossum86662912000-04-11 15:38:46 +00005582 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 return NULL;
5584
Guido van Rossum86662912000-04-11 15:38:46 +00005585 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586}
5587
5588static
5589PyObject *unicode_str(PyUnicodeObject *self)
5590{
Fred Drakee4315f52000-05-09 19:53:39 +00005591 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592}
5593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005594PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595"S.swapcase() -> unicode\n\
5596\n\
5597Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005598and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
5600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005601unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 return fixup(self, fixswapcase);
5604}
5605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005606PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607"S.translate(table) -> unicode\n\
5608\n\
5609Return a copy of the string S, where all characters have been mapped\n\
5610through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005611Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5612Unmapped characters are left untouched. Characters mapped to None\n\
5613are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
5615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005616unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 return PyUnicode_TranslateCharmap(self->str,
5619 self->length,
5620 table,
5621 "ignore");
5622}
5623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005624PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625"S.upper() -> unicode\n\
5626\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005627Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628
5629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005630unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 return fixup(self, fixupper);
5633}
5634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636"S.zfill(width) -> unicode\n\
5637\n\
5638Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005639of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
5641static PyObject *
5642unicode_zfill(PyUnicodeObject *self, PyObject *args)
5643{
5644 int fill;
5645 PyUnicodeObject *u;
5646
5647 int width;
5648 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5649 return NULL;
5650
5651 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005652 if (PyUnicode_CheckExact(self)) {
5653 Py_INCREF(self);
5654 return (PyObject*) self;
5655 }
5656 else
5657 return PyUnicode_FromUnicode(
5658 PyUnicode_AS_UNICODE(self),
5659 PyUnicode_GET_SIZE(self)
5660 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 }
5662
5663 fill = width - self->length;
5664
5665 u = pad(self, fill, 0, '0');
5666
Walter Dörwald068325e2002-04-15 13:36:47 +00005667 if (u == NULL)
5668 return NULL;
5669
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 if (u->str[fill] == '+' || u->str[fill] == '-') {
5671 /* move sign to beginning of string */
5672 u->str[0] = u->str[fill];
5673 u->str[fill] = '0';
5674 }
5675
5676 return (PyObject*) u;
5677}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005687PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005688"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005690Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
5694static PyObject *
5695unicode_startswith(PyUnicodeObject *self,
5696 PyObject *args)
5697{
5698 PyUnicodeObject *substring;
5699 int start = 0;
5700 int end = INT_MAX;
5701 PyObject *result;
5702
Guido van Rossumb8872e62000-05-09 14:14:27 +00005703 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5704 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5707 (PyObject *)substring);
5708 if (substring == NULL)
5709 return NULL;
5710
Guido van Rossum77f6a652002-04-03 22:41:51 +00005711 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
5713 Py_DECREF(substring);
5714 return result;
5715}
5716
5717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005718PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005719"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005721Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005723comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
5725static PyObject *
5726unicode_endswith(PyUnicodeObject *self,
5727 PyObject *args)
5728{
5729 PyUnicodeObject *substring;
5730 int start = 0;
5731 int end = INT_MAX;
5732 PyObject *result;
5733
Guido van Rossumb8872e62000-05-09 14:14:27 +00005734 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5735 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 return NULL;
5737 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5738 (PyObject *)substring);
5739 if (substring == NULL)
5740 return NULL;
5741
Guido van Rossum77f6a652002-04-03 22:41:51 +00005742 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743
5744 Py_DECREF(substring);
5745 return result;
5746}
5747
5748
5749static PyMethodDef unicode_methods[] = {
5750
5751 /* Order is according to common usage: often used methods should
5752 appear first, since lookup is done sequentially. */
5753
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005754 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5755 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5756 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5757 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5758 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5759 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5760 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5761 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5762 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5763 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5764 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5765 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5766 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005767 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005768/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5769 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5770 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5771 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005772 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005773 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005774 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005775 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5776 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5777 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5778 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5779 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5780 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5781 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5782 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5783 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5784 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5785 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5786 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5787 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5788 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005789 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005790#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005791 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792#endif
5793
5794#if 0
5795 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005796 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797#endif
5798
5799 {NULL, NULL}
5800};
5801
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005802static PyObject *
5803unicode_mod(PyObject *v, PyObject *w)
5804{
5805 if (!PyUnicode_Check(v)) {
5806 Py_INCREF(Py_NotImplemented);
5807 return Py_NotImplemented;
5808 }
5809 return PyUnicode_Format(v, w);
5810}
5811
5812static PyNumberMethods unicode_as_number = {
5813 0, /*nb_add*/
5814 0, /*nb_subtract*/
5815 0, /*nb_multiply*/
5816 0, /*nb_divide*/
5817 unicode_mod, /*nb_remainder*/
5818};
5819
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820static PySequenceMethods unicode_as_sequence = {
5821 (inquiry) unicode_length, /* sq_length */
5822 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5823 (intargfunc) unicode_repeat, /* sq_repeat */
5824 (intargfunc) unicode_getitem, /* sq_item */
5825 (intintargfunc) unicode_slice, /* sq_slice */
5826 0, /* sq_ass_item */
5827 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005828 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829};
5830
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005831static PyObject*
5832unicode_subscript(PyUnicodeObject* self, PyObject* item)
5833{
5834 if (PyInt_Check(item)) {
5835 long i = PyInt_AS_LONG(item);
5836 if (i < 0)
5837 i += PyString_GET_SIZE(self);
5838 return unicode_getitem(self, i);
5839 } else if (PyLong_Check(item)) {
5840 long i = PyLong_AsLong(item);
5841 if (i == -1 && PyErr_Occurred())
5842 return NULL;
5843 if (i < 0)
5844 i += PyString_GET_SIZE(self);
5845 return unicode_getitem(self, i);
5846 } else if (PySlice_Check(item)) {
5847 int start, stop, step, slicelength, cur, i;
5848 Py_UNICODE* source_buf;
5849 Py_UNICODE* result_buf;
5850 PyObject* result;
5851
5852 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5853 &start, &stop, &step, &slicelength) < 0) {
5854 return NULL;
5855 }
5856
5857 if (slicelength <= 0) {
5858 return PyUnicode_FromUnicode(NULL, 0);
5859 } else {
5860 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5861 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5862
5863 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5864 result_buf[i] = source_buf[cur];
5865 }
5866
5867 result = PyUnicode_FromUnicode(result_buf, slicelength);
5868 PyMem_FREE(result_buf);
5869 return result;
5870 }
5871 } else {
5872 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5873 return NULL;
5874 }
5875}
5876
5877static PyMappingMethods unicode_as_mapping = {
5878 (inquiry)unicode_length, /* mp_length */
5879 (binaryfunc)unicode_subscript, /* mp_subscript */
5880 (objobjargproc)0, /* mp_ass_subscript */
5881};
5882
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883static int
5884unicode_buffer_getreadbuf(PyUnicodeObject *self,
5885 int index,
5886 const void **ptr)
5887{
5888 if (index != 0) {
5889 PyErr_SetString(PyExc_SystemError,
5890 "accessing non-existent unicode segment");
5891 return -1;
5892 }
5893 *ptr = (void *) self->str;
5894 return PyUnicode_GET_DATA_SIZE(self);
5895}
5896
5897static int
5898unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5899 const void **ptr)
5900{
5901 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005902 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return -1;
5904}
5905
5906static int
5907unicode_buffer_getsegcount(PyUnicodeObject *self,
5908 int *lenp)
5909{
5910 if (lenp)
5911 *lenp = PyUnicode_GET_DATA_SIZE(self);
5912 return 1;
5913}
5914
5915static int
5916unicode_buffer_getcharbuf(PyUnicodeObject *self,
5917 int index,
5918 const void **ptr)
5919{
5920 PyObject *str;
5921
5922 if (index != 0) {
5923 PyErr_SetString(PyExc_SystemError,
5924 "accessing non-existent unicode segment");
5925 return -1;
5926 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005927 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 if (str == NULL)
5929 return -1;
5930 *ptr = (void *) PyString_AS_STRING(str);
5931 return PyString_GET_SIZE(str);
5932}
5933
5934/* Helpers for PyUnicode_Format() */
5935
5936static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005937getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
5939 int argidx = *p_argidx;
5940 if (argidx < arglen) {
5941 (*p_argidx)++;
5942 if (arglen < 0)
5943 return args;
5944 else
5945 return PyTuple_GetItem(args, argidx);
5946 }
5947 PyErr_SetString(PyExc_TypeError,
5948 "not enough arguments for format string");
5949 return NULL;
5950}
5951
/* Bit flags for format specifiers; each occupies its own bit so they
   can be OR-ed together and tested with '&'. */
#define F_LJUST 0x01    /* bit 0 */
#define F_SIGN  0x02    /* bit 1 */
#define F_BLANK 0x04    /* bit 2 */
#define F_ALT   0x08    /* bit 3 */
#define F_ZERO  0x10    /* bit 4 */
5957
5958static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960{
5961 register int i;
5962 int len;
5963 va_list va;
5964 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
5967 /* First, format the string as char array, then expand to Py_UNICODE
5968 array. */
5969 charbuffer = (char *)buffer;
5970 len = vsprintf(charbuffer, format, va);
5971 for (i = len - 1; i >= 0; i--)
5972 buffer[i] = (Py_UNICODE) charbuffer[i];
5973
5974 va_end(va);
5975 return len;
5976}
5977
Guido van Rossum078151d2002-08-11 04:24:12 +00005978/* XXX To save some code duplication, formatfloat/long/int could have been
5979 shared with stringobject.c, converting from 8-bit to Unicode after the
5980 formatting is done. */
5981
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982static int
5983formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005984 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 int flags,
5986 int prec,
5987 int type,
5988 PyObject *v)
5989{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005990 /* fmt = '%#.' + `prec` + `type`
5991 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 char fmt[20];
5993 double x;
5994
5995 x = PyFloat_AsDouble(v);
5996 if (x == -1.0 && PyErr_Occurred())
5997 return -1;
5998 if (prec < 0)
5999 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6001 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00006002 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6003 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006004 /* worst case length calc to ensure no buffer overrun:
6005 fmt = %#.<prec>g
6006 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6007 for any double rep.)
6008 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6009 If prec=0 the effective precision is 1 (the leading digit is
6010 always given), therefore increase by one to 10+prec. */
6011 if (buflen <= (size_t)10 + (size_t)prec) {
6012 PyErr_SetString(PyExc_OverflowError,
6013 "formatted float is too long (precision too long?)");
6014 return -1;
6015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 return usprintf(buf, fmt, x);
6017}
6018
Tim Peters38fd5b62000-09-21 05:43:11 +00006019static PyObject*
6020formatlong(PyObject *val, int flags, int prec, int type)
6021{
6022 char *buf;
6023 int i, len;
6024 PyObject *str; /* temporary string object. */
6025 PyUnicodeObject *result;
6026
6027 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6028 if (!str)
6029 return NULL;
6030 result = _PyUnicode_New(len);
6031 for (i = 0; i < len; i++)
6032 result->str[i] = buf[i];
6033 result->str[len] = 0;
6034 Py_DECREF(str);
6035 return (PyObject*)result;
6036}
6037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038static int
6039formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006040 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 int flags,
6042 int prec,
6043 int type,
6044 PyObject *v)
6045{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006046 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006047 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6048 * + 1 + 1
6049 * = 24
6050 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006051 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 long x;
6053
6054 x = PyInt_AsLong(v);
6055 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006056 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006057 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006058 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006059 "%u/%o/%x/%X of negative int will return "
6060 "a signed string in Python 2.4 and up") < 0)
6061 return -1;
6062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006064 prec = 1;
6065
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006066 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006067 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6068 */
6069 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006070 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006071 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006072 return -1;
6073 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006074
6075 if ((flags & F_ALT) &&
6076 (type == 'x' || type == 'X')) {
6077 /* When converting under %#x or %#X, there are a number
6078 * of issues that cause pain:
6079 * - when 0 is being converted, the C standard leaves off
6080 * the '0x' or '0X', which is inconsistent with other
6081 * %#x/%#X conversions and inconsistent with Python's
6082 * hex() function
6083 * - there are platforms that violate the standard and
6084 * convert 0 with the '0x' or '0X'
6085 * (Metrowerks, Compaq Tru64)
6086 * - there are platforms that give '0x' when converting
6087 * under %#X, but convert 0 in accordance with the
6088 * standard (OS/2 EMX)
6089 *
6090 * We can achieve the desired consistency by inserting our
6091 * own '0x' or '0X' prefix, and substituting %x/%X in place
6092 * of %#x/%#X.
6093 *
6094 * Note that this is the same approach as used in
6095 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006096 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006097 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6098 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006099 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006100 else {
6101 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6102 (flags&F_ALT) ? "#" : "",
6103 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 return usprintf(buf, fmt, x);
6106}
6107
6108static int
6109formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006110 size_t buflen,
6111 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006113 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006114 if (PyUnicode_Check(v)) {
6115 if (PyUnicode_GET_SIZE(v) != 1)
6116 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006118 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006120 else if (PyString_Check(v)) {
6121 if (PyString_GET_SIZE(v) != 1)
6122 goto onError;
6123 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
6126 else {
6127 /* Integer input truncated to a character */
6128 long x;
6129 x = PyInt_AsLong(v);
6130 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006131 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006132#ifdef Py_UNICODE_WIDE
6133 if (x < 0 || x > 0x10ffff) {
6134 PyErr_SetString(PyExc_ValueError,
6135 "%c arg not in range(0x110000) "
6136 "(wide Python build)");
6137 return -1;
6138 }
6139#else
6140 if (x < 0 || x > 0xffff) {
6141 PyErr_SetString(PyExc_ValueError,
6142 "%c arg not in range(0x10000) "
6143 "(narrow Python build)");
6144 return -1;
6145 }
6146#endif
6147 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
6149 buf[1] = '\0';
6150 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006151
6152 onError:
6153 PyErr_SetString(PyExc_TypeError,
6154 "%c requires int or char");
6155 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156}
6157
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006158/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6159
6160 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6161 chars are formatted. XXX This is a magic number. Each formatting
6162 routine does bounds checking to ensure no overflow, but a better
6163 solution may be to malloc a buffer of appropriate size for each
6164 format. For now, the current solution is sufficient.
6165*/
6166#define FORMATBUFLEN (size_t)120
6167
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168PyObject *PyUnicode_Format(PyObject *format,
6169 PyObject *args)
6170{
6171 Py_UNICODE *fmt, *res;
6172 int fmtcnt, rescnt, reslen, arglen, argidx;
6173 int args_owned = 0;
6174 PyUnicodeObject *result = NULL;
6175 PyObject *dict = NULL;
6176 PyObject *uformat;
6177
6178 if (format == NULL || args == NULL) {
6179 PyErr_BadInternalCall();
6180 return NULL;
6181 }
6182 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006183 if (uformat == NULL)
6184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 fmt = PyUnicode_AS_UNICODE(uformat);
6186 fmtcnt = PyUnicode_GET_SIZE(uformat);
6187
6188 reslen = rescnt = fmtcnt + 100;
6189 result = _PyUnicode_New(reslen);
6190 if (result == NULL)
6191 goto onError;
6192 res = PyUnicode_AS_UNICODE(result);
6193
6194 if (PyTuple_Check(args)) {
6195 arglen = PyTuple_Size(args);
6196 argidx = 0;
6197 }
6198 else {
6199 arglen = -1;
6200 argidx = -2;
6201 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006202 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6203 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 dict = args;
6205
6206 while (--fmtcnt >= 0) {
6207 if (*fmt != '%') {
6208 if (--rescnt < 0) {
6209 rescnt = fmtcnt + 100;
6210 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006211 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return NULL;
6213 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6214 --rescnt;
6215 }
6216 *res++ = *fmt++;
6217 }
6218 else {
6219 /* Got a format specifier */
6220 int flags = 0;
6221 int width = -1;
6222 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 Py_UNICODE c = '\0';
6224 Py_UNICODE fill;
6225 PyObject *v = NULL;
6226 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006227 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 Py_UNICODE sign;
6229 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006230 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231
6232 fmt++;
6233 if (*fmt == '(') {
6234 Py_UNICODE *keystart;
6235 int keylen;
6236 PyObject *key;
6237 int pcount = 1;
6238
6239 if (dict == NULL) {
6240 PyErr_SetString(PyExc_TypeError,
6241 "format requires a mapping");
6242 goto onError;
6243 }
6244 ++fmt;
6245 --fmtcnt;
6246 keystart = fmt;
6247 /* Skip over balanced parentheses */
6248 while (pcount > 0 && --fmtcnt >= 0) {
6249 if (*fmt == ')')
6250 --pcount;
6251 else if (*fmt == '(')
6252 ++pcount;
6253 fmt++;
6254 }
6255 keylen = fmt - keystart - 1;
6256 if (fmtcnt < 0 || pcount > 0) {
6257 PyErr_SetString(PyExc_ValueError,
6258 "incomplete format key");
6259 goto onError;
6260 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006261#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006262 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 then looked up since Python uses strings to hold
6264 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006265 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 key = PyUnicode_EncodeUTF8(keystart,
6267 keylen,
6268 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006269#else
6270 key = PyUnicode_FromUnicode(keystart, keylen);
6271#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 if (key == NULL)
6273 goto onError;
6274 if (args_owned) {
6275 Py_DECREF(args);
6276 args_owned = 0;
6277 }
6278 args = PyObject_GetItem(dict, key);
6279 Py_DECREF(key);
6280 if (args == NULL) {
6281 goto onError;
6282 }
6283 args_owned = 1;
6284 arglen = -1;
6285 argidx = -2;
6286 }
6287 while (--fmtcnt >= 0) {
6288 switch (c = *fmt++) {
6289 case '-': flags |= F_LJUST; continue;
6290 case '+': flags |= F_SIGN; continue;
6291 case ' ': flags |= F_BLANK; continue;
6292 case '#': flags |= F_ALT; continue;
6293 case '0': flags |= F_ZERO; continue;
6294 }
6295 break;
6296 }
6297 if (c == '*') {
6298 v = getnextarg(args, arglen, &argidx);
6299 if (v == NULL)
6300 goto onError;
6301 if (!PyInt_Check(v)) {
6302 PyErr_SetString(PyExc_TypeError,
6303 "* wants int");
6304 goto onError;
6305 }
6306 width = PyInt_AsLong(v);
6307 if (width < 0) {
6308 flags |= F_LJUST;
6309 width = -width;
6310 }
6311 if (--fmtcnt >= 0)
6312 c = *fmt++;
6313 }
6314 else if (c >= '0' && c <= '9') {
6315 width = c - '0';
6316 while (--fmtcnt >= 0) {
6317 c = *fmt++;
6318 if (c < '0' || c > '9')
6319 break;
6320 if ((width*10) / 10 != width) {
6321 PyErr_SetString(PyExc_ValueError,
6322 "width too big");
6323 goto onError;
6324 }
6325 width = width*10 + (c - '0');
6326 }
6327 }
6328 if (c == '.') {
6329 prec = 0;
6330 if (--fmtcnt >= 0)
6331 c = *fmt++;
6332 if (c == '*') {
6333 v = getnextarg(args, arglen, &argidx);
6334 if (v == NULL)
6335 goto onError;
6336 if (!PyInt_Check(v)) {
6337 PyErr_SetString(PyExc_TypeError,
6338 "* wants int");
6339 goto onError;
6340 }
6341 prec = PyInt_AsLong(v);
6342 if (prec < 0)
6343 prec = 0;
6344 if (--fmtcnt >= 0)
6345 c = *fmt++;
6346 }
6347 else if (c >= '0' && c <= '9') {
6348 prec = c - '0';
6349 while (--fmtcnt >= 0) {
6350 c = Py_CHARMASK(*fmt++);
6351 if (c < '0' || c > '9')
6352 break;
6353 if ((prec*10) / 10 != prec) {
6354 PyErr_SetString(PyExc_ValueError,
6355 "prec too big");
6356 goto onError;
6357 }
6358 prec = prec*10 + (c - '0');
6359 }
6360 }
6361 } /* prec */
6362 if (fmtcnt >= 0) {
6363 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 if (--fmtcnt >= 0)
6365 c = *fmt++;
6366 }
6367 }
6368 if (fmtcnt < 0) {
6369 PyErr_SetString(PyExc_ValueError,
6370 "incomplete format");
6371 goto onError;
6372 }
6373 if (c != '%') {
6374 v = getnextarg(args, arglen, &argidx);
6375 if (v == NULL)
6376 goto onError;
6377 }
6378 sign = 0;
6379 fill = ' ';
6380 switch (c) {
6381
6382 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006383 pbuf = formatbuf;
6384 /* presume that buffer length is at least 1 */
6385 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 len = 1;
6387 break;
6388
6389 case 's':
6390 case 'r':
6391 if (PyUnicode_Check(v) && c == 's') {
6392 temp = v;
6393 Py_INCREF(temp);
6394 }
6395 else {
6396 PyObject *unicode;
6397 if (c == 's')
6398 temp = PyObject_Str(v);
6399 else
6400 temp = PyObject_Repr(v);
6401 if (temp == NULL)
6402 goto onError;
6403 if (!PyString_Check(temp)) {
6404 /* XXX Note: this should never happen, since
6405 PyObject_Repr() and PyObject_Str() assure
6406 this */
6407 Py_DECREF(temp);
6408 PyErr_SetString(PyExc_TypeError,
6409 "%s argument has non-string str()");
6410 goto onError;
6411 }
Fred Drakee4315f52000-05-09 19:53:39 +00006412 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006414 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 "strict");
6416 Py_DECREF(temp);
6417 temp = unicode;
6418 if (temp == NULL)
6419 goto onError;
6420 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006421 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 len = PyUnicode_GET_SIZE(temp);
6423 if (prec >= 0 && len > prec)
6424 len = prec;
6425 break;
6426
6427 case 'i':
6428 case 'd':
6429 case 'u':
6430 case 'o':
6431 case 'x':
6432 case 'X':
6433 if (c == 'i')
6434 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006435 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006436 temp = formatlong(v, flags, prec, c);
6437 if (!temp)
6438 goto onError;
6439 pbuf = PyUnicode_AS_UNICODE(temp);
6440 len = PyUnicode_GET_SIZE(temp);
6441 /* unbounded ints can always produce
6442 a sign character! */
6443 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006445 else {
6446 pbuf = formatbuf;
6447 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6448 flags, prec, c, v);
6449 if (len < 0)
6450 goto onError;
6451 /* only d conversion is signed */
6452 sign = c == 'd';
6453 }
6454 if (flags & F_ZERO)
6455 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 break;
6457
6458 case 'e':
6459 case 'E':
6460 case 'f':
6461 case 'g':
6462 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006463 pbuf = formatbuf;
6464 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6465 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 if (len < 0)
6467 goto onError;
6468 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006469 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 fill = '0';
6471 break;
6472
6473 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006474 pbuf = formatbuf;
6475 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 if (len < 0)
6477 goto onError;
6478 break;
6479
6480 default:
6481 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006482 "unsupported format character '%c' (0x%x) "
6483 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006484 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006485 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006486 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 goto onError;
6488 }
6489 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006490 if (*pbuf == '-' || *pbuf == '+') {
6491 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 len--;
6493 }
6494 else if (flags & F_SIGN)
6495 sign = '+';
6496 else if (flags & F_BLANK)
6497 sign = ' ';
6498 else
6499 sign = 0;
6500 }
6501 if (width < len)
6502 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006503 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 reslen -= rescnt;
6505 rescnt = width + fmtcnt + 100;
6506 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006507 if (reslen < 0) {
6508 Py_DECREF(result);
6509 return PyErr_NoMemory();
6510 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006511 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 return NULL;
6513 res = PyUnicode_AS_UNICODE(result)
6514 + reslen - rescnt;
6515 }
6516 if (sign) {
6517 if (fill != ' ')
6518 *res++ = sign;
6519 rescnt--;
6520 if (width > len)
6521 width--;
6522 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006523 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6524 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006525 assert(pbuf[1] == c);
6526 if (fill != ' ') {
6527 *res++ = *pbuf++;
6528 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006529 }
Tim Petersfff53252001-04-12 18:38:48 +00006530 rescnt -= 2;
6531 width -= 2;
6532 if (width < 0)
6533 width = 0;
6534 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 if (width > len && !(flags & F_LJUST)) {
6537 do {
6538 --rescnt;
6539 *res++ = fill;
6540 } while (--width > len);
6541 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006542 if (fill == ' ') {
6543 if (sign)
6544 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006545 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006546 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006547 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006548 *res++ = *pbuf++;
6549 *res++ = *pbuf++;
6550 }
6551 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006552 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 res += len;
6554 rescnt -= len;
6555 while (--width >= len) {
6556 --rescnt;
6557 *res++ = ' ';
6558 }
6559 if (dict && (argidx < arglen) && c != '%') {
6560 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006561 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 goto onError;
6563 }
6564 Py_XDECREF(temp);
6565 } /* '%' */
6566 } /* until end */
6567 if (argidx < arglen && !dict) {
6568 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006569 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 goto onError;
6571 }
6572
6573 if (args_owned) {
6574 Py_DECREF(args);
6575 }
6576 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006577 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006578 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return (PyObject *)result;
6580
6581 onError:
6582 Py_XDECREF(result);
6583 Py_DECREF(uformat);
6584 if (args_owned) {
6585 Py_DECREF(args);
6586 }
6587 return NULL;
6588}
6589
6590static PyBufferProcs unicode_as_buffer = {
6591 (getreadbufferproc) unicode_buffer_getreadbuf,
6592 (getwritebufferproc) unicode_buffer_getwritebuf,
6593 (getsegcountproc) unicode_buffer_getsegcount,
6594 (getcharbufferproc) unicode_buffer_getcharbuf,
6595};
6596
Jeremy Hylton938ace62002-07-17 16:30:39 +00006597static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006598unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6599
Tim Peters6d6c1a32001-08-02 04:15:00 +00006600static PyObject *
6601unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6602{
6603 PyObject *x = NULL;
6604 static char *kwlist[] = {"string", "encoding", "errors", 0};
6605 char *encoding = NULL;
6606 char *errors = NULL;
6607
Guido van Rossume023fe02001-08-30 03:12:59 +00006608 if (type != &PyUnicode_Type)
6609 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006610 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6611 kwlist, &x, &encoding, &errors))
6612 return NULL;
6613 if (x == NULL)
6614 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006615 if (encoding == NULL && errors == NULL)
6616 return PyObject_Unicode(x);
6617 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006618 return PyUnicode_FromEncodedObject(x, encoding, errors);
6619}
6620
Guido van Rossume023fe02001-08-30 03:12:59 +00006621static PyObject *
6622unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6623{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006624 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006625 int n;
6626
6627 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6628 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6629 if (tmp == NULL)
6630 return NULL;
6631 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006632 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6633 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00006634 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00006635 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6636 if (pnew->str == NULL) {
6637 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006638 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00006639 return NULL;
6640 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006641 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6642 pnew->length = n;
6643 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006644 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006645 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006646}
6647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006648PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006649"unicode(string [, encoding[, errors]]) -> object\n\
6650\n\
6651Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006652encoding defaults to the current default string encoding.\n\
6653errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006654
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655PyTypeObject PyUnicode_Type = {
6656 PyObject_HEAD_INIT(&PyType_Type)
6657 0, /* ob_size */
6658 "unicode", /* tp_name */
6659 sizeof(PyUnicodeObject), /* tp_size */
6660 0, /* tp_itemsize */
6661 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006662 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006664 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 0, /* tp_setattr */
6666 (cmpfunc) unicode_compare, /* tp_compare */
6667 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006668 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006670 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 (hashfunc) unicode_hash, /* tp_hash*/
6672 0, /* tp_call*/
6673 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006674 PyObject_GenericGetAttr, /* tp_getattro */
6675 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006677 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6678 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006679 unicode_doc, /* tp_doc */
6680 0, /* tp_traverse */
6681 0, /* tp_clear */
6682 0, /* tp_richcompare */
6683 0, /* tp_weaklistoffset */
6684 0, /* tp_iter */
6685 0, /* tp_iternext */
6686 unicode_methods, /* tp_methods */
6687 0, /* tp_members */
6688 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006689 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006690 0, /* tp_dict */
6691 0, /* tp_descr_get */
6692 0, /* tp_descr_set */
6693 0, /* tp_dictoffset */
6694 0, /* tp_init */
6695 0, /* tp_alloc */
6696 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006697 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698};
6699
6700/* Initialize the Unicode implementation */
6701
Thomas Wouters78890102000-07-22 19:25:51 +00006702void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006704 int i;
6705
Fred Drakee4315f52000-05-09 19:53:39 +00006706 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006707 unicode_freelist = NULL;
6708 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006710 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006711 for (i = 0; i < 256; i++)
6712 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006713 if (PyType_Ready(&PyUnicode_Type) < 0)
6714 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715}
6716
6717/* Finalize the Unicode implementation */
6718
6719void
Thomas Wouters78890102000-07-22 19:25:51 +00006720_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006722 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006723 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006725 Py_XDECREF(unicode_empty);
6726 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006727
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006728 for (i = 0; i < 256; i++) {
6729 if (unicode_latin1[i]) {
6730 Py_DECREF(unicode_latin1[i]);
6731 unicode_latin1[i] = NULL;
6732 }
6733 }
6734
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006735 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 PyUnicodeObject *v = u;
6737 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006738 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006739 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006740 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006741 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006743 unicode_freelist = NULL;
6744 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745}