blob: dde6b4704aa46db5a764039a46bf7df0328078bf [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !  Calls
   PyUnicode_Resize() with the address of the variable cast to
   PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 int owned = 0;
456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Greg Steinaf36a3a2000-07-17 09:04:43 +0000513 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000515 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000519 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523}
524
525PyObject *PyUnicode_Decode(const char *s,
526 int size,
527 const char *encoding,
528 const char *errors)
529{
530 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000531
532 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000533 encoding = PyUnicode_GetDefaultEncoding();
534
535 /* Shortcuts for common default encodings */
536 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "latin-1") == 0)
539 return PyUnicode_DecodeLatin1(s, size, errors);
540 else if (strcmp(encoding, "ascii") == 0)
541 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000542
543 /* Decode via the codec registry */
544 buffer = PyBuffer_FromMemory((void *)s, size);
545 if (buffer == NULL)
546 goto onError;
547 unicode = PyCodec_Decode(buffer, encoding, errors);
548 if (unicode == NULL)
549 goto onError;
550 if (!PyUnicode_Check(unicode)) {
551 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000552 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553 unicode->ob_type->tp_name);
554 Py_DECREF(unicode);
555 goto onError;
556 }
557 Py_DECREF(buffer);
558 return unicode;
559
560 onError:
561 Py_XDECREF(buffer);
562 return NULL;
563}
564
565PyObject *PyUnicode_Encode(const Py_UNICODE *s,
566 int size,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v, *unicode;
571
572 unicode = PyUnicode_FromUnicode(s, size);
573 if (unicode == NULL)
574 return NULL;
575 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
576 Py_DECREF(unicode);
577 return v;
578}
579
580PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
581 const char *encoding,
582 const char *errors)
583{
584 PyObject *v;
585
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
Fred Drakee4315f52000-05-09 19:53:39 +0000590
591 if (encoding == NULL)
592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (errors == NULL) {
596 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000597 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "latin-1") == 0)
599 return PyUnicode_AsLatin1String(unicode);
600 else if (strcmp(encoding, "ascii") == 0)
601 return PyUnicode_AsASCIIString(unicode);
602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603
604 /* Encode via the codec registry */
605 v = PyCodec_Encode(unicode, encoding, errors);
606 if (v == NULL)
607 goto onError;
608 /* XXX Should we really enforce this ? */
609 if (!PyString_Check(v)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 v->ob_type->tp_name);
613 Py_DECREF(v);
614 goto onError;
615 }
616 return v;
617
618 onError:
619 return NULL;
620}
621
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
623 const char *errors)
624{
625 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
626
627 if (v)
628 return v;
629 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
630 if (v && errors == NULL)
631 ((PyUnicodeObject *)unicode)->defenc = v;
632 return v;
633}
634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
636{
637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
641 return PyUnicode_AS_UNICODE(unicode);
642
643 onError:
644 return NULL;
645}
646
647int PyUnicode_GetSize(PyObject *unicode)
648{
649 if (!PyUnicode_Check(unicode)) {
650 PyErr_BadArgument();
651 goto onError;
652 }
653 return PyUnicode_GET_SIZE(unicode);
654
655 onError:
656 return -1;
657}
658
Thomas Wouters78890102000-07-22 19:25:51 +0000659const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000660{
661 return unicode_default_encoding;
662}
663
664int PyUnicode_SetDefaultEncoding(const char *encoding)
665{
666 PyObject *v;
667
668 /* Make sure the encoding is valid. As side effect, this also
669 loads the encoding into the codec registry cache. */
670 v = _PyCodec_Lookup(encoding);
671 if (v == NULL)
672 goto onError;
673 Py_DECREF(v);
674 strncpy(unicode_default_encoding,
675 encoding,
676 sizeof(unicode_default_encoding));
677 return 0;
678
679 onError:
680 return -1;
681}
682
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
689
690static
691int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
692 const char *encoding, const char *reason,
693 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
694 PyObject **output, int *outpos, Py_UNICODE **outptr)
695{
696 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
697
698 PyObject *restuple = NULL;
699 PyObject *repunicode = NULL;
700 int outsize = PyUnicode_GET_SIZE(*output);
701 int requiredsize;
702 int newpos;
703 Py_UNICODE *repptr;
704 int repsize;
705 int res = -1;
706
707 if (*errorHandler == NULL) {
708 *errorHandler = PyCodec_LookupError(errors);
709 if (*errorHandler == NULL)
710 goto onError;
711 }
712
713 if (*exceptionObject == NULL) {
714 *exceptionObject = PyUnicodeDecodeError_Create(
715 encoding, input, insize, *startinpos, *endinpos, reason);
716 if (*exceptionObject == NULL)
717 goto onError;
718 }
719 else {
720 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
721 goto onError;
722 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
723 goto onError;
724 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
725 goto onError;
726 }
727
728 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
729 if (restuple == NULL)
730 goto onError;
731 if (!PyTuple_Check(restuple)) {
732 PyErr_Format(PyExc_TypeError, &argparse[4]);
733 goto onError;
734 }
735 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
736 goto onError;
737 if (newpos<0)
738 newpos = 0;
739 else if (newpos>insize)
740 newpos = insize;
741
742 /* need more space? (at least enough for what we
743 have+the replacement+the rest of the string (starting
744 at the new input position), so we won't have to check space
745 when there are no errors in the rest of the string) */
746 repptr = PyUnicode_AS_UNICODE(repunicode);
747 repsize = PyUnicode_GET_SIZE(repunicode);
748 requiredsize = *outpos + repsize + insize-newpos;
749 if (requiredsize > outsize) {
750 if (requiredsize<2*outsize)
751 requiredsize = 2*outsize;
752 if (PyUnicode_Resize(output, requiredsize))
753 goto onError;
754 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
755 }
756 *endinpos = newpos;
757 *inptr = input + newpos;
758 Py_UNICODE_COPY(*outptr, repptr, repsize);
759 *outptr += repsize;
760 *outpos += repsize;
761 /* we made it! */
762 res = 0;
763
764 onError:
765 Py_XDECREF(restuple);
766 return res;
767}
768
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000769/* --- UTF-7 Codec -------------------------------------------------------- */
770
771/* see RFC2152 for details */
772
/* Classification of the 128 ASCII characters for UTF-7:
     0 - not special (can be encoded directly)
     1 - special (cannot be directly encoded)
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* True when character c must not be emitted directly: it is outside
   ASCII, always special, or belongs to one of the optional classes
   whose encoding was requested (encodeWS: whitespace, encodeO: Set O).
   Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || utf7_special[(c)] == 1 || \
     ((encodeWS) && utf7_special[(c)] == 2) || \
     ((encodeO) && utf7_special[(c)] == 3))
796
/* Base64 helpers for the UTF-7 codec.

   B64(n)      character for the low 6 bits of n.
   B64CHAR(c)  true when c is a character of the base64 alphabet.  The
               range test comes first because isalnum() is only defined
               for values representable as unsigned char; it also keeps
               locale-specific non-ASCII "alphanumerics" out.
   UB64(c)     6-bit value of the base64 character c (c must satisfy
               B64CHAR).
   ENCODE      emit base64 characters through `out` while at least 6
               bits are pending in ch; `bits` is reduced accordingly.
               Expands to a bare while statement.

   All of these evaluate their arguments more than once. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    ((((c) & ~0x7f) == 0) && (isalnum(c) || (c) == '+' || (c) == '/'))
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }
807
/* Drain complete 16-bit units from the bit accumulator `ch` while at
   least 16 bits are pending; `bits` is reduced accordingly.

   A unit in the range 0xDC00..0xDFFF is not stored: `surrogate` is
   set, `errmsg` is assigned and control jumps to the label utf7Error,
   both of which must exist at the point of use.  While `surrogate` is
   set, the next unit is dropped silently and the flag is cleared.
   Every other unit is written through `out`.

   Expands to a bare while statement. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0; \
        } \
        else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } \
        else { \
            *(out)++ = outCh; \
        } \
    }
826
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000827PyObject *PyUnicode_DecodeUTF7(const char *s,
828 int size,
829 const char *errors)
830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000831 const char *starts = s;
832 int startinpos;
833 int endinpos;
834 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000835 const char *e;
836 PyUnicodeObject *unicode;
837 Py_UNICODE *p;
838 const char *errmsg = "";
839 int inShift = 0;
840 unsigned int bitsleft = 0;
841 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000842 int surrogate = 0;
843 PyObject *errorHandler = NULL;
844 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000845
846 unicode = _PyUnicode_New(size);
847 if (!unicode)
848 return NULL;
849 if (size == 0)
850 return (PyObject *)unicode;
851
852 p = unicode->str;
853 e = s + size;
854
855 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000856 Py_UNICODE ch;
857 restart:
858 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000859
860 if (inShift) {
861 if ((ch == '-') || !B64CHAR(ch)) {
862 inShift = 0;
863 s++;
864
865 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
866 if (bitsleft >= 6) {
867 /* The shift sequence has a partial character in it. If
868 bitsleft < 6 then we could just classify it as padding
869 but that is not the case here */
870
871 errmsg = "partial character in shift sequence";
872 goto utf7Error;
873 }
874 /* According to RFC2152 the remaining bits should be zero. We
875 choose to signal an error/insert a replacement character
876 here so indicate the potential of a misencoded character. */
877
878 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
879 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
880 errmsg = "non-zero padding bits in shift sequence";
881 goto utf7Error;
882 }
883
884 if (ch == '-') {
885 if ((s < e) && (*(s) == '-')) {
886 *p++ = '-';
887 inShift = 1;
888 }
889 } else if (SPECIAL(ch,0,0)) {
890 errmsg = "unexpected special character";
891 goto utf7Error;
892 } else {
893 *p++ = ch;
894 }
895 } else {
896 charsleft = (charsleft << 6) | UB64(ch);
897 bitsleft += 6;
898 s++;
899 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
900 }
901 }
902 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000903 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000904 s++;
905 if (s < e && *s == '-') {
906 s++;
907 *p++ = '+';
908 } else
909 {
910 inShift = 1;
911 bitsleft = 0;
912 }
913 }
914 else if (SPECIAL(ch,0,0)) {
915 errmsg = "unexpected special character";
916 s++;
917 goto utf7Error;
918 }
919 else {
920 *p++ = ch;
921 s++;
922 }
923 continue;
924 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000925 outpos = p-PyUnicode_AS_UNICODE(unicode);
926 endinpos = s-starts;
927 if (unicode_decode_call_errorhandler(
928 errors, &errorHandler,
929 "utf7", errmsg,
930 starts, size, &startinpos, &endinpos, &exc, &s,
931 (PyObject **)&unicode, &outpos, &p))
932 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000933 }
934
935 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000936 outpos = p-PyUnicode_AS_UNICODE(unicode);
937 endinpos = size;
938 if (unicode_decode_call_errorhandler(
939 errors, &errorHandler,
940 "utf7", "unterminated shift sequence",
941 starts, size, &startinpos, &endinpos, &exc, &s,
942 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000943 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000944 if (s < e)
945 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 }
947
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000948 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 goto onError;
950
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 Py_XDECREF(errorHandler);
952 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 return (PyObject *)unicode;
954
955onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000956 Py_XDECREF(errorHandler);
957 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000958 Py_DECREF(unicode);
959 return NULL;
960}
961
962
963PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
964 int size,
965 int encodeSetO,
966 int encodeWhiteSpace,
967 const char *errors)
968{
969 PyObject *v;
970 /* It might be possible to tighten this worst case */
971 unsigned int cbAllocated = 5 * size;
972 int inShift = 0;
973 int i = 0;
974 unsigned int bitsleft = 0;
975 unsigned long charsleft = 0;
976 char * out;
977 char * start;
978
979 if (size == 0)
980 return PyString_FromStringAndSize(NULL, 0);
981
982 v = PyString_FromStringAndSize(NULL, cbAllocated);
983 if (v == NULL)
984 return NULL;
985
986 start = out = PyString_AS_STRING(v);
987 for (;i < size; ++i) {
988 Py_UNICODE ch = s[i];
989
990 if (!inShift) {
991 if (ch == '+') {
992 *out++ = '+';
993 *out++ = '-';
994 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
995 charsleft = ch;
996 bitsleft = 16;
997 *out++ = '+';
998 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
999 inShift = bitsleft > 0;
1000 } else {
1001 *out++ = (char) ch;
1002 }
1003 } else {
1004 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1005 *out++ = B64(charsleft << (6-bitsleft));
1006 charsleft = 0;
1007 bitsleft = 0;
1008 /* Characters not in the BASE64 set implicitly unshift the sequence
1009 so no '-' is required, except if the character is itself a '-' */
1010 if (B64CHAR(ch) || ch == '-') {
1011 *out++ = '-';
1012 }
1013 inShift = 0;
1014 *out++ = (char) ch;
1015 } else {
1016 bitsleft += 16;
1017 charsleft = (charsleft << 16) | ch;
1018 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1019
1020 /* If the next character is special then we dont' need to terminate
1021 the shift sequence. If the next character is not a BASE64 character
1022 or '-' then the shift sequence will be terminated implicitly and we
1023 don't have to insert a '-'. */
1024
1025 if (bitsleft == 0) {
1026 if (i + 1 < size) {
1027 Py_UNICODE ch2 = s[i+1];
1028
1029 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1030
1031 } else if (B64CHAR(ch2) || ch2 == '-') {
1032 *out++ = '-';
1033 inShift = 0;
1034 } else {
1035 inShift = 0;
1036 }
1037
1038 }
1039 else {
1040 *out++ = '-';
1041 inShift = 0;
1042 }
1043 }
1044 }
1045 }
1046 }
1047 if (bitsleft) {
1048 *out++= B64(charsleft << (6-bitsleft) );
1049 *out++ = '-';
1050 }
1051
Tim Peters5de98422002-04-27 18:44:32 +00001052 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 return v;
1054}
1055
/* The helper macros used by the UTF-7 decoder and encoder above are not
   needed past this point; remove them again. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063/* --- UTF-8 Codec -------------------------------------------------------- */
1064
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed by
   that byte.  A zero entry marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 - 0xBF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 - 0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 - 0xEF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 - 0xFF */
};
1086
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087PyObject *PyUnicode_DecodeUTF8(const char *s,
1088 int size,
1089 const char *errors)
1090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001091 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 int startinpos;
1094 int endinpos;
1095 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 const char *e;
1097 PyUnicodeObject *unicode;
1098 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001099 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001100 PyObject *errorHandler = NULL;
1101 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102
1103 /* Note: size will always be longer than the resulting Unicode
1104 character count */
1105 unicode = _PyUnicode_New(size);
1106 if (!unicode)
1107 return NULL;
1108 if (size == 0)
1109 return (PyObject *)unicode;
1110
1111 /* Unpack UTF-8 encoded data */
1112 p = unicode->str;
1113 e = s + size;
1114
1115 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001116 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117
1118 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001119 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 s++;
1121 continue;
1122 }
1123
1124 n = utf8_code_length[ch];
1125
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001126 if (s + n > e) {
1127 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001128 startinpos = s-starts;
1129 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001130 goto utf8Error;
1131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132
1133 switch (n) {
1134
1135 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001136 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001137 startinpos = s-starts;
1138 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001139 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
1141 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001143 startinpos = s-starts;
1144 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001145 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
1147 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001148 if ((s[1] & 0xc0) != 0x80) {
1149 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001150 startinpos = s-starts;
1151 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 goto utf8Error;
1153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001156 startinpos = s-starts;
1157 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "illegal encoding";
1159 goto utf8Error;
1160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 break;
1164
1165 case 3:
1166 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001167 (s[2] & 0xc0) != 0x80) {
1168 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001169 startinpos = s-starts;
1170 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 goto utf8Error;
1172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001174 if (ch < 0x0800) {
1175 /* Note: UTF-8 encodings of surrogates are considered
1176 legal UTF-8 sequences;
1177
1178 XXX For wide builds (UCS-4) we should probably try
1179 to recombine the surrogates into a single code
1180 unit.
1181 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001182 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001183 startinpos = s-starts;
1184 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001185 goto utf8Error;
1186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001188 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 break;
1190
1191 case 4:
1192 if ((s[1] & 0xc0) != 0x80 ||
1193 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001194 (s[3] & 0xc0) != 0x80) {
1195 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001196 startinpos = s-starts;
1197 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 goto utf8Error;
1199 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001200 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1201 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1202 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001203 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001205 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001206 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001207 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001209 startinpos = s-starts;
1210 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 goto utf8Error;
1212 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001213#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001214 *p++ = (Py_UNICODE)ch;
1215#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 /* compute and append the two surrogates: */
1217
1218 /* translate from 10000..10FFFF to 0..FFFF */
1219 ch -= 0x10000;
1220
1221 /* high surrogate = top 10 bits added to D800 */
1222 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1223
1224 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001225 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001226#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 break;
1228
1229 default:
1230 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001232 startinpos = s-starts;
1233 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 }
1236 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001237 continue;
1238
1239 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001240 outpos = p-PyUnicode_AS_UNICODE(unicode);
1241 if (unicode_decode_call_errorhandler(
1242 errors, &errorHandler,
1243 "utf8", errmsg,
1244 starts, size, &startinpos, &endinpos, &exc, &s,
1245 (PyObject **)&unicode, &outpos, &p))
1246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 }
1248
1249 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001250 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 goto onError;
1252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253 Py_XDECREF(errorHandler);
1254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return (PyObject *)unicode;
1256
1257onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001258 Py_XDECREF(errorHandler);
1259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 Py_DECREF(unicode);
1261 return NULL;
1262}
1263
Tim Peters602f7402002-04-27 18:03:26 +00001264/* Allocation strategy: if the string is short, convert into a stack buffer
1265 and allocate exactly as much space needed at the end. Else allocate the
1266 maximum possible needed (4 result bytes per Unicode character), and return
1267 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001268*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001269PyObject *
1270PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1271 int size,
1272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273{
Tim Peters602f7402002-04-27 18:03:26 +00001274#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001275
Tim Peters602f7402002-04-27 18:03:26 +00001276 int i; /* index into s of next input byte */
1277 PyObject *v; /* result string object */
1278 char *p; /* next free byte in output buffer */
1279 int nallocated; /* number of result bytes allocated */
1280 int nneeded; /* number of result bytes needed */
1281 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001282
Tim Peters602f7402002-04-27 18:03:26 +00001283 assert(s != NULL);
1284 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285
Tim Peters602f7402002-04-27 18:03:26 +00001286 if (size <= MAX_SHORT_UNICHARS) {
1287 /* Write into the stack buffer; nallocated can't overflow.
1288 * At the end, we'll allocate exactly as much heap space as it
1289 * turns out we need.
1290 */
1291 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1292 v = NULL; /* will allocate after we're done */
1293 p = stackbuf;
1294 }
1295 else {
1296 /* Overallocate on the heap, and give the excess back at the end. */
1297 nallocated = size * 4;
1298 if (nallocated / 4 != size) /* overflow! */
1299 return PyErr_NoMemory();
1300 v = PyString_FromStringAndSize(NULL, nallocated);
1301 if (v == NULL)
1302 return NULL;
1303 p = PyString_AS_STRING(v);
1304 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001305
Tim Peters602f7402002-04-27 18:03:26 +00001306 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001307 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001308
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001309 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001310 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001312
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001314 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001315 *p++ = (char)(0xc0 | (ch >> 6));
1316 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001317 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001318 else {
Tim Peters602f7402002-04-27 18:03:26 +00001319 /* Encode UCS2 Unicode ordinals */
1320 if (ch < 0x10000) {
1321 /* Special case: check for high surrogate */
1322 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1323 Py_UCS4 ch2 = s[i];
1324 /* Check for low surrogate and combine the two to
1325 form a UCS4 value */
1326 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001327 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001328 i++;
1329 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 }
Tim Peters602f7402002-04-27 18:03:26 +00001331 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001334 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1335 *p++ = (char)(0x80 | (ch & 0x3f));
1336 continue;
1337 }
1338encodeUCS4:
1339 /* Encode UCS4 Unicode ordinals */
1340 *p++ = (char)(0xf0 | (ch >> 18));
1341 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1343 *p++ = (char)(0x80 | (ch & 0x3f));
1344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001346
Tim Peters602f7402002-04-27 18:03:26 +00001347 if (v == NULL) {
1348 /* This was stack allocated. */
1349 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1350 assert(nneeded <= nallocated);
1351 v = PyString_FromStringAndSize(stackbuf, nneeded);
1352 }
1353 else {
1354 /* Cut back to size actually needed. */
1355 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1356 assert(nneeded <= nallocated);
1357 _PyString_Resize(&v, nneeded);
1358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001360
Tim Peters602f7402002-04-27 18:03:26 +00001361#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362}
1363
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1365{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 if (!PyUnicode_Check(unicode)) {
1367 PyErr_BadArgument();
1368 return NULL;
1369 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001370 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1371 PyUnicode_GET_SIZE(unicode),
1372 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373}
1374
1375/* --- UTF-16 Codec ------------------------------------------------------- */
1376
Tim Peters772747b2001-08-09 22:21:55 +00001377PyObject *
1378PyUnicode_DecodeUTF16(const char *s,
1379 int size,
1380 const char *errors,
1381 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 const char *starts = s;
1384 int startinpos;
1385 int endinpos;
1386 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 PyUnicodeObject *unicode;
1388 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001389 const unsigned char *q, *e;
1390 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001391 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001392 /* Offsets from q for retrieving byte pairs in the right order. */
1393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1394 int ihi = 1, ilo = 0;
1395#else
1396 int ihi = 0, ilo = 1;
1397#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001398 PyObject *errorHandler = NULL;
1399 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400
1401 /* Note: size will always be longer than the resulting Unicode
1402 character count */
1403 unicode = _PyUnicode_New(size);
1404 if (!unicode)
1405 return NULL;
1406 if (size == 0)
1407 return (PyObject *)unicode;
1408
1409 /* Unpack UTF-16 encoded data */
1410 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001411 q = (unsigned char *)s;
1412 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413
1414 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001415 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001417 /* Check for BOM marks (U+FEFF) in the input and adjust current
1418 byte order setting accordingly. In native mode, the leading BOM
1419 mark is skipped, in all other modes, it is copied to the output
1420 stream as-is (giving a ZWNBSP character). */
1421 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001422 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001424 if (bom == 0xFEFF) {
1425 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001426 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001427 }
1428 else if (bom == 0xFFFE) {
1429 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001430 bo = 1;
1431 }
1432#else
Tim Peters772747b2001-08-09 22:21:55 +00001433 if (bom == 0xFEFF) {
1434 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001435 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001436 }
1437 else if (bom == 0xFFFE) {
1438 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439 bo = -1;
1440 }
1441#endif
1442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443
Tim Peters772747b2001-08-09 22:21:55 +00001444 if (bo == -1) {
1445 /* force LE */
1446 ihi = 1;
1447 ilo = 0;
1448 }
1449 else if (bo == 1) {
1450 /* force BE */
1451 ihi = 0;
1452 ilo = 1;
1453 }
1454
1455 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 Py_UNICODE ch;
1457 /* remaing bytes at the end? (size should be even) */
1458 if (e-q<2) {
1459 errmsg = "truncated data";
1460 startinpos = ((const char *)q)-starts;
1461 endinpos = ((const char *)e)-starts;
1462 goto utf16Error;
1463 /* The remaining input chars are ignored if the callback
1464 chooses to skip the input */
1465 }
1466 ch = (q[ihi] << 8) | q[ilo];
1467
Tim Peters772747b2001-08-09 22:21:55 +00001468 q += 2;
1469
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (ch < 0xD800 || ch > 0xDFFF) {
1471 *p++ = ch;
1472 continue;
1473 }
1474
1475 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001476 if (q >= e) {
1477 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 startinpos = (((const char *)q)-2)-starts;
1479 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 goto utf16Error;
1481 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001482 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001483 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1484 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001485 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001486#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001487 *p++ = ch;
1488 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489#else
1490 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001491#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001492 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493 }
1494 else {
1495 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 startinpos = (((const char *)q)-4)-starts;
1497 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001498 goto utf16Error;
1499 }
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001502 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 startinpos = (((const char *)q)-2)-starts;
1504 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001505 /* Fall through to report the error */
1506
1507 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001508 outpos = p-PyUnicode_AS_UNICODE(unicode);
1509 if (unicode_decode_call_errorhandler(
1510 errors, &errorHandler,
1511 "utf16", errmsg,
1512 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1513 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001514 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 }
1516
1517 if (byteorder)
1518 *byteorder = bo;
1519
1520 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001521 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 goto onError;
1523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 Py_XDECREF(errorHandler);
1525 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return (PyObject *)unicode;
1527
1528onError:
1529 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 Py_XDECREF(errorHandler);
1531 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 return NULL;
1533}
1534
Tim Peters772747b2001-08-09 22:21:55 +00001535PyObject *
1536PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1537 int size,
1538 const char *errors,
1539 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540{
1541 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001542 unsigned char *p;
1543 int i, pairs;
1544 /* Offsets from p for storing byte pairs in the right order. */
1545#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1546 int ihi = 1, ilo = 0;
1547#else
1548 int ihi = 0, ilo = 1;
1549#endif
1550
1551#define STORECHAR(CH) \
1552 do { \
1553 p[ihi] = ((CH) >> 8) & 0xff; \
1554 p[ilo] = (CH) & 0xff; \
1555 p += 2; \
1556 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001558 for (i = pairs = 0; i < size; i++)
1559 if (s[i] >= 0x10000)
1560 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001562 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (v == NULL)
1564 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
Tim Peters772747b2001-08-09 22:21:55 +00001566 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001568 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001569 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001570 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001571
1572 if (byteorder == -1) {
1573 /* force LE */
1574 ihi = 1;
1575 ilo = 0;
1576 }
1577 else if (byteorder == 1) {
1578 /* force BE */
1579 ihi = 0;
1580 ilo = 1;
1581 }
1582
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583 while (size-- > 0) {
1584 Py_UNICODE ch = *s++;
1585 Py_UNICODE ch2 = 0;
1586 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001587 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1588 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 }
Tim Peters772747b2001-08-09 22:21:55 +00001590 STORECHAR(ch);
1591 if (ch2)
1592 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001595#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596}
1597
1598PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1599{
1600 if (!PyUnicode_Check(unicode)) {
1601 PyErr_BadArgument();
1602 return NULL;
1603 }
1604 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1605 PyUnicode_GET_SIZE(unicode),
1606 NULL,
1607 0);
1608}
1609
1610/* --- Unicode Escape Codec ----------------------------------------------- */
1611
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the unicode-escape
   decoder below -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001613
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1615 int size,
1616 const char *errors)
1617{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 const char *starts = s;
1619 int startinpos;
1620 int endinpos;
1621 int outpos;
1622 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001625 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626 char* message;
1627 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 PyObject *errorHandler = NULL;
1629 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001630
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631 /* Escaped strings will always be longer than the resulting
1632 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 length after conversion to the true value.
1634 (but if the error callback returns a long replacement string
1635 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636 v = _PyUnicode_New(size);
1637 if (v == NULL)
1638 goto onError;
1639 if (size == 0)
1640 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001644
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645 while (s < end) {
1646 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001647 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649
1650 /* Non-escape characters are interpreted as Unicode ordinals */
1651 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001652 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 continue;
1654 }
1655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 /* \ - Escapes */
1658 s++;
1659 switch (*s++) {
1660
1661 /* \x escapes */
1662 case '\n': break;
1663 case '\\': *p++ = '\\'; break;
1664 case '\'': *p++ = '\''; break;
1665 case '\"': *p++ = '\"'; break;
1666 case 'b': *p++ = '\b'; break;
1667 case 'f': *p++ = '\014'; break; /* FF */
1668 case 't': *p++ = '\t'; break;
1669 case 'n': *p++ = '\n'; break;
1670 case 'r': *p++ = '\r'; break;
1671 case 'v': *p++ = '\013'; break; /* VT */
1672 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1673
1674 /* \OOO (octal) escapes */
1675 case '0': case '1': case '2': case '3':
1676 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001677 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001679 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001681 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001683 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 break;
1685
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686 /* hex escapes */
1687 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001689 digits = 2;
1690 message = "truncated \\xXX escape";
1691 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692
Fredrik Lundhccc74732001-02-18 22:13:49 +00001693 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001695 digits = 4;
1696 message = "truncated \\uXXXX escape";
1697 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698
Fredrik Lundhccc74732001-02-18 22:13:49 +00001699 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001700 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001701 digits = 8;
1702 message = "truncated \\UXXXXXXXX escape";
1703 hexescape:
1704 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001705 outpos = p-PyUnicode_AS_UNICODE(v);
1706 if (s+digits>end) {
1707 endinpos = size;
1708 if (unicode_decode_call_errorhandler(
1709 errors, &errorHandler,
1710 "unicodeescape", "end of string in escape sequence",
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 (PyObject **)&v, &outpos, &p))
1713 goto onError;
1714 goto nextByte;
1715 }
1716 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001718 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 endinpos = (s+i+1)-starts;
1720 if (unicode_decode_call_errorhandler(
1721 errors, &errorHandler,
1722 "unicodeescape", message,
1723 starts, size, &startinpos, &endinpos, &exc, &s,
1724 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001725 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001726 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001727 }
1728 chr = (chr<<4) & ~0xF;
1729 if (c >= '0' && c <= '9')
1730 chr += c - '0';
1731 else if (c >= 'a' && c <= 'f')
1732 chr += 10 + c - 'a';
1733 else
1734 chr += 10 + c - 'A';
1735 }
1736 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001737 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 /* _decoding_error will have already written into the
1739 target buffer. */
1740 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001741 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001742 /* when we get here, chr is a 32-bit unicode character */
1743 if (chr <= 0xffff)
1744 /* UCS-2 character */
1745 *p++ = (Py_UNICODE) chr;
1746 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001747 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001748 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001749#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 *p++ = chr;
1751#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001752 chr -= 0x10000L;
1753 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001754 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 endinpos = s-starts;
1758 outpos = p-PyUnicode_AS_UNICODE(v);
1759 if (unicode_decode_call_errorhandler(
1760 errors, &errorHandler,
1761 "unicodeescape", "illegal Unicode character",
1762 starts, size, &startinpos, &endinpos, &exc, &s,
1763 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001764 goto onError;
1765 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001766 break;
1767
1768 /* \N{name} */
1769 case 'N':
1770 message = "malformed \\N character escape";
1771 if (ucnhash_CAPI == NULL) {
1772 /* load the unicode data module */
1773 PyObject *m, *v;
1774 m = PyImport_ImportModule("unicodedata");
1775 if (m == NULL)
1776 goto ucnhashError;
1777 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1778 Py_DECREF(m);
1779 if (v == NULL)
1780 goto ucnhashError;
1781 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1782 Py_DECREF(v);
1783 if (ucnhash_CAPI == NULL)
1784 goto ucnhashError;
1785 }
1786 if (*s == '{') {
1787 const char *start = s+1;
1788 /* look for the closing brace */
1789 while (*s != '}' && s < end)
1790 s++;
1791 if (s > start && s < end && *s == '}') {
1792 /* found a name. look it up in the unicode database */
1793 message = "unknown Unicode character name";
1794 s++;
1795 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1796 goto store;
1797 }
1798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 endinpos = s-starts;
1800 outpos = p-PyUnicode_AS_UNICODE(v);
1801 if (unicode_decode_call_errorhandler(
1802 errors, &errorHandler,
1803 "unicodeescape", message,
1804 starts, size, &startinpos, &endinpos, &exc, &s,
1805 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001807 break;
1808
1809 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001810 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 message = "\\ at end of string";
1812 s--;
1813 endinpos = s-starts;
1814 outpos = p-PyUnicode_AS_UNICODE(v);
1815 if (unicode_decode_call_errorhandler(
1816 errors, &errorHandler,
1817 "unicodeescape", message,
1818 starts, size, &startinpos, &endinpos, &exc, &s,
1819 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001820 goto onError;
1821 }
1822 else {
1823 *p++ = '\\';
1824 *p++ = (unsigned char)s[-1];
1825 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001826 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 nextByte:
1829 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1832 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001834
Fredrik Lundhccc74732001-02-18 22:13:49 +00001835ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001836 PyErr_SetString(
1837 PyExc_UnicodeError,
1838 "\\N escapes not supported (can't load unicodedata module)"
1839 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001840 Py_XDECREF(errorHandler);
1841 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001842 return NULL;
1843
Fredrik Lundhccc74732001-02-18 22:13:49 +00001844onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 Py_XDECREF(errorHandler);
1847 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 return NULL;
1849}
1850
1851/* Return a Unicode-Escape string version of the Unicode object.
1852
1853 If quotes is true, the string is enclosed in u"" or u'' quotes as
1854 appropriate.
1855
1856*/
1857
Barry Warsaw51ac5802000-03-20 16:36:48 +00001858static const Py_UNICODE *findchar(const Py_UNICODE *s,
1859 int size,
1860 Py_UNICODE ch);
1861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862static
1863PyObject *unicodeescape_string(const Py_UNICODE *s,
1864 int size,
1865 int quotes)
1866{
1867 PyObject *repr;
1868 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001870 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871
1872 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1873 if (repr == NULL)
1874 return NULL;
1875
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001876 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877
1878 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 *p++ = 'u';
1880 *p++ = (findchar(s, size, '\'') &&
1881 !findchar(s, size, '"')) ? '"' : '\'';
1882 }
1883 while (size-- > 0) {
1884 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001887 if (quotes &&
1888 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 *p++ = '\\';
1890 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001891 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001893
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001894#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001895 /* Map 21-bit characters to '\U00xxxxxx' */
1896 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001897 int offset = p - PyString_AS_STRING(repr);
1898
1899 /* Resize the string if necessary */
1900 if (offset + 12 > PyString_GET_SIZE(repr)) {
1901 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001902 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903 p = PyString_AS_STRING(repr) + offset;
1904 }
1905
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001906 *p++ = '\\';
1907 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001908 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1909 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1910 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1911 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1912 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1913 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1914 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 *p++ = hexdigit[ch & 0x0000000F];
1916 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001918#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001919 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1920 else if (ch >= 0xD800 && ch < 0xDC00) {
1921 Py_UNICODE ch2;
1922 Py_UCS4 ucs;
1923
1924 ch2 = *s++;
1925 size--;
1926 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1927 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1928 *p++ = '\\';
1929 *p++ = 'U';
1930 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1931 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1932 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1933 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1934 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1935 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1936 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1937 *p++ = hexdigit[ucs & 0x0000000F];
1938 continue;
1939 }
1940 /* Fall through: isolated surrogates are copied as-is */
1941 s--;
1942 size++;
1943 }
1944
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001946 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 *p++ = '\\';
1948 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001949 *p++ = hexdigit[(ch >> 12) & 0x000F];
1950 *p++ = hexdigit[(ch >> 8) & 0x000F];
1951 *p++ = hexdigit[(ch >> 4) & 0x000F];
1952 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001954
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001955 /* Map special whitespace to '\t', \n', '\r' */
1956 else if (ch == '\t') {
1957 *p++ = '\\';
1958 *p++ = 't';
1959 }
1960 else if (ch == '\n') {
1961 *p++ = '\\';
1962 *p++ = 'n';
1963 }
1964 else if (ch == '\r') {
1965 *p++ = '\\';
1966 *p++ = 'r';
1967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001968
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001969 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001970 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001972 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001973 *p++ = hexdigit[(ch >> 4) & 0x000F];
1974 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001976
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 /* Copy everything else as-is */
1978 else
1979 *p++ = (char) ch;
1980 }
1981 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001985 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 return repr;
1987}
1988
1989PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1990 int size)
1991{
1992 return unicodeescape_string(s, size, 0);
1993}
1994
1995PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1996{
1997 if (!PyUnicode_Check(unicode)) {
1998 PyErr_BadArgument();
1999 return NULL;
2000 }
2001 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2002 PyUnicode_GET_SIZE(unicode));
2003}
2004
2005/* --- Raw Unicode Escape Codec ------------------------------------------- */
2006
2007PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2008 int size,
2009 const char *errors)
2010{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 const char *starts = s;
2012 int startinpos;
2013 int endinpos;
2014 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 const char *end;
2018 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 /* Escaped strings will always be longer than the resulting
2023 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 length after conversion to the true value. (But decoding error
2025 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 v = _PyUnicode_New(size);
2027 if (v == NULL)
2028 goto onError;
2029 if (size == 0)
2030 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 end = s + size;
2033 while (s < end) {
2034 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002035 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int i;
2037
2038 /* Non-escape characters are interpreted as Unicode ordinals */
2039 if (*s != '\\') {
2040 *p++ = (unsigned char)*s++;
2041 continue;
2042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* \u-escapes are only interpreted iff the number of leading
2046 backslashes if odd */
2047 bs = s;
2048 for (;s < end;) {
2049 if (*s != '\\')
2050 break;
2051 *p++ = (unsigned char)*s++;
2052 }
2053 if (((s - bs) & 1) == 0 ||
2054 s >= end ||
2055 *s != 'u') {
2056 continue;
2057 }
2058 p--;
2059 s++;
2060
2061 /* \uXXXX with 4 hex digits */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 outpos = p-PyUnicode_AS_UNICODE(v);
2063 for (x = 0, i = 0; i < 4; ++i, ++s) {
2064 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 endinpos = s-starts;
2067 if (unicode_decode_call_errorhandler(
2068 errors, &errorHandler,
2069 "rawunicodeescape", "truncated \\uXXXX",
2070 starts, size, &startinpos, &endinpos, &exc, &s,
2071 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002073 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 }
2075 x = (x<<4) & ~0xF;
2076 if (c >= '0' && c <= '9')
2077 x += c - '0';
2078 else if (c >= 'a' && c <= 'f')
2079 x += 10 + c - 'a';
2080 else
2081 x += 10 + c - 'A';
2082 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 *p++ = x;
2084 nextByte:
2085 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002087 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 Py_XDECREF(errorHandler);
2090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 return (PyObject *)v;
2092
2093 onError:
2094 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 Py_XDECREF(errorHandler);
2096 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 return NULL;
2098}
2099
2100PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2101 int size)
2102{
2103 PyObject *repr;
2104 char *p;
2105 char *q;
2106
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002107 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
2109 repr = PyString_FromStringAndSize(NULL, 6 * size);
2110 if (repr == NULL)
2111 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002112 if (size == 0)
2113 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114
2115 p = q = PyString_AS_STRING(repr);
2116 while (size-- > 0) {
2117 Py_UNICODE ch = *s++;
2118 /* Map 16-bit characters to '\uxxxx' */
2119 if (ch >= 256) {
2120 *p++ = '\\';
2121 *p++ = 'u';
2122 *p++ = hexdigit[(ch >> 12) & 0xf];
2123 *p++ = hexdigit[(ch >> 8) & 0xf];
2124 *p++ = hexdigit[(ch >> 4) & 0xf];
2125 *p++ = hexdigit[ch & 15];
2126 }
2127 /* Copy everything else as-is */
2128 else
2129 *p++ = (char) ch;
2130 }
2131 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002132 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 return repr;
2134}
2135
2136PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2137{
2138 if (!PyUnicode_Check(unicode)) {
2139 PyErr_BadArgument();
2140 return NULL;
2141 }
2142 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2143 PyUnicode_GET_SIZE(unicode));
2144}
2145
2146/* --- Latin-1 Codec ------------------------------------------------------ */
2147
2148PyObject *PyUnicode_DecodeLatin1(const char *s,
2149 int size,
2150 const char *errors)
2151{
2152 PyUnicodeObject *v;
2153 Py_UNICODE *p;
2154
2155 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 if (size == 1 && *(unsigned char*)s < 256) {
2157 Py_UNICODE r = *(unsigned char*)s;
2158 return PyUnicode_FromUnicode(&r, 1);
2159 }
2160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 v = _PyUnicode_New(size);
2162 if (v == NULL)
2163 goto onError;
2164 if (size == 0)
2165 return (PyObject *)v;
2166 p = PyUnicode_AS_UNICODE(v);
2167 while (size-- > 0)
2168 *p++ = (unsigned char)*s++;
2169 return (PyObject *)v;
2170
2171 onError:
2172 Py_XDECREF(v);
2173 return NULL;
2174}
2175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176/* create or adjust a UnicodeEncodeError */
2177static void make_encode_exception(PyObject **exceptionObject,
2178 const char *encoding,
2179 const Py_UNICODE *unicode, int size,
2180 int startpos, int endpos,
2181 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 if (*exceptionObject == NULL) {
2184 *exceptionObject = PyUnicodeEncodeError_Create(
2185 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 }
2187 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002188 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2189 goto onError;
2190 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2191 goto onError;
2192 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2193 goto onError;
2194 return;
2195 onError:
2196 Py_DECREF(*exceptionObject);
2197 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 }
2199}
2200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002201/* raises a UnicodeEncodeError */
2202static void raise_encode_exception(PyObject **exceptionObject,
2203 const char *encoding,
2204 const Py_UNICODE *unicode, int size,
2205 int startpos, int endpos,
2206 const char *reason)
2207{
2208 make_encode_exception(exceptionObject,
2209 encoding, unicode, size, startpos, endpos, reason);
2210 if (*exceptionObject != NULL)
2211 PyCodec_StrictErrors(*exceptionObject);
2212}
2213
2214/* error handling callback helper:
2215 build arguments, call the callback and check the arguments,
2216 put the result into newpos and return the replacement string, which
2217 has to be freed by the caller */
2218static PyObject *unicode_encode_call_errorhandler(const char *errors,
2219 PyObject **errorHandler,
2220 const char *encoding, const char *reason,
2221 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2222 int startpos, int endpos,
2223 int *newpos)
2224{
2225 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2226
2227 PyObject *restuple;
2228 PyObject *resunicode;
2229
2230 if (*errorHandler == NULL) {
2231 *errorHandler = PyCodec_LookupError(errors);
2232 if (*errorHandler == NULL)
2233 return NULL;
2234 }
2235
2236 make_encode_exception(exceptionObject,
2237 encoding, unicode, size, startpos, endpos, reason);
2238 if (*exceptionObject == NULL)
2239 return NULL;
2240
2241 restuple = PyObject_CallFunctionObjArgs(
2242 *errorHandler, *exceptionObject, NULL);
2243 if (restuple == NULL)
2244 return NULL;
2245 if (!PyTuple_Check(restuple)) {
2246 PyErr_Format(PyExc_TypeError, &argparse[4]);
2247 Py_DECREF(restuple);
2248 return NULL;
2249 }
2250 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2251 &resunicode, newpos)) {
2252 Py_DECREF(restuple);
2253 return NULL;
2254 }
2255 if (*newpos<0)
2256 *newpos = 0;
2257 else if (*newpos>size)
2258 *newpos = size;
2259 Py_INCREF(resunicode);
2260 Py_DECREF(restuple);
2261 return resunicode;
2262}
2263
2264static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2265 int size,
2266 const char *errors,
2267 int limit)
2268{
2269 /* output object */
2270 PyObject *res;
2271 /* pointers to the beginning and end+1 of input */
2272 const Py_UNICODE *startp = p;
2273 const Py_UNICODE *endp = p + size;
2274 /* pointer to the beginning of the unencodable characters */
2275 /* const Py_UNICODE *badp = NULL; */
2276 /* pointer into the output */
2277 char *str;
2278 /* current output position */
2279 int respos = 0;
2280 int ressize;
2281 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2282 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2283 PyObject *errorHandler = NULL;
2284 PyObject *exc = NULL;
2285 /* the following variable is used for caching string comparisons
2286 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2287 int known_errorHandler = -1;
2288
2289 /* allocate enough for a simple encoding without
2290 replacements, if we need more, we'll resize */
2291 res = PyString_FromStringAndSize(NULL, size);
2292 if (res == NULL)
2293 goto onError;
2294 if (size == 0)
2295 return res;
2296 str = PyString_AS_STRING(res);
2297 ressize = size;
2298
2299 while (p<endp) {
2300 Py_UNICODE c = *p;
2301
2302 /* can we encode this? */
2303 if (c<limit) {
2304 /* no overflow check, because we know that the space is enough */
2305 *str++ = (char)c;
2306 ++p;
2307 }
2308 else {
2309 int unicodepos = p-startp;
2310 int requiredsize;
2311 PyObject *repunicode;
2312 int repsize;
2313 int newpos;
2314 int respos;
2315 Py_UNICODE *uni2;
2316 /* startpos for collecting unencodable chars */
2317 const Py_UNICODE *collstart = p;
2318 const Py_UNICODE *collend = p;
2319 /* find all unecodable characters */
2320 while ((collend < endp) && ((*collend)>=limit))
2321 ++collend;
2322 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2323 if (known_errorHandler==-1) {
2324 if ((errors==NULL) || (!strcmp(errors, "strict")))
2325 known_errorHandler = 1;
2326 else if (!strcmp(errors, "replace"))
2327 known_errorHandler = 2;
2328 else if (!strcmp(errors, "ignore"))
2329 known_errorHandler = 3;
2330 else if (!strcmp(errors, "xmlcharrefreplace"))
2331 known_errorHandler = 4;
2332 else
2333 known_errorHandler = 0;
2334 }
2335 switch (known_errorHandler) {
2336 case 1: /* strict */
2337 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2338 goto onError;
2339 case 2: /* replace */
2340 while (collstart++<collend)
2341 *str++ = '?'; /* fall through */
2342 case 3: /* ignore */
2343 p = collend;
2344 break;
2345 case 4: /* xmlcharrefreplace */
2346 respos = str-PyString_AS_STRING(res);
2347 /* determine replacement size (temporarily (mis)uses p) */
2348 for (p = collstart, repsize = 0; p < collend; ++p) {
2349 if (*p<10)
2350 repsize += 2+1+1;
2351 else if (*p<100)
2352 repsize += 2+2+1;
2353 else if (*p<1000)
2354 repsize += 2+3+1;
2355 else if (*p<10000)
2356 repsize += 2+4+1;
2357 else if (*p<100000)
2358 repsize += 2+5+1;
2359 else if (*p<1000000)
2360 repsize += 2+6+1;
2361 else
2362 repsize += 2+7+1;
2363 }
2364 requiredsize = respos+repsize+(endp-collend);
2365 if (requiredsize > ressize) {
2366 if (requiredsize<2*ressize)
2367 requiredsize = 2*ressize;
2368 if (_PyString_Resize(&res, requiredsize))
2369 goto onError;
2370 str = PyString_AS_STRING(res) + respos;
2371 ressize = requiredsize;
2372 }
2373 /* generate replacement (temporarily (mis)uses p) */
2374 for (p = collstart; p < collend; ++p) {
2375 str += sprintf(str, "&#%d;", (int)*p);
2376 }
2377 p = collend;
2378 break;
2379 default:
2380 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2381 encoding, reason, startp, size, &exc,
2382 collstart-startp, collend-startp, &newpos);
2383 if (repunicode == NULL)
2384 goto onError;
2385 /* need more space? (at least enough for what we
2386 have+the replacement+the rest of the string, so
2387 we won't have to check space for encodable characters) */
2388 respos = str-PyString_AS_STRING(res);
2389 repsize = PyUnicode_GET_SIZE(repunicode);
2390 requiredsize = respos+repsize+(endp-collend);
2391 if (requiredsize > ressize) {
2392 if (requiredsize<2*ressize)
2393 requiredsize = 2*ressize;
2394 if (_PyString_Resize(&res, requiredsize)) {
2395 Py_DECREF(repunicode);
2396 goto onError;
2397 }
2398 str = PyString_AS_STRING(res) + respos;
2399 ressize = requiredsize;
2400 }
2401 /* check if there is anything unencodable in the replacement
2402 and copy it to the output */
2403 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2404 c = *uni2;
2405 if (c >= limit) {
2406 raise_encode_exception(&exc, encoding, startp, size,
2407 unicodepos, unicodepos+1, reason);
2408 Py_DECREF(repunicode);
2409 goto onError;
2410 }
2411 *str = (char)c;
2412 }
2413 p = startp + newpos;
2414 Py_DECREF(repunicode);
2415 }
2416 }
2417 }
2418 /* Resize if we allocated to much */
2419 respos = str-PyString_AS_STRING(res);
2420 if (respos<ressize)
2421 /* If this falls res will be NULL */
2422 _PyString_Resize(&res, respos);
2423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
2425 return res;
2426
2427 onError:
2428 Py_XDECREF(res);
2429 Py_XDECREF(errorHandler);
2430 Py_XDECREF(exc);
2431 return NULL;
2432}
2433
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2435 int size,
2436 const char *errors)
2437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002438 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439}
2440
2441PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2442{
2443 if (!PyUnicode_Check(unicode)) {
2444 PyErr_BadArgument();
2445 return NULL;
2446 }
2447 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2448 PyUnicode_GET_SIZE(unicode),
2449 NULL);
2450}
2451
2452/* --- 7-bit ASCII Codec -------------------------------------------------- */
2453
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454PyObject *PyUnicode_DecodeASCII(const char *s,
2455 int size,
2456 const char *errors)
2457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002458 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 PyUnicodeObject *v;
2460 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 int startinpos;
2462 int endinpos;
2463 int outpos;
2464 const char *e;
2465 PyObject *errorHandler = NULL;
2466 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467
2468 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002469 if (size == 1 && *(unsigned char*)s < 128) {
2470 Py_UNICODE r = *(unsigned char*)s;
2471 return PyUnicode_FromUnicode(&r, 1);
2472 }
2473
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 v = _PyUnicode_New(size);
2475 if (v == NULL)
2476 goto onError;
2477 if (size == 0)
2478 return (PyObject *)v;
2479 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 e = s + size;
2481 while (s < e) {
2482 register unsigned char c = (unsigned char)*s;
2483 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 ++s;
2486 }
2487 else {
2488 startinpos = s-starts;
2489 endinpos = startinpos + 1;
2490 outpos = p-PyUnicode_AS_UNICODE(v);
2491 if (unicode_decode_call_errorhandler(
2492 errors, &errorHandler,
2493 "ascii", "ordinal not in range(128)",
2494 starts, size, &startinpos, &endinpos, &exc, &s,
2495 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002499 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002500 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002501 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 Py_XDECREF(errorHandler);
2503 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 return (PyObject *)v;
2505
2506 onError:
2507 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002508 Py_XDECREF(errorHandler);
2509 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return NULL;
2511}
2512
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2514 int size,
2515 const char *errors)
2516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002517 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518}
2519
2520PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2521{
2522 if (!PyUnicode_Check(unicode)) {
2523 PyErr_BadArgument();
2524 return NULL;
2525 }
2526 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2527 PyUnicode_GET_SIZE(unicode),
2528 NULL);
2529}
2530
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002532
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002533/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002534
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002535PyObject *PyUnicode_DecodeMBCS(const char *s,
2536 int size,
2537 const char *errors)
2538{
2539 PyUnicodeObject *v;
2540 Py_UNICODE *p;
2541
2542 /* First get the size of the result */
2543 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002544 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002545 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2546
2547 v = _PyUnicode_New(usize);
2548 if (v == NULL)
2549 return NULL;
2550 if (usize == 0)
2551 return (PyObject *)v;
2552 p = PyUnicode_AS_UNICODE(v);
2553 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2554 Py_DECREF(v);
2555 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2556 }
2557
2558 return (PyObject *)v;
2559}
2560
2561PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2562 int size,
2563 const char *errors)
2564{
2565 PyObject *repr;
2566 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002567 DWORD mbcssize;
2568
2569 /* If there are no characters, bail now! */
2570 if (size==0)
2571 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002572
2573 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002574 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002575 if (mbcssize==0)
2576 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2577
2578 repr = PyString_FromStringAndSize(NULL, mbcssize);
2579 if (repr == NULL)
2580 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002581 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002582 return repr;
2583
2584 /* Do the conversion */
2585 s = PyString_AS_STRING(repr);
2586 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2587 Py_DECREF(repr);
2588 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2589 }
2590 return repr;
2591}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002592
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002593#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002594
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595/* --- Character Mapping Codec -------------------------------------------- */
2596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597PyObject *PyUnicode_DecodeCharmap(const char *s,
2598 int size,
2599 PyObject *mapping,
2600 const char *errors)
2601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002602 const char *starts = s;
2603 int startinpos;
2604 int endinpos;
2605 int outpos;
2606 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 PyUnicodeObject *v;
2608 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002609 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 PyObject *errorHandler = NULL;
2611 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612
2613 /* Default to Latin-1 */
2614 if (mapping == NULL)
2615 return PyUnicode_DecodeLatin1(s, size, errors);
2616
2617 v = _PyUnicode_New(size);
2618 if (v == NULL)
2619 goto onError;
2620 if (size == 0)
2621 return (PyObject *)v;
2622 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 e = s + size;
2624 while (s < e) {
2625 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 PyObject *w, *x;
2627
2628 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2629 w = PyInt_FromLong((long)ch);
2630 if (w == NULL)
2631 goto onError;
2632 x = PyObject_GetItem(mapping, w);
2633 Py_DECREF(w);
2634 if (x == NULL) {
2635 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002636 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002638 x = Py_None;
2639 Py_INCREF(x);
2640 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002641 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 }
2643
2644 /* Apply mapping */
2645 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002646 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 if (value < 0 || value > 65535) {
2648 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002649 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 Py_DECREF(x);
2651 goto onError;
2652 }
2653 *p++ = (Py_UNICODE)value;
2654 }
2655 else if (x == Py_None) {
2656 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 outpos = p-PyUnicode_AS_UNICODE(v);
2658 startinpos = s-starts;
2659 endinpos = startinpos+1;
2660 if (unicode_decode_call_errorhandler(
2661 errors, &errorHandler,
2662 "charmap", "character maps to <undefined>",
2663 starts, size, &startinpos, &endinpos, &exc, &s,
2664 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 Py_DECREF(x);
2666 goto onError;
2667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 }
2670 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002671 int targetsize = PyUnicode_GET_SIZE(x);
2672
2673 if (targetsize == 1)
2674 /* 1-1 mapping */
2675 *p++ = *PyUnicode_AS_UNICODE(x);
2676
2677 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002679 if (targetsize > extrachars) {
2680 /* resize first */
2681 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2682 int needed = (targetsize - extrachars) + \
2683 (targetsize << 2);
2684 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002685 if (_PyUnicode_Resize(&v,
2686 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002687 Py_DECREF(x);
2688 goto onError;
2689 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002690 p = PyUnicode_AS_UNICODE(v) + oldpos;
2691 }
2692 Py_UNICODE_COPY(p,
2693 PyUnicode_AS_UNICODE(x),
2694 targetsize);
2695 p += targetsize;
2696 extrachars -= targetsize;
2697 }
2698 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
2700 else {
2701 /* wrong return value */
2702 PyErr_SetString(PyExc_TypeError,
2703 "character mapping must return integer, None or unicode");
2704 Py_DECREF(x);
2705 goto onError;
2706 }
2707 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 }
2710 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002711 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_XDECREF(errorHandler);
2714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 return (PyObject *)v;
2716
2717 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 Py_XDECREF(errorHandler);
2719 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 Py_XDECREF(v);
2721 return NULL;
2722}
2723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724/* Lookup the character ch in the mapping. If the character
2725 can't be found, Py_None is returned (or NULL, if another
2726 error occured). */
2727static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 PyObject *w = PyInt_FromLong((long)c);
2730 PyObject *x;
2731
2732 if (w == NULL)
2733 return NULL;
2734 x = PyObject_GetItem(mapping, w);
2735 Py_DECREF(w);
2736 if (x == NULL) {
2737 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2738 /* No mapping found means: mapping is undefined. */
2739 PyErr_Clear();
2740 x = Py_None;
2741 Py_INCREF(x);
2742 return x;
2743 } else
2744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 else if (PyInt_Check(x)) {
2747 long value = PyInt_AS_LONG(x);
2748 if (value < 0 || value > 255) {
2749 PyErr_SetString(PyExc_TypeError,
2750 "character mapping must be in range(256)");
2751 Py_DECREF(x);
2752 return NULL;
2753 }
2754 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 else if (PyString_Check(x))
2757 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 /* wrong return value */
2760 PyErr_SetString(PyExc_TypeError,
2761 "character mapping must return integer, None or str");
2762 Py_DECREF(x);
2763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
2765}
2766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767/* lookup the character, put the result in the output string and adjust
2768 various state variables. Reallocate the output string if not enough
2769 space is available. Return a new reference to the object that
2770 was put in the output buffer, or Py_None, if the mapping was undefined
2771 (in which case no character was written) or NULL, if a
2772 reallocation error ocurred. The called must decref the result */
2773static
2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2775 PyObject **outobj, int *outpos)
2776{
2777 PyObject *rep = charmapencode_lookup(c, mapping);
2778
2779 if (rep==NULL)
2780 return NULL;
2781 else if (rep==Py_None)
2782 return rep;
2783 else {
2784 char *outstart = PyString_AS_STRING(*outobj);
2785 int outsize = PyString_GET_SIZE(*outobj);
2786 if (PyInt_Check(rep)) {
2787 int requiredsize = *outpos+1;
2788 if (outsize<requiredsize) {
2789 /* exponentially overallocate to minimize reallocations */
2790 if (requiredsize < 2*outsize)
2791 requiredsize = 2*outsize;
2792 if (_PyString_Resize(outobj, requiredsize)) {
2793 Py_DECREF(rep);
2794 return NULL;
2795 }
2796 outstart = PyString_AS_STRING(*outobj);
2797 }
2798 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2799 }
2800 else {
2801 const char *repchars = PyString_AS_STRING(rep);
2802 int repsize = PyString_GET_SIZE(rep);
2803 int requiredsize = *outpos+repsize;
2804 if (outsize<requiredsize) {
2805 /* exponentially overallocate to minimize reallocations */
2806 if (requiredsize < 2*outsize)
2807 requiredsize = 2*outsize;
2808 if (_PyString_Resize(outobj, requiredsize)) {
2809 Py_DECREF(rep);
2810 return NULL;
2811 }
2812 outstart = PyString_AS_STRING(*outobj);
2813 }
2814 memcpy(outstart + *outpos, repchars, repsize);
2815 *outpos += repsize;
2816 }
2817 }
2818 return rep;
2819}
2820
2821/* handle an error in PyUnicode_EncodeCharmap
2822 Return 0 on success, -1 on error */
2823static
2824int charmap_encoding_error(
2825 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2826 PyObject **exceptionObject,
2827 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2828 PyObject **res, int *respos)
2829{
2830 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2831 int repsize;
2832 int newpos;
2833 Py_UNICODE *uni2;
2834 /* startpos for collecting unencodable chars */
2835 int collstartpos = *inpos;
2836 int collendpos = *inpos+1;
2837 int collpos;
2838 char *encoding = "charmap";
2839 char *reason = "character maps to <undefined>";
2840
2841 PyObject *x;
2842 /* find all unencodable characters */
2843 while (collendpos < size) {
2844 x = charmapencode_lookup(p[collendpos], mapping);
2845 if (x==NULL)
2846 return -1;
2847 else if (x!=Py_None) {
2848 Py_DECREF(x);
2849 break;
2850 }
2851 Py_DECREF(x);
2852 ++collendpos;
2853 }
2854 /* cache callback name lookup
2855 * (if not done yet, i.e. it's the first error) */
2856 if (*known_errorHandler==-1) {
2857 if ((errors==NULL) || (!strcmp(errors, "strict")))
2858 *known_errorHandler = 1;
2859 else if (!strcmp(errors, "replace"))
2860 *known_errorHandler = 2;
2861 else if (!strcmp(errors, "ignore"))
2862 *known_errorHandler = 3;
2863 else if (!strcmp(errors, "xmlcharrefreplace"))
2864 *known_errorHandler = 4;
2865 else
2866 *known_errorHandler = 0;
2867 }
2868 switch (*known_errorHandler) {
2869 case 1: /* strict */
2870 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2871 return -1;
2872 case 2: /* replace */
2873 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2874 x = charmapencode_output('?', mapping, res, respos);
2875 if (x==NULL) {
2876 return -1;
2877 }
2878 else if (x==Py_None) {
2879 Py_DECREF(x);
2880 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2881 return -1;
2882 }
2883 Py_DECREF(x);
2884 }
2885 /* fall through */
2886 case 3: /* ignore */
2887 *inpos = collendpos;
2888 break;
2889 case 4: /* xmlcharrefreplace */
2890 /* generate replacement (temporarily (mis)uses p) */
2891 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2892 char buffer[2+29+1+1];
2893 char *cp;
2894 sprintf(buffer, "&#%d;", (int)p[collpos]);
2895 for (cp = buffer; *cp; ++cp) {
2896 x = charmapencode_output(*cp, mapping, res, respos);
2897 if (x==NULL)
2898 return -1;
2899 else if (x==Py_None) {
2900 Py_DECREF(x);
2901 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2902 return -1;
2903 }
2904 Py_DECREF(x);
2905 }
2906 }
2907 *inpos = collendpos;
2908 break;
2909 default:
2910 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2911 encoding, reason, p, size, exceptionObject,
2912 collstartpos, collendpos, &newpos);
2913 if (repunicode == NULL)
2914 return -1;
2915 /* generate replacement */
2916 repsize = PyUnicode_GET_SIZE(repunicode);
2917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2918 x = charmapencode_output(*uni2, mapping, res, respos);
2919 if (x==NULL) {
2920 Py_DECREF(repunicode);
2921 return -1;
2922 }
2923 else if (x==Py_None) {
2924 Py_DECREF(repunicode);
2925 Py_DECREF(x);
2926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2927 return -1;
2928 }
2929 Py_DECREF(x);
2930 }
2931 *inpos = newpos;
2932 Py_DECREF(repunicode);
2933 }
2934 return 0;
2935}
2936
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2938 int size,
2939 PyObject *mapping,
2940 const char *errors)
2941{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 /* output object */
2943 PyObject *res = NULL;
2944 /* current input position */
2945 int inpos = 0;
2946 /* current output position */
2947 int respos = 0;
2948 PyObject *errorHandler = NULL;
2949 PyObject *exc = NULL;
2950 /* the following variable is used for caching string comparisons
2951 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2952 * 3=ignore, 4=xmlcharrefreplace */
2953 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
2955 /* Default to Latin-1 */
2956 if (mapping == NULL)
2957 return PyUnicode_EncodeLatin1(p, size, errors);
2958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 /* allocate enough for a simple encoding without
2960 replacements, if we need more, we'll resize */
2961 res = PyString_FromStringAndSize(NULL, size);
2962 if (res == NULL)
2963 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002964 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967 while (inpos<size) {
2968 /* try to encode it */
2969 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2970 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 if (x==Py_None) { /* unencodable character */
2973 if (charmap_encoding_error(p, size, &inpos, mapping,
2974 &exc,
2975 &known_errorHandler, errorHandler, errors,
2976 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 else
2980 /* done with this character => adjust input position */
2981 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 Py_DECREF(x);
2983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 /* Resize if we allocated to much */
2986 if (respos<PyString_GET_SIZE(res)) {
2987 if (_PyString_Resize(&res, respos))
2988 goto onError;
2989 }
2990 Py_XDECREF(exc);
2991 Py_XDECREF(errorHandler);
2992 return res;
2993
2994 onError:
2995 Py_XDECREF(res);
2996 Py_XDECREF(exc);
2997 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 return NULL;
2999}
3000
3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3002 PyObject *mapping)
3003{
3004 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3005 PyErr_BadArgument();
3006 return NULL;
3007 }
3008 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3009 PyUnicode_GET_SIZE(unicode),
3010 mapping,
3011 NULL);
3012}
3013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014/* create or adjust a UnicodeTranslateError */
3015static void make_translate_exception(PyObject **exceptionObject,
3016 const Py_UNICODE *unicode, int size,
3017 int startpos, int endpos,
3018 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 if (*exceptionObject == NULL) {
3021 *exceptionObject = PyUnicodeTranslateError_Create(
3022 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 }
3024 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3026 goto onError;
3027 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3028 goto onError;
3029 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3030 goto onError;
3031 return;
3032 onError:
3033 Py_DECREF(*exceptionObject);
3034 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 }
3036}
3037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038/* raises a UnicodeTranslateError */
3039static void raise_translate_exception(PyObject **exceptionObject,
3040 const Py_UNICODE *unicode, int size,
3041 int startpos, int endpos,
3042 const char *reason)
3043{
3044 make_translate_exception(exceptionObject,
3045 unicode, size, startpos, endpos, reason);
3046 if (*exceptionObject != NULL)
3047 PyCodec_StrictErrors(*exceptionObject);
3048}
3049
3050/* error handling callback helper:
3051 build arguments, call the callback and check the arguments,
3052 put the result into newpos and return the replacement string, which
3053 has to be freed by the caller */
3054static PyObject *unicode_translate_call_errorhandler(const char *errors,
3055 PyObject **errorHandler,
3056 const char *reason,
3057 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3058 int startpos, int endpos,
3059 int *newpos)
3060{
3061 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3062
3063 PyObject *restuple;
3064 PyObject *resunicode;
3065
3066 if (*errorHandler == NULL) {
3067 *errorHandler = PyCodec_LookupError(errors);
3068 if (*errorHandler == NULL)
3069 return NULL;
3070 }
3071
3072 make_translate_exception(exceptionObject,
3073 unicode, size, startpos, endpos, reason);
3074 if (*exceptionObject == NULL)
3075 return NULL;
3076
3077 restuple = PyObject_CallFunctionObjArgs(
3078 *errorHandler, *exceptionObject, NULL);
3079 if (restuple == NULL)
3080 return NULL;
3081 if (!PyTuple_Check(restuple)) {
3082 PyErr_Format(PyExc_TypeError, &argparse[4]);
3083 Py_DECREF(restuple);
3084 return NULL;
3085 }
3086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3087 &resunicode, newpos)) {
3088 Py_DECREF(restuple);
3089 return NULL;
3090 }
3091 if (*newpos<0)
3092 *newpos = 0;
3093 else if (*newpos>size)
3094 *newpos = size;
3095 Py_INCREF(resunicode);
3096 Py_DECREF(restuple);
3097 return resunicode;
3098}
3099
3100/* Lookup the character ch in the mapping and put the result in result,
3101 which must be decrefed by the caller.
3102 Return 0 on success, -1 on error */
3103static
3104int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3105{
3106 PyObject *w = PyInt_FromLong((long)c);
3107 PyObject *x;
3108
3109 if (w == NULL)
3110 return -1;
3111 x = PyObject_GetItem(mapping, w);
3112 Py_DECREF(w);
3113 if (x == NULL) {
3114 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3115 /* No mapping found means: use 1:1 mapping. */
3116 PyErr_Clear();
3117 *result = NULL;
3118 return 0;
3119 } else
3120 return -1;
3121 }
3122 else if (x == Py_None) {
3123 *result = x;
3124 return 0;
3125 }
3126 else if (PyInt_Check(x)) {
3127 long value = PyInt_AS_LONG(x);
3128 long max = PyUnicode_GetMax();
3129 if (value < 0 || value > max) {
3130 PyErr_Format(PyExc_TypeError,
3131 "character mapping must be in range(0x%lx)", max+1);
3132 Py_DECREF(x);
3133 return -1;
3134 }
3135 *result = x;
3136 return 0;
3137 }
3138 else if (PyUnicode_Check(x)) {
3139 *result = x;
3140 return 0;
3141 }
3142 else {
3143 /* wrong return value */
3144 PyErr_SetString(PyExc_TypeError,
3145 "character mapping must return integer, None or unicode");
3146 return -1;
3147 }
3148}
3149/* ensure that *outobj is at least requiredsize characters long,
3150if not reallocate and adjust various state variables.
3151Return 0 on success, -1 on error */
3152static
3153int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3154 int requiredsize)
3155{
3156 if (requiredsize > *outsize) {
3157 /* remember old output position */
3158 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3159 /* exponentially overallocate to minimize reallocations */
3160 if (requiredsize < 2 * *outsize)
3161 requiredsize = 2 * *outsize;
3162 if (_PyUnicode_Resize(outobj, requiredsize))
3163 return -1;
3164 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3165 *outsize = requiredsize;
3166 }
3167 return 0;
3168}
3169/* lookup the character, put the result in the output string and adjust
3170 various state variables. Return a new reference to the object that
3171 was put in the output buffer in *result, or Py_None, if the mapping was
3172 undefined (in which case no character was written).
3173 The called must decref result.
3174 Return 0 on success, -1 on error. */
3175static
3176int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3177 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3178{
3179 if (charmaptranslate_lookup(c, mapping, res))
3180 return -1;
3181 if (*res==NULL) {
3182 /* not found => default to 1:1 mapping */
3183 *(*outp)++ = (Py_UNICODE)c;
3184 }
3185 else if (*res==Py_None)
3186 ;
3187 else if (PyInt_Check(*res)) {
3188 /* no overflow check, because we know that the space is enough */
3189 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3190 }
3191 else if (PyUnicode_Check(*res)) {
3192 int repsize = PyUnicode_GET_SIZE(*res);
3193 if (repsize==1) {
3194 /* no overflow check, because we know that the space is enough */
3195 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3196 }
3197 else if (repsize!=0) {
3198 /* more than one character */
3199 int requiredsize = *outsize + repsize - 1;
3200 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3201 return -1;
3202 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3203 *outp += repsize;
3204 }
3205 }
3206 else
3207 return -1;
3208 return 0;
3209}
3210
3211PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 int size,
3213 PyObject *mapping,
3214 const char *errors)
3215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 /* output object */
3217 PyObject *res = NULL;
3218 /* pointers to the beginning and end+1 of input */
3219 const Py_UNICODE *startp = p;
3220 const Py_UNICODE *endp = p + size;
3221 /* pointer into the output */
3222 Py_UNICODE *str;
3223 /* current output position */
3224 int respos = 0;
3225 int ressize;
3226 char *reason = "character maps to <undefined>";
3227 PyObject *errorHandler = NULL;
3228 PyObject *exc = NULL;
3229 /* the following variable is used for caching string comparisons
3230 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3231 * 3=ignore, 4=xmlcharrefreplace */
3232 int known_errorHandler = -1;
3233
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 if (mapping == NULL) {
3235 PyErr_BadArgument();
3236 return NULL;
3237 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238
3239 /* allocate enough for a simple 1:1 translation without
3240 replacements, if we need more, we'll resize */
3241 res = PyUnicode_FromUnicode(NULL, size);
3242 if (res == NULL)
3243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 return res;
3246 str = PyUnicode_AS_UNICODE(res);
3247 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 while (p<endp) {
3250 /* try to encode it */
3251 PyObject *x = NULL;
3252 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3253 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 goto onError;
3255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 if (x!=Py_None) /* it worked => adjust input pointer */
3257 ++p;
3258 else { /* untranslatable character */
3259 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3260 int repsize;
3261 int newpos;
3262 Py_UNICODE *uni2;
3263 /* startpos for collecting untranslatable chars */
3264 const Py_UNICODE *collstart = p;
3265 const Py_UNICODE *collend = p+1;
3266 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 Py_XDECREF(x);
3269 /* find all untranslatable characters */
3270 while (collend < endp) {
3271 if (charmaptranslate_lookup(*collend, mapping, &x))
3272 goto onError;
3273 Py_XDECREF(x);
3274 if (x!=Py_None)
3275 break;
3276 ++collend;
3277 }
3278 /* cache callback name lookup
3279 * (if not done yet, i.e. it's the first error) */
3280 if (known_errorHandler==-1) {
3281 if ((errors==NULL) || (!strcmp(errors, "strict")))
3282 known_errorHandler = 1;
3283 else if (!strcmp(errors, "replace"))
3284 known_errorHandler = 2;
3285 else if (!strcmp(errors, "ignore"))
3286 known_errorHandler = 3;
3287 else if (!strcmp(errors, "xmlcharrefreplace"))
3288 known_errorHandler = 4;
3289 else
3290 known_errorHandler = 0;
3291 }
3292 switch (known_errorHandler) {
3293 case 1: /* strict */
3294 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3295 goto onError;
3296 case 2: /* replace */
3297 /* No need to check for space, this is a 1:1 replacement */
3298 for (coll = collstart; coll<collend; ++coll)
3299 *str++ = '?';
3300 /* fall through */
3301 case 3: /* ignore */
3302 p = collend;
3303 break;
3304 case 4: /* xmlcharrefreplace */
3305 /* generate replacement (temporarily (mis)uses p) */
3306 for (p = collstart; p < collend; ++p) {
3307 char buffer[2+29+1+1];
3308 char *cp;
3309 sprintf(buffer, "&#%d;", (int)*p);
3310 if (charmaptranslate_makespace(&res, &str, &ressize,
3311 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3312 goto onError;
3313 for (cp = buffer; *cp; ++cp)
3314 *str++ = *cp;
3315 }
3316 p = collend;
3317 break;
3318 default:
3319 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3320 reason, startp, size, &exc,
3321 collstart-startp, collend-startp, &newpos);
3322 if (repunicode == NULL)
3323 goto onError;
3324 /* generate replacement */
3325 repsize = PyUnicode_GET_SIZE(repunicode);
3326 if (charmaptranslate_makespace(&res, &str, &ressize,
3327 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3328 Py_DECREF(repunicode);
3329 goto onError;
3330 }
3331 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3332 *str++ = *uni2;
3333 p = startp + newpos;
3334 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
3336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 /* Resize if we allocated to much */
3339 respos = str-PyUnicode_AS_UNICODE(res);
3340 if (respos<ressize) {
3341 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003342 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 }
3344 Py_XDECREF(exc);
3345 Py_XDECREF(errorHandler);
3346 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 onError:
3349 Py_XDECREF(res);
3350 Py_XDECREF(exc);
3351 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return NULL;
3353}
3354
3355PyObject *PyUnicode_Translate(PyObject *str,
3356 PyObject *mapping,
3357 const char *errors)
3358{
3359 PyObject *result;
3360
3361 str = PyUnicode_FromObject(str);
3362 if (str == NULL)
3363 goto onError;
3364 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3365 PyUnicode_GET_SIZE(str),
3366 mapping,
3367 errors);
3368 Py_DECREF(str);
3369 return result;
3370
3371 onError:
3372 Py_XDECREF(str);
3373 return NULL;
3374}
3375
Guido van Rossum9e896b32000-04-05 20:11:21 +00003376/* --- Decimal Encoder ---------------------------------------------------- */
3377
3378int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3379 int length,
3380 char *output,
3381 const char *errors)
3382{
3383 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 PyObject *errorHandler = NULL;
3385 PyObject *exc = NULL;
3386 const char *encoding = "decimal";
3387 const char *reason = "invalid decimal Unicode string";
3388 /* the following variable is used for caching string comparisons
3389 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3390 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003391
3392 if (output == NULL) {
3393 PyErr_BadArgument();
3394 return -1;
3395 }
3396
3397 p = s;
3398 end = s + length;
3399 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003401 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 PyObject *repunicode;
3403 int repsize;
3404 int newpos;
3405 Py_UNICODE *uni2;
3406 Py_UNICODE *collstart;
3407 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003408
3409 if (Py_UNICODE_ISSPACE(ch)) {
3410 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003412 continue;
3413 }
3414 decimal = Py_UNICODE_TODECIMAL(ch);
3415 if (decimal >= 0) {
3416 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003418 continue;
3419 }
Guido van Rossumba477042000-04-06 18:18:10 +00003420 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003421 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003423 continue;
3424 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 /* All other characters are considered unencodable */
3426 collstart = p;
3427 collend = p+1;
3428 while (collend < end) {
3429 if ((0 < *collend && *collend < 256) ||
3430 !Py_UNICODE_ISSPACE(*collend) ||
3431 Py_UNICODE_TODECIMAL(*collend))
3432 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 /* cache callback name lookup
3435 * (if not done yet, i.e. it's the first error) */
3436 if (known_errorHandler==-1) {
3437 if ((errors==NULL) || (!strcmp(errors, "strict")))
3438 known_errorHandler = 1;
3439 else if (!strcmp(errors, "replace"))
3440 known_errorHandler = 2;
3441 else if (!strcmp(errors, "ignore"))
3442 known_errorHandler = 3;
3443 else if (!strcmp(errors, "xmlcharrefreplace"))
3444 known_errorHandler = 4;
3445 else
3446 known_errorHandler = 0;
3447 }
3448 switch (known_errorHandler) {
3449 case 1: /* strict */
3450 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3451 goto onError;
3452 case 2: /* replace */
3453 for (p = collstart; p < collend; ++p)
3454 *output++ = '?';
3455 /* fall through */
3456 case 3: /* ignore */
3457 p = collend;
3458 break;
3459 case 4: /* xmlcharrefreplace */
3460 /* generate replacement (temporarily (mis)uses p) */
3461 for (p = collstart; p < collend; ++p)
3462 output += sprintf(output, "&#%d;", (int)*p);
3463 p = collend;
3464 break;
3465 default:
3466 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3467 encoding, reason, s, length, &exc,
3468 collstart-s, collend-s, &newpos);
3469 if (repunicode == NULL)
3470 goto onError;
3471 /* generate replacement */
3472 repsize = PyUnicode_GET_SIZE(repunicode);
3473 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3474 Py_UNICODE ch = *uni2;
3475 if (Py_UNICODE_ISSPACE(ch))
3476 *output++ = ' ';
3477 else {
3478 decimal = Py_UNICODE_TODECIMAL(ch);
3479 if (decimal >= 0)
3480 *output++ = '0' + decimal;
3481 else if (0 < ch && ch < 256)
3482 *output++ = (char)ch;
3483 else {
3484 Py_DECREF(repunicode);
3485 raise_encode_exception(&exc, encoding,
3486 s, length, collstart-s, collend-s, reason);
3487 goto onError;
3488 }
3489 }
3490 }
3491 p = s + newpos;
3492 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003493 }
3494 }
3495 /* 0-terminate the output string */
3496 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 Py_XDECREF(exc);
3498 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003499 return 0;
3500
3501 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 Py_XDECREF(exc);
3503 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003504 return -1;
3505}
3506
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507/* --- Helpers ------------------------------------------------------------ */
3508
3509static
3510int count(PyUnicodeObject *self,
3511 int start,
3512 int end,
3513 PyUnicodeObject *substring)
3514{
3515 int count = 0;
3516
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003517 if (start < 0)
3518 start += self->length;
3519 if (start < 0)
3520 start = 0;
3521 if (end > self->length)
3522 end = self->length;
3523 if (end < 0)
3524 end += self->length;
3525 if (end < 0)
3526 end = 0;
3527
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003528 if (substring->length == 0)
3529 return (end - start + 1);
3530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 end -= substring->length;
3532
3533 while (start <= end)
3534 if (Py_UNICODE_MATCH(self, start, substring)) {
3535 count++;
3536 start += substring->length;
3537 } else
3538 start++;
3539
3540 return count;
3541}
3542
3543int PyUnicode_Count(PyObject *str,
3544 PyObject *substr,
3545 int start,
3546 int end)
3547{
3548 int result;
3549
3550 str = PyUnicode_FromObject(str);
3551 if (str == NULL)
3552 return -1;
3553 substr = PyUnicode_FromObject(substr);
3554 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003555 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return -1;
3557 }
3558
3559 result = count((PyUnicodeObject *)str,
3560 start, end,
3561 (PyUnicodeObject *)substr);
3562
3563 Py_DECREF(str);
3564 Py_DECREF(substr);
3565 return result;
3566}
3567
3568static
3569int findstring(PyUnicodeObject *self,
3570 PyUnicodeObject *substring,
3571 int start,
3572 int end,
3573 int direction)
3574{
3575 if (start < 0)
3576 start += self->length;
3577 if (start < 0)
3578 start = 0;
3579
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 if (end > self->length)
3581 end = self->length;
3582 if (end < 0)
3583 end += self->length;
3584 if (end < 0)
3585 end = 0;
3586
Guido van Rossum76afbd92002-08-20 17:29:29 +00003587 if (substring->length == 0)
3588 return (direction > 0) ? start : end;
3589
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 end -= substring->length;
3591
3592 if (direction < 0) {
3593 for (; end >= start; end--)
3594 if (Py_UNICODE_MATCH(self, end, substring))
3595 return end;
3596 } else {
3597 for (; start <= end; start++)
3598 if (Py_UNICODE_MATCH(self, start, substring))
3599 return start;
3600 }
3601
3602 return -1;
3603}
3604
3605int PyUnicode_Find(PyObject *str,
3606 PyObject *substr,
3607 int start,
3608 int end,
3609 int direction)
3610{
3611 int result;
3612
3613 str = PyUnicode_FromObject(str);
3614 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003615 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 substr = PyUnicode_FromObject(substr);
3617 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003618 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003619 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621
3622 result = findstring((PyUnicodeObject *)str,
3623 (PyUnicodeObject *)substr,
3624 start, end, direction);
3625 Py_DECREF(str);
3626 Py_DECREF(substr);
3627 return result;
3628}
3629
3630static
3631int tailmatch(PyUnicodeObject *self,
3632 PyUnicodeObject *substring,
3633 int start,
3634 int end,
3635 int direction)
3636{
3637 if (start < 0)
3638 start += self->length;
3639 if (start < 0)
3640 start = 0;
3641
3642 if (substring->length == 0)
3643 return 1;
3644
3645 if (end > self->length)
3646 end = self->length;
3647 if (end < 0)
3648 end += self->length;
3649 if (end < 0)
3650 end = 0;
3651
3652 end -= substring->length;
3653 if (end < start)
3654 return 0;
3655
3656 if (direction > 0) {
3657 if (Py_UNICODE_MATCH(self, end, substring))
3658 return 1;
3659 } else {
3660 if (Py_UNICODE_MATCH(self, start, substring))
3661 return 1;
3662 }
3663
3664 return 0;
3665}
3666
3667int PyUnicode_Tailmatch(PyObject *str,
3668 PyObject *substr,
3669 int start,
3670 int end,
3671 int direction)
3672{
3673 int result;
3674
3675 str = PyUnicode_FromObject(str);
3676 if (str == NULL)
3677 return -1;
3678 substr = PyUnicode_FromObject(substr);
3679 if (substr == NULL) {
3680 Py_DECREF(substr);
3681 return -1;
3682 }
3683
3684 result = tailmatch((PyUnicodeObject *)str,
3685 (PyUnicodeObject *)substr,
3686 start, end, direction);
3687 Py_DECREF(str);
3688 Py_DECREF(substr);
3689 return result;
3690}
3691
3692static
3693const Py_UNICODE *findchar(const Py_UNICODE *s,
3694 int size,
3695 Py_UNICODE ch)
3696{
3697 /* like wcschr, but doesn't stop at NULL characters */
3698
3699 while (size-- > 0) {
3700 if (*s == ch)
3701 return s;
3702 s++;
3703 }
3704
3705 return NULL;
3706}
3707
3708/* Apply fixfct filter to the Unicode object self and return a
3709 reference to the modified object */
3710
3711static
3712PyObject *fixup(PyUnicodeObject *self,
3713 int (*fixfct)(PyUnicodeObject *s))
3714{
3715
3716 PyUnicodeObject *u;
3717
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003718 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 if (u == NULL)
3720 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003721
3722 Py_UNICODE_COPY(u->str, self->str, self->length);
3723
Tim Peters7a29bd52001-09-12 03:03:31 +00003724 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 /* fixfct should return TRUE if it modified the buffer. If
3726 FALSE, return a reference to the original buffer instead
3727 (to save space, not time) */
3728 Py_INCREF(self);
3729 Py_DECREF(u);
3730 return (PyObject*) self;
3731 }
3732 return (PyObject*) u;
3733}
3734
3735static
3736int fixupper(PyUnicodeObject *self)
3737{
3738 int len = self->length;
3739 Py_UNICODE *s = self->str;
3740 int status = 0;
3741
3742 while (len-- > 0) {
3743 register Py_UNICODE ch;
3744
3745 ch = Py_UNICODE_TOUPPER(*s);
3746 if (ch != *s) {
3747 status = 1;
3748 *s = ch;
3749 }
3750 s++;
3751 }
3752
3753 return status;
3754}
3755
3756static
3757int fixlower(PyUnicodeObject *self)
3758{
3759 int len = self->length;
3760 Py_UNICODE *s = self->str;
3761 int status = 0;
3762
3763 while (len-- > 0) {
3764 register Py_UNICODE ch;
3765
3766 ch = Py_UNICODE_TOLOWER(*s);
3767 if (ch != *s) {
3768 status = 1;
3769 *s = ch;
3770 }
3771 s++;
3772 }
3773
3774 return status;
3775}
3776
3777static
3778int fixswapcase(PyUnicodeObject *self)
3779{
3780 int len = self->length;
3781 Py_UNICODE *s = self->str;
3782 int status = 0;
3783
3784 while (len-- > 0) {
3785 if (Py_UNICODE_ISUPPER(*s)) {
3786 *s = Py_UNICODE_TOLOWER(*s);
3787 status = 1;
3788 } else if (Py_UNICODE_ISLOWER(*s)) {
3789 *s = Py_UNICODE_TOUPPER(*s);
3790 status = 1;
3791 }
3792 s++;
3793 }
3794
3795 return status;
3796}
3797
3798static
3799int fixcapitalize(PyUnicodeObject *self)
3800{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003801 int len = self->length;
3802 Py_UNICODE *s = self->str;
3803 int status = 0;
3804
3805 if (len == 0)
3806 return 0;
3807 if (Py_UNICODE_ISLOWER(*s)) {
3808 *s = Py_UNICODE_TOUPPER(*s);
3809 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003811 s++;
3812 while (--len > 0) {
3813 if (Py_UNICODE_ISUPPER(*s)) {
3814 *s = Py_UNICODE_TOLOWER(*s);
3815 status = 1;
3816 }
3817 s++;
3818 }
3819 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820}
3821
3822static
3823int fixtitle(PyUnicodeObject *self)
3824{
3825 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3826 register Py_UNICODE *e;
3827 int previous_is_cased;
3828
3829 /* Shortcut for single character strings */
3830 if (PyUnicode_GET_SIZE(self) == 1) {
3831 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3832 if (*p != ch) {
3833 *p = ch;
3834 return 1;
3835 }
3836 else
3837 return 0;
3838 }
3839
3840 e = p + PyUnicode_GET_SIZE(self);
3841 previous_is_cased = 0;
3842 for (; p < e; p++) {
3843 register const Py_UNICODE ch = *p;
3844
3845 if (previous_is_cased)
3846 *p = Py_UNICODE_TOLOWER(ch);
3847 else
3848 *p = Py_UNICODE_TOTITLE(ch);
3849
3850 if (Py_UNICODE_ISLOWER(ch) ||
3851 Py_UNICODE_ISUPPER(ch) ||
3852 Py_UNICODE_ISTITLE(ch))
3853 previous_is_cased = 1;
3854 else
3855 previous_is_cased = 0;
3856 }
3857 return 1;
3858}
3859
3860PyObject *PyUnicode_Join(PyObject *separator,
3861 PyObject *seq)
3862{
3863 Py_UNICODE *sep;
3864 int seplen;
3865 PyUnicodeObject *res = NULL;
3866 int reslen = 0;
3867 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 int sz = 100;
3869 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003870 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871
Tim Peters2cfe3682001-05-05 05:36:48 +00003872 it = PyObject_GetIter(seq);
3873 if (it == NULL)
3874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
3876 if (separator == NULL) {
3877 Py_UNICODE blank = ' ';
3878 sep = &blank;
3879 seplen = 1;
3880 }
3881 else {
3882 separator = PyUnicode_FromObject(separator);
3883 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 sep = PyUnicode_AS_UNICODE(separator);
3886 seplen = PyUnicode_GET_SIZE(separator);
3887 }
3888
3889 res = _PyUnicode_New(sz);
3890 if (res == NULL)
3891 goto onError;
3892 p = PyUnicode_AS_UNICODE(res);
3893 reslen = 0;
3894
Tim Peters2cfe3682001-05-05 05:36:48 +00003895 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003897 PyObject *item = PyIter_Next(it);
3898 if (item == NULL) {
3899 if (PyErr_Occurred())
3900 goto onError;
3901 break;
3902 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 if (!PyUnicode_Check(item)) {
3904 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003905 if (!PyString_Check(item)) {
3906 PyErr_Format(PyExc_TypeError,
3907 "sequence item %i: expected string or Unicode,"
3908 " %.80s found",
3909 i, item->ob_type->tp_name);
3910 Py_DECREF(item);
3911 goto onError;
3912 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 v = PyUnicode_FromObject(item);
3914 Py_DECREF(item);
3915 item = v;
3916 if (item == NULL)
3917 goto onError;
3918 }
3919 itemlen = PyUnicode_GET_SIZE(item);
3920 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003921 if (_PyUnicode_Resize(&res, sz*2)) {
3922 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 sz *= 2;
3926 p = PyUnicode_AS_UNICODE(res) + reslen;
3927 }
3928 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003929 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 p += seplen;
3931 reslen += seplen;
3932 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003933 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 p += itemlen;
3935 reslen += itemlen;
3936 Py_DECREF(item);
3937 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003938 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 goto onError;
3940
3941 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003942 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 return (PyObject *)res;
3944
3945 onError:
3946 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003947 Py_XDECREF(res);
3948 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 return NULL;
3950}
3951
3952static
3953PyUnicodeObject *pad(PyUnicodeObject *self,
3954 int left,
3955 int right,
3956 Py_UNICODE fill)
3957{
3958 PyUnicodeObject *u;
3959
3960 if (left < 0)
3961 left = 0;
3962 if (right < 0)
3963 right = 0;
3964
Tim Peters7a29bd52001-09-12 03:03:31 +00003965 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 Py_INCREF(self);
3967 return self;
3968 }
3969
3970 u = _PyUnicode_New(left + self->length + right);
3971 if (u) {
3972 if (left)
3973 Py_UNICODE_FILL(u->str, fill, left);
3974 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3975 if (right)
3976 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3977 }
3978
3979 return u;
3980}
3981
/* Append the slice data[left:right] as a new Unicode object to list.

   The macro relies on names provided by the expanding function: a
   PyObject *str scratch variable, the PyObject *list being built and
   an onError label that is jumped to if the slice cannot be created or
   appended.  It is wrapped in do { } while (0) so that it expands to a
   single statement and can be used safely in unbraced if/else bodies;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
3992
3993static
3994PyObject *split_whitespace(PyUnicodeObject *self,
3995 PyObject *list,
3996 int maxcount)
3997{
3998 register int i;
3999 register int j;
4000 int len = self->length;
4001 PyObject *str;
4002
4003 for (i = j = 0; i < len; ) {
4004 /* find a token */
4005 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4006 i++;
4007 j = i;
4008 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4009 i++;
4010 if (j < i) {
4011 if (maxcount-- <= 0)
4012 break;
4013 SPLIT_APPEND(self->str, j, i);
4014 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4015 i++;
4016 j = i;
4017 }
4018 }
4019 if (j < len) {
4020 SPLIT_APPEND(self->str, j, len);
4021 }
4022 return list;
4023
4024 onError:
4025 Py_DECREF(list);
4026 return NULL;
4027}
4028
4029PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004030 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031{
4032 register int i;
4033 register int j;
4034 int len;
4035 PyObject *list;
4036 PyObject *str;
4037 Py_UNICODE *data;
4038
4039 string = PyUnicode_FromObject(string);
4040 if (string == NULL)
4041 return NULL;
4042 data = PyUnicode_AS_UNICODE(string);
4043 len = PyUnicode_GET_SIZE(string);
4044
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 list = PyList_New(0);
4046 if (!list)
4047 goto onError;
4048
4049 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004050 int eol;
4051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 /* Find a line and append it */
4053 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4054 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055
4056 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004057 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 if (i < len) {
4059 if (data[i] == '\r' && i + 1 < len &&
4060 data[i+1] == '\n')
4061 i += 2;
4062 else
4063 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004064 if (keepends)
4065 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 }
Guido van Rossum86662912000-04-11 15:38:46 +00004067 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 j = i;
4069 }
4070 if (j < len) {
4071 SPLIT_APPEND(data, j, len);
4072 }
4073
4074 Py_DECREF(string);
4075 return list;
4076
4077 onError:
4078 Py_DECREF(list);
4079 Py_DECREF(string);
4080 return NULL;
4081}
4082
4083static
4084PyObject *split_char(PyUnicodeObject *self,
4085 PyObject *list,
4086 Py_UNICODE ch,
4087 int maxcount)
4088{
4089 register int i;
4090 register int j;
4091 int len = self->length;
4092 PyObject *str;
4093
4094 for (i = j = 0; i < len; ) {
4095 if (self->str[i] == ch) {
4096 if (maxcount-- <= 0)
4097 break;
4098 SPLIT_APPEND(self->str, j, i);
4099 i = j = i + 1;
4100 } else
4101 i++;
4102 }
4103 if (j <= len) {
4104 SPLIT_APPEND(self->str, j, len);
4105 }
4106 return list;
4107
4108 onError:
4109 Py_DECREF(list);
4110 return NULL;
4111}
4112
4113static
4114PyObject *split_substring(PyUnicodeObject *self,
4115 PyObject *list,
4116 PyUnicodeObject *substring,
4117 int maxcount)
4118{
4119 register int i;
4120 register int j;
4121 int len = self->length;
4122 int sublen = substring->length;
4123 PyObject *str;
4124
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004125 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 if (Py_UNICODE_MATCH(self, i, substring)) {
4127 if (maxcount-- <= 0)
4128 break;
4129 SPLIT_APPEND(self->str, j, i);
4130 i = j = i + sublen;
4131 } else
4132 i++;
4133 }
4134 if (j <= len) {
4135 SPLIT_APPEND(self->str, j, len);
4136 }
4137 return list;
4138
4139 onError:
4140 Py_DECREF(list);
4141 return NULL;
4142}
4143
4144#undef SPLIT_APPEND
4145
4146static
4147PyObject *split(PyUnicodeObject *self,
4148 PyUnicodeObject *substring,
4149 int maxcount)
4150{
4151 PyObject *list;
4152
4153 if (maxcount < 0)
4154 maxcount = INT_MAX;
4155
4156 list = PyList_New(0);
4157 if (!list)
4158 return NULL;
4159
4160 if (substring == NULL)
4161 return split_whitespace(self,list,maxcount);
4162
4163 else if (substring->length == 1)
4164 return split_char(self,list,substring->str[0],maxcount);
4165
4166 else if (substring->length == 0) {
4167 Py_DECREF(list);
4168 PyErr_SetString(PyExc_ValueError, "empty separator");
4169 return NULL;
4170 }
4171 else
4172 return split_substring(self,list,substring,maxcount);
4173}
4174
4175static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176PyObject *replace(PyUnicodeObject *self,
4177 PyUnicodeObject *str1,
4178 PyUnicodeObject *str2,
4179 int maxcount)
4180{
4181 PyUnicodeObject *u;
4182
4183 if (maxcount < 0)
4184 maxcount = INT_MAX;
4185
4186 if (str1->length == 1 && str2->length == 1) {
4187 int i;
4188
4189 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004190 if (!findchar(self->str, self->length, str1->str[0]) &&
4191 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 /* nothing to replace, return original string */
4193 Py_INCREF(self);
4194 u = self;
4195 } else {
4196 Py_UNICODE u1 = str1->str[0];
4197 Py_UNICODE u2 = str2->str[0];
4198
4199 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004200 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 self->length
4202 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004203 if (u != NULL) {
4204 Py_UNICODE_COPY(u->str, self->str,
4205 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 for (i = 0; i < u->length; i++)
4207 if (u->str[i] == u1) {
4208 if (--maxcount < 0)
4209 break;
4210 u->str[i] = u2;
4211 }
4212 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214
4215 } else {
4216 int n, i;
4217 Py_UNICODE *p;
4218
4219 /* replace strings */
4220 n = count(self, 0, self->length, str1);
4221 if (n > maxcount)
4222 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004223 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004225 if (PyUnicode_CheckExact(self)) {
4226 Py_INCREF(self);
4227 u = self;
4228 }
4229 else {
4230 u = (PyUnicodeObject *)
4231 PyUnicode_FromUnicode(self->str, self->length);
4232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 } else {
4234 u = _PyUnicode_New(
4235 self->length + n * (str2->length - str1->length));
4236 if (u) {
4237 i = 0;
4238 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004239 if (str1->length > 0) {
4240 while (i <= self->length - str1->length)
4241 if (Py_UNICODE_MATCH(self, i, str1)) {
4242 /* replace string segment */
4243 Py_UNICODE_COPY(p, str2->str, str2->length);
4244 p += str2->length;
4245 i += str1->length;
4246 if (--n <= 0) {
4247 /* copy remaining part */
4248 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4249 break;
4250 }
4251 } else
4252 *p++ = self->str[i++];
4253 } else {
4254 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 Py_UNICODE_COPY(p, str2->str, str2->length);
4256 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004257 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004260 }
4261 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 }
4264 }
4265 }
4266
4267 return (PyObject *) u;
4268}
4269
4270/* --- Unicode Object Methods --------------------------------------------- */
4271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004272PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273"S.title() -> unicode\n\
4274\n\
4275Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004276characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277
4278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004279unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 return fixup(self, fixtitle);
4282}
4283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004284PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285"S.capitalize() -> unicode\n\
4286\n\
4287Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004288have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289
4290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004291unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 return fixup(self, fixcapitalize);
4294}
4295
4296#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004297PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298"S.capwords() -> unicode\n\
4299\n\
4300Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004301normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302
4303static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004304unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305{
4306 PyObject *list;
4307 PyObject *item;
4308 int i;
4309
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 /* Split into words */
4311 list = split(self, NULL, -1);
4312 if (!list)
4313 return NULL;
4314
4315 /* Capitalize each word */
4316 for (i = 0; i < PyList_GET_SIZE(list); i++) {
4317 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
4318 fixcapitalize);
4319 if (item == NULL)
4320 goto onError;
4321 Py_DECREF(PyList_GET_ITEM(list, i));
4322 PyList_SET_ITEM(list, i, item);
4323 }
4324
4325 /* Join the words to form a new string */
4326 item = PyUnicode_Join(NULL, list);
4327
4328onError:
4329 Py_DECREF(list);
4330 return (PyObject *)item;
4331}
4332#endif
4333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004334PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335"S.center(width) -> unicode\n\
4336\n\
4337Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004338using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339
4340static PyObject *
4341unicode_center(PyUnicodeObject *self, PyObject *args)
4342{
4343 int marg, left;
4344 int width;
4345
4346 if (!PyArg_ParseTuple(args, "i:center", &width))
4347 return NULL;
4348
Tim Peters7a29bd52001-09-12 03:03:31 +00004349 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 Py_INCREF(self);
4351 return (PyObject*) self;
4352 }
4353
4354 marg = width - self->length;
4355 left = marg / 2 + (marg & width & 1);
4356
4357 return (PyObject*) pad(self, left, marg - left, ' ');
4358}
4359
Marc-André Lemburge5034372000-08-08 08:04:29 +00004360#if 0
4361
4362/* This code should go into some future Unicode collation support
4363 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004364 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004365
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004366/* speedy UTF-16 code point order comparison */
4367/* gleaned from: */
4368/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4369
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004370static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004371{
4372 0, 0, 0, 0, 0, 0, 0, 0,
4373 0, 0, 0, 0, 0, 0, 0, 0,
4374 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004375 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004376};
4377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378static int
4379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4380{
4381 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 Py_UNICODE *s1 = str1->str;
4384 Py_UNICODE *s2 = str2->str;
4385
4386 len1 = str1->length;
4387 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004388
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004390 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004391
4392 c1 = *s1++;
4393 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004394
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004395 if (c1 > (1<<11) * 26)
4396 c1 += utf16Fixup[c1>>11];
4397 if (c2 > (1<<11) * 26)
4398 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004399 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004400
4401 if (c1 != c2)
4402 return (c1 < c2) ? -1 : 1;
4403
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004404 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 }
4406
4407 return (len1 < len2) ? -1 : (len1 != len2);
4408}
4409
Marc-André Lemburge5034372000-08-08 08:04:29 +00004410#else
4411
4412static int
4413unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4414{
4415 register int len1, len2;
4416
4417 Py_UNICODE *s1 = str1->str;
4418 Py_UNICODE *s2 = str2->str;
4419
4420 len1 = str1->length;
4421 len2 = str2->length;
4422
4423 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004424 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004425
Fredrik Lundh45714e92001-06-26 16:39:36 +00004426 c1 = *s1++;
4427 c2 = *s2++;
4428
4429 if (c1 != c2)
4430 return (c1 < c2) ? -1 : 1;
4431
Marc-André Lemburge5034372000-08-08 08:04:29 +00004432 len1--; len2--;
4433 }
4434
4435 return (len1 < len2) ? -1 : (len1 != len2);
4436}
4437
4438#endif
4439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440int PyUnicode_Compare(PyObject *left,
4441 PyObject *right)
4442{
4443 PyUnicodeObject *u = NULL, *v = NULL;
4444 int result;
4445
4446 /* Coerce the two arguments */
4447 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4448 if (u == NULL)
4449 goto onError;
4450 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4451 if (v == NULL)
4452 goto onError;
4453
Thomas Wouters7e474022000-07-16 12:04:32 +00004454 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 if (v == u) {
4456 Py_DECREF(u);
4457 Py_DECREF(v);
4458 return 0;
4459 }
4460
4461 result = unicode_compare(u, v);
4462
4463 Py_DECREF(u);
4464 Py_DECREF(v);
4465 return result;
4466
4467onError:
4468 Py_XDECREF(u);
4469 Py_XDECREF(v);
4470 return -1;
4471}
4472
Guido van Rossum403d68b2000-03-13 15:55:09 +00004473int PyUnicode_Contains(PyObject *container,
4474 PyObject *element)
4475{
4476 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004477 int result, size;
4478 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004479
4480 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004481 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004482 if (v == NULL) {
4483 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004484 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004485 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004486 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004487 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4488 if (u == NULL) {
4489 Py_DECREF(v);
4490 goto onError;
4491 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004492
Barry Warsaw817918c2002-08-06 16:58:21 +00004493 size = PyUnicode_GET_SIZE(v);
4494 rhs = PyUnicode_AS_UNICODE(v);
4495 lhs = PyUnicode_AS_UNICODE(u);
4496
Guido van Rossum403d68b2000-03-13 15:55:09 +00004497 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004498 if (size == 1) {
4499 end = lhs + PyUnicode_GET_SIZE(u);
4500 while (lhs < end) {
4501 if (*lhs++ == *rhs) {
4502 result = 1;
4503 break;
4504 }
4505 }
4506 }
4507 else {
4508 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4509 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004510 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004511 result = 1;
4512 break;
4513 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004514 }
4515 }
4516
4517 Py_DECREF(u);
4518 Py_DECREF(v);
4519 return result;
4520
4521onError:
4522 Py_XDECREF(u);
4523 Py_XDECREF(v);
4524 return -1;
4525}
4526
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527/* Concat to string or Unicode object giving a new Unicode object. */
4528
4529PyObject *PyUnicode_Concat(PyObject *left,
4530 PyObject *right)
4531{
4532 PyUnicodeObject *u = NULL, *v = NULL, *w;
4533
4534 /* Coerce the two arguments */
4535 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4536 if (u == NULL)
4537 goto onError;
4538 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4539 if (v == NULL)
4540 goto onError;
4541
4542 /* Shortcuts */
4543 if (v == unicode_empty) {
4544 Py_DECREF(v);
4545 return (PyObject *)u;
4546 }
4547 if (u == unicode_empty) {
4548 Py_DECREF(u);
4549 return (PyObject *)v;
4550 }
4551
4552 /* Concat the two Unicode strings */
4553 w = _PyUnicode_New(u->length + v->length);
4554 if (w == NULL)
4555 goto onError;
4556 Py_UNICODE_COPY(w->str, u->str, u->length);
4557 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4558
4559 Py_DECREF(u);
4560 Py_DECREF(v);
4561 return (PyObject *)w;
4562
4563onError:
4564 Py_XDECREF(u);
4565 Py_XDECREF(v);
4566 return NULL;
4567}
4568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004569PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570"S.count(sub[, start[, end]]) -> int\n\
4571\n\
4572Return the number of occurrences of substring sub in Unicode string\n\
4573S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004574interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
4576static PyObject *
4577unicode_count(PyUnicodeObject *self, PyObject *args)
4578{
4579 PyUnicodeObject *substring;
4580 int start = 0;
4581 int end = INT_MAX;
4582 PyObject *result;
4583
Guido van Rossumb8872e62000-05-09 14:14:27 +00004584 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4585 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 return NULL;
4587
4588 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4589 (PyObject *)substring);
4590 if (substring == NULL)
4591 return NULL;
4592
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 if (start < 0)
4594 start += self->length;
4595 if (start < 0)
4596 start = 0;
4597 if (end > self->length)
4598 end = self->length;
4599 if (end < 0)
4600 end += self->length;
4601 if (end < 0)
4602 end = 0;
4603
4604 result = PyInt_FromLong((long) count(self, start, end, substring));
4605
4606 Py_DECREF(substring);
4607 return result;
4608}
4609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004610PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611"S.encode([encoding[,errors]]) -> string\n\
4612\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004613Return an encoded string version of S. Default encoding is the current\n\
4614default string encoding. errors may be given to set a different error\n\
4615handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4617'xmlcharrefreplace' as well as any other name registered with\n\
4618codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619
4620static PyObject *
4621unicode_encode(PyUnicodeObject *self, PyObject *args)
4622{
4623 char *encoding = NULL;
4624 char *errors = NULL;
4625 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4626 return NULL;
4627 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4628}
4629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004630PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631"S.expandtabs([tabsize]) -> unicode\n\
4632\n\
4633Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004634If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635
4636static PyObject*
4637unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4638{
4639 Py_UNICODE *e;
4640 Py_UNICODE *p;
4641 Py_UNICODE *q;
4642 int i, j;
4643 PyUnicodeObject *u;
4644 int tabsize = 8;
4645
4646 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4647 return NULL;
4648
Thomas Wouters7e474022000-07-16 12:04:32 +00004649 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 i = j = 0;
4651 e = self->str + self->length;
4652 for (p = self->str; p < e; p++)
4653 if (*p == '\t') {
4654 if (tabsize > 0)
4655 j += tabsize - (j % tabsize);
4656 }
4657 else {
4658 j++;
4659 if (*p == '\n' || *p == '\r') {
4660 i += j;
4661 j = 0;
4662 }
4663 }
4664
4665 /* Second pass: create output string and fill it */
4666 u = _PyUnicode_New(i + j);
4667 if (!u)
4668 return NULL;
4669
4670 j = 0;
4671 q = u->str;
4672
4673 for (p = self->str; p < e; p++)
4674 if (*p == '\t') {
4675 if (tabsize > 0) {
4676 i = tabsize - (j % tabsize);
4677 j += i;
4678 while (i--)
4679 *q++ = ' ';
4680 }
4681 }
4682 else {
4683 j++;
4684 *q++ = *p;
4685 if (*p == '\n' || *p == '\r')
4686 j = 0;
4687 }
4688
4689 return (PyObject*) u;
4690}
4691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004692PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693"S.find(sub [,start [,end]]) -> int\n\
4694\n\
4695Return the lowest index in S where substring sub is found,\n\
4696such that sub is contained within s[start,end]. Optional\n\
4697arguments start and end are interpreted as in slice notation.\n\
4698\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004699Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700
4701static PyObject *
4702unicode_find(PyUnicodeObject *self, PyObject *args)
4703{
4704 PyUnicodeObject *substring;
4705 int start = 0;
4706 int end = INT_MAX;
4707 PyObject *result;
4708
Guido van Rossumb8872e62000-05-09 14:14:27 +00004709 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4710 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 return NULL;
4712 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4713 (PyObject *)substring);
4714 if (substring == NULL)
4715 return NULL;
4716
4717 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4718
4719 Py_DECREF(substring);
4720 return result;
4721}
4722
4723static PyObject *
4724unicode_getitem(PyUnicodeObject *self, int index)
4725{
4726 if (index < 0 || index >= self->length) {
4727 PyErr_SetString(PyExc_IndexError, "string index out of range");
4728 return NULL;
4729 }
4730
4731 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4732}
4733
4734static long
4735unicode_hash(PyUnicodeObject *self)
4736{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004737 /* Since Unicode objects compare equal to their ASCII string
4738 counterparts, they should use the individual character values
4739 as basis for their hash value. This is needed to assure that
4740 strings and Unicode objects behave in the same way as
4741 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
Fredrik Lundhdde61642000-07-10 18:27:47 +00004743 register int len;
4744 register Py_UNICODE *p;
4745 register long x;
4746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 if (self->hash != -1)
4748 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004749 len = PyUnicode_GET_SIZE(self);
4750 p = PyUnicode_AS_UNICODE(self);
4751 x = *p << 7;
4752 while (--len >= 0)
4753 x = (1000003*x) ^ *p++;
4754 x ^= PyUnicode_GET_SIZE(self);
4755 if (x == -1)
4756 x = -2;
4757 self->hash = x;
4758 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759}
4760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004761PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762"S.index(sub [,start [,end]]) -> int\n\
4763\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004764Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765
4766static PyObject *
4767unicode_index(PyUnicodeObject *self, PyObject *args)
4768{
4769 int result;
4770 PyUnicodeObject *substring;
4771 int start = 0;
4772 int end = INT_MAX;
4773
Guido van Rossumb8872e62000-05-09 14:14:27 +00004774 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4775 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 return NULL;
4777
4778 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4779 (PyObject *)substring);
4780 if (substring == NULL)
4781 return NULL;
4782
4783 result = findstring(self, substring, start, end, 1);
4784
4785 Py_DECREF(substring);
4786 if (result < 0) {
4787 PyErr_SetString(PyExc_ValueError, "substring not found");
4788 return NULL;
4789 }
4790 return PyInt_FromLong(result);
4791}
4792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004793PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004794"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004796Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004797at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798
4799static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004800unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801{
4802 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4803 register const Py_UNICODE *e;
4804 int cased;
4805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 /* Shortcut for single character strings */
4807 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004808 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004810 /* Special case for empty strings */
4811 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004812 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004813
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 e = p + PyUnicode_GET_SIZE(self);
4815 cased = 0;
4816 for (; p < e; p++) {
4817 register const Py_UNICODE ch = *p;
4818
4819 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004820 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 else if (!cased && Py_UNICODE_ISLOWER(ch))
4822 cased = 1;
4823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004824 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825}
4826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004827PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004828"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004830Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004831at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832
4833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004834unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
4836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4837 register const Py_UNICODE *e;
4838 int cased;
4839
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 /* Shortcut for single character strings */
4841 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004842 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004844 /* Special case for empty strings */
4845 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004846 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004847
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 e = p + PyUnicode_GET_SIZE(self);
4849 cased = 0;
4850 for (; p < e; p++) {
4851 register const Py_UNICODE ch = *p;
4852
4853 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004854 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 else if (!cased && Py_UNICODE_ISUPPER(ch))
4856 cased = 1;
4857 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004858 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859}
4860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004861PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004862"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004864Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4865characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004866only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867
4868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004869unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
4871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4872 register const Py_UNICODE *e;
4873 int cased, previous_is_cased;
4874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 /* Shortcut for single character strings */
4876 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004877 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4878 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004880 /* Special case for empty strings */
4881 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004882 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 e = p + PyUnicode_GET_SIZE(self);
4885 cased = 0;
4886 previous_is_cased = 0;
4887 for (; p < e; p++) {
4888 register const Py_UNICODE ch = *p;
4889
4890 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4891 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004892 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 previous_is_cased = 1;
4894 cased = 1;
4895 }
4896 else if (Py_UNICODE_ISLOWER(ch)) {
4897 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004898 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 previous_is_cased = 1;
4900 cased = 1;
4901 }
4902 else
4903 previous_is_cased = 0;
4904 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004905 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906}
4907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004908PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004909"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004911Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004912False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
4914static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004915unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916{
4917 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4918 register const Py_UNICODE *e;
4919
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 /* Shortcut for single character strings */
4921 if (PyUnicode_GET_SIZE(self) == 1 &&
4922 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004923 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004925 /* Special case for empty strings */
4926 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004927 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004928
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 e = p + PyUnicode_GET_SIZE(self);
4930 for (; p < e; p++) {
4931 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004932 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004934 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935}
4936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004937PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004938"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004939\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004940Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004941and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004942
4943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004944unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004945{
4946 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4947 register const Py_UNICODE *e;
4948
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004949 /* Shortcut for single character strings */
4950 if (PyUnicode_GET_SIZE(self) == 1 &&
4951 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004952 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004953
4954 /* Special case for empty strings */
4955 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004956 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004957
4958 e = p + PyUnicode_GET_SIZE(self);
4959 for (; p < e; p++) {
4960 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004961 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004962 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004963 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004964}
4965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004966PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004967"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004968\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004969Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004970and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004971
4972static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004973unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004974{
4975 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4976 register const Py_UNICODE *e;
4977
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004978 /* Shortcut for single character strings */
4979 if (PyUnicode_GET_SIZE(self) == 1 &&
4980 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004981 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004982
4983 /* Special case for empty strings */
4984 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004985 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004986
4987 e = p + PyUnicode_GET_SIZE(self);
4988 for (; p < e; p++) {
4989 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004990 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004991 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004992 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004993}
4994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004995PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004996"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004998Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004999False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000
5001static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005002unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003{
5004 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5005 register const Py_UNICODE *e;
5006
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007 /* Shortcut for single character strings */
5008 if (PyUnicode_GET_SIZE(self) == 1 &&
5009 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005010 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005012 /* Special case for empty strings */
5013 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005014 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005015
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 e = p + PyUnicode_GET_SIZE(self);
5017 for (; p < e; p++) {
5018 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005019 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005021 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022}
5023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005024PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005025"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005027Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005028False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029
5030static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005031unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032{
5033 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5034 register const Py_UNICODE *e;
5035
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 /* Shortcut for single character strings */
5037 if (PyUnicode_GET_SIZE(self) == 1 &&
5038 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005039 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005041 /* Special case for empty strings */
5042 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005043 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005044
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 e = p + PyUnicode_GET_SIZE(self);
5046 for (; p < e; p++) {
5047 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005048 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005050 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051}
5052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005053PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005054"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005056Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005057False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058
5059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005060unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061{
5062 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5063 register const Py_UNICODE *e;
5064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 /* Shortcut for single character strings */
5066 if (PyUnicode_GET_SIZE(self) == 1 &&
5067 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005068 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005070 /* Special case for empty strings */
5071 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005072 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005073
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 e = p + PyUnicode_GET_SIZE(self);
5075 for (; p < e; p++) {
5076 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005077 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005079 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080}
5081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005082PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083"S.join(sequence) -> unicode\n\
5084\n\
5085Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005086sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087
5088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005089unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005091 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092}
5093
5094static int
5095unicode_length(PyUnicodeObject *self)
5096{
5097 return self->length;
5098}
5099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005100PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101"S.ljust(width) -> unicode\n\
5102\n\
5103Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005104done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105
5106static PyObject *
5107unicode_ljust(PyUnicodeObject *self, PyObject *args)
5108{
5109 int width;
5110 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5111 return NULL;
5112
Tim Peters7a29bd52001-09-12 03:03:31 +00005113 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 Py_INCREF(self);
5115 return (PyObject*) self;
5116 }
5117
5118 return (PyObject*) pad(self, 0, width - self->length, ' ');
5119}
5120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005121PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122"S.lower() -> unicode\n\
5123\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005124Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125
5126static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005127unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 return fixup(self, fixlower);
5130}
5131
/* Strip modes understood by the strip helpers */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5140
5141static const Py_UNICODE *
5142unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5143{
Tim Peters030a5ce2002-04-22 19:00:10 +00005144 size_t i;
5145 for (i = 0; i < n; ++i)
5146 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005147 return s+i;
5148 return NULL;
5149}
5150
5151/* externally visible for str.strip(unicode) */
5152PyObject *
5153_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5154{
5155 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5156 int len = PyUnicode_GET_SIZE(self);
5157 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5158 int seplen = PyUnicode_GET_SIZE(sepobj);
5159 int i, j;
5160
5161 i = 0;
5162 if (striptype != RIGHTSTRIP) {
5163 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5164 i++;
5165 }
5166 }
5167
5168 j = len;
5169 if (striptype != LEFTSTRIP) {
5170 do {
5171 j--;
5172 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5173 j++;
5174 }
5175
5176 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5177 Py_INCREF(self);
5178 return (PyObject*)self;
5179 }
5180 else
5181 return PyUnicode_FromUnicode(s+i, j-i);
5182}
5183
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
5185static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005186do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005188 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5189 int len = PyUnicode_GET_SIZE(self), i, j;
5190
5191 i = 0;
5192 if (striptype != RIGHTSTRIP) {
5193 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5194 i++;
5195 }
5196 }
5197
5198 j = len;
5199 if (striptype != LEFTSTRIP) {
5200 do {
5201 j--;
5202 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5203 j++;
5204 }
5205
5206 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5207 Py_INCREF(self);
5208 return (PyObject*)self;
5209 }
5210 else
5211 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212}
5213
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005214
5215static PyObject *
5216do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5217{
5218 PyObject *sep = NULL;
5219
5220 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5221 return NULL;
5222
5223 if (sep != NULL && sep != Py_None) {
5224 if (PyUnicode_Check(sep))
5225 return _PyUnicode_XStrip(self, striptype, sep);
5226 else if (PyString_Check(sep)) {
5227 PyObject *res;
5228 sep = PyUnicode_FromObject(sep);
5229 if (sep==NULL)
5230 return NULL;
5231 res = _PyUnicode_XStrip(self, striptype, sep);
5232 Py_DECREF(sep);
5233 return res;
5234 }
5235 else {
5236 PyErr_Format(PyExc_TypeError,
5237 "%s arg must be None, unicode or str",
5238 STRIPNAME(striptype));
5239 return NULL;
5240 }
5241 }
5242
5243 return do_strip(self, striptype);
5244}
5245
5246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005247PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005248"S.strip([sep]) -> unicode\n\
5249\n\
5250Return a copy of the string S with leading and trailing\n\
5251whitespace removed.\n\
5252If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005253If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005254
5255static PyObject *
5256unicode_strip(PyUnicodeObject *self, PyObject *args)
5257{
5258 if (PyTuple_GET_SIZE(args) == 0)
5259 return do_strip(self, BOTHSTRIP); /* Common case */
5260 else
5261 return do_argstrip(self, BOTHSTRIP, args);
5262}
5263
5264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005265PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005266"S.lstrip([sep]) -> unicode\n\
5267\n\
5268Return a copy of the string S with leading whitespace removed.\n\
5269If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005270If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005271
5272static PyObject *
5273unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5274{
5275 if (PyTuple_GET_SIZE(args) == 0)
5276 return do_strip(self, LEFTSTRIP); /* Common case */
5277 else
5278 return do_argstrip(self, LEFTSTRIP, args);
5279}
5280
5281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005282PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005283"S.rstrip([sep]) -> unicode\n\
5284\n\
5285Return a copy of the string S with trailing whitespace removed.\n\
5286If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005287If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005288
5289static PyObject *
5290unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5291{
5292 if (PyTuple_GET_SIZE(args) == 0)
5293 return do_strip(self, RIGHTSTRIP); /* Common case */
5294 else
5295 return do_argstrip(self, RIGHTSTRIP, args);
5296}
5297
5298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299static PyObject*
5300unicode_repeat(PyUnicodeObject *str, int len)
5301{
5302 PyUnicodeObject *u;
5303 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005304 int nchars;
5305 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306
5307 if (len < 0)
5308 len = 0;
5309
Tim Peters7a29bd52001-09-12 03:03:31 +00005310 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 /* no repeat, return original string */
5312 Py_INCREF(str);
5313 return (PyObject*) str;
5314 }
Tim Peters8f422462000-09-09 06:13:41 +00005315
5316 /* ensure # of chars needed doesn't overflow int and # of bytes
5317 * needed doesn't overflow size_t
5318 */
5319 nchars = len * str->length;
5320 if (len && nchars / len != str->length) {
5321 PyErr_SetString(PyExc_OverflowError,
5322 "repeated string is too long");
5323 return NULL;
5324 }
5325 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5326 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5327 PyErr_SetString(PyExc_OverflowError,
5328 "repeated string is too long");
5329 return NULL;
5330 }
5331 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 if (!u)
5333 return NULL;
5334
5335 p = u->str;
5336
5337 while (len-- > 0) {
5338 Py_UNICODE_COPY(p, str->str, str->length);
5339 p += str->length;
5340 }
5341
5342 return (PyObject*) u;
5343}
5344
5345PyObject *PyUnicode_Replace(PyObject *obj,
5346 PyObject *subobj,
5347 PyObject *replobj,
5348 int maxcount)
5349{
5350 PyObject *self;
5351 PyObject *str1;
5352 PyObject *str2;
5353 PyObject *result;
5354
5355 self = PyUnicode_FromObject(obj);
5356 if (self == NULL)
5357 return NULL;
5358 str1 = PyUnicode_FromObject(subobj);
5359 if (str1 == NULL) {
5360 Py_DECREF(self);
5361 return NULL;
5362 }
5363 str2 = PyUnicode_FromObject(replobj);
5364 if (str2 == NULL) {
5365 Py_DECREF(self);
5366 Py_DECREF(str1);
5367 return NULL;
5368 }
5369 result = replace((PyUnicodeObject *)self,
5370 (PyUnicodeObject *)str1,
5371 (PyUnicodeObject *)str2,
5372 maxcount);
5373 Py_DECREF(self);
5374 Py_DECREF(str1);
5375 Py_DECREF(str2);
5376 return result;
5377}
5378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005379PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380"S.replace (old, new[, maxsplit]) -> unicode\n\
5381\n\
5382Return a copy of S with all occurrences of substring\n\
5383old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005384given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
5386static PyObject*
5387unicode_replace(PyUnicodeObject *self, PyObject *args)
5388{
5389 PyUnicodeObject *str1;
5390 PyUnicodeObject *str2;
5391 int maxcount = -1;
5392 PyObject *result;
5393
5394 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5395 return NULL;
5396 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5397 if (str1 == NULL)
5398 return NULL;
5399 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5400 if (str2 == NULL)
5401 return NULL;
5402
5403 result = replace(self, str1, str2, maxcount);
5404
5405 Py_DECREF(str1);
5406 Py_DECREF(str2);
5407 return result;
5408}
5409
5410static
5411PyObject *unicode_repr(PyObject *unicode)
5412{
5413 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5414 PyUnicode_GET_SIZE(unicode),
5415 1);
5416}
5417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005418PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419"S.rfind(sub [,start [,end]]) -> int\n\
5420\n\
5421Return the highest index in S where substring sub is found,\n\
5422such that sub is contained within s[start,end]. Optional\n\
5423arguments start and end are interpreted as in slice notation.\n\
5424\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005425Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
5427static PyObject *
5428unicode_rfind(PyUnicodeObject *self, PyObject *args)
5429{
5430 PyUnicodeObject *substring;
5431 int start = 0;
5432 int end = INT_MAX;
5433 PyObject *result;
5434
Guido van Rossumb8872e62000-05-09 14:14:27 +00005435 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5436 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 return NULL;
5438 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5439 (PyObject *)substring);
5440 if (substring == NULL)
5441 return NULL;
5442
5443 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5444
5445 Py_DECREF(substring);
5446 return result;
5447}
5448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005449PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450"S.rindex(sub [,start [,end]]) -> int\n\
5451\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005452Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453
5454static PyObject *
5455unicode_rindex(PyUnicodeObject *self, PyObject *args)
5456{
5457 int result;
5458 PyUnicodeObject *substring;
5459 int start = 0;
5460 int end = INT_MAX;
5461
Guido van Rossumb8872e62000-05-09 14:14:27 +00005462 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5463 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 return NULL;
5465 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5466 (PyObject *)substring);
5467 if (substring == NULL)
5468 return NULL;
5469
5470 result = findstring(self, substring, start, end, -1);
5471
5472 Py_DECREF(substring);
5473 if (result < 0) {
5474 PyErr_SetString(PyExc_ValueError, "substring not found");
5475 return NULL;
5476 }
5477 return PyInt_FromLong(result);
5478}
5479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005480PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481"S.rjust(width) -> unicode\n\
5482\n\
5483Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005484done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
5486static PyObject *
5487unicode_rjust(PyUnicodeObject *self, PyObject *args)
5488{
5489 int width;
5490 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5491 return NULL;
5492
Tim Peters7a29bd52001-09-12 03:03:31 +00005493 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 Py_INCREF(self);
5495 return (PyObject*) self;
5496 }
5497
5498 return (PyObject*) pad(self, width - self->length, 0, ' ');
5499}
5500
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501static PyObject*
5502unicode_slice(PyUnicodeObject *self, int start, int end)
5503{
5504 /* standard clamping */
5505 if (start < 0)
5506 start = 0;
5507 if (end < 0)
5508 end = 0;
5509 if (end > self->length)
5510 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005511 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 /* full slice, return original string */
5513 Py_INCREF(self);
5514 return (PyObject*) self;
5515 }
5516 if (start > end)
5517 start = end;
5518 /* copy slice */
5519 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5520 end - start);
5521}
5522
5523PyObject *PyUnicode_Split(PyObject *s,
5524 PyObject *sep,
5525 int maxsplit)
5526{
5527 PyObject *result;
5528
5529 s = PyUnicode_FromObject(s);
5530 if (s == NULL)
5531 return NULL;
5532 if (sep != NULL) {
5533 sep = PyUnicode_FromObject(sep);
5534 if (sep == NULL) {
5535 Py_DECREF(s);
5536 return NULL;
5537 }
5538 }
5539
5540 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5541
5542 Py_DECREF(s);
5543 Py_XDECREF(sep);
5544 return result;
5545}
5546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005547PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548"S.split([sep [,maxsplit]]) -> list of strings\n\
5549\n\
5550Return a list of the words in S, using sep as the\n\
5551delimiter string. If maxsplit is given, at most maxsplit\n\
5552splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005553is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554
5555static PyObject*
5556unicode_split(PyUnicodeObject *self, PyObject *args)
5557{
5558 PyObject *substring = Py_None;
5559 int maxcount = -1;
5560
5561 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5562 return NULL;
5563
5564 if (substring == Py_None)
5565 return split(self, NULL, maxcount);
5566 else if (PyUnicode_Check(substring))
5567 return split(self, (PyUnicodeObject *)substring, maxcount);
5568 else
5569 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5570}
5571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005572PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005573"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574\n\
5575Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005576Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005577is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
5579static PyObject*
5580unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5581{
Guido van Rossum86662912000-04-11 15:38:46 +00005582 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
Guido van Rossum86662912000-04-11 15:38:46 +00005584 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 return NULL;
5586
Guido van Rossum86662912000-04-11 15:38:46 +00005587 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588}
5589
5590static
5591PyObject *unicode_str(PyUnicodeObject *self)
5592{
Fred Drakee4315f52000-05-09 19:53:39 +00005593 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594}
5595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005596PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597"S.swapcase() -> unicode\n\
5598\n\
5599Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005600and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
5602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005603unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return fixup(self, fixswapcase);
5606}
5607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005608PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609"S.translate(table) -> unicode\n\
5610\n\
5611Return a copy of the string S, where all characters have been mapped\n\
5612through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005613Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5614Unmapped characters are left untouched. Characters mapped to None\n\
5615are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
5617static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005618unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 return PyUnicode_TranslateCharmap(self->str,
5621 self->length,
5622 table,
5623 "ignore");
5624}
5625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005626PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627"S.upper() -> unicode\n\
5628\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005629Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630
5631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005632unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 return fixup(self, fixupper);
5635}
5636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005637PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638"S.zfill(width) -> unicode\n\
5639\n\
5640Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005641of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
5643static PyObject *
5644unicode_zfill(PyUnicodeObject *self, PyObject *args)
5645{
5646 int fill;
5647 PyUnicodeObject *u;
5648
5649 int width;
5650 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5651 return NULL;
5652
5653 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005654 if (PyUnicode_CheckExact(self)) {
5655 Py_INCREF(self);
5656 return (PyObject*) self;
5657 }
5658 else
5659 return PyUnicode_FromUnicode(
5660 PyUnicode_AS_UNICODE(self),
5661 PyUnicode_GET_SIZE(self)
5662 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 }
5664
5665 fill = width - self->length;
5666
5667 u = pad(self, fill, 0, '0');
5668
Walter Dörwald068325e2002-04-15 13:36:47 +00005669 if (u == NULL)
5670 return NULL;
5671
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 if (u->str[fill] == '+' || u->str[fill] == '-') {
5673 /* move sign to beginning of string */
5674 u->str[0] = u->str[fill];
5675 u->str[fill] = '0';
5676 }
5677
5678 return (PyObject*) u;
5679}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005689PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005690"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005692Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695
5696static PyObject *
5697unicode_startswith(PyUnicodeObject *self,
5698 PyObject *args)
5699{
5700 PyUnicodeObject *substring;
5701 int start = 0;
5702 int end = INT_MAX;
5703 PyObject *result;
5704
Guido van Rossumb8872e62000-05-09 14:14:27 +00005705 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5706 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 return NULL;
5708 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5709 (PyObject *)substring);
5710 if (substring == NULL)
5711 return NULL;
5712
Guido van Rossum77f6a652002-04-03 22:41:51 +00005713 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
5715 Py_DECREF(substring);
5716 return result;
5717}
5718
5719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005720PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005721"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005723Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005725comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
5727static PyObject *
5728unicode_endswith(PyUnicodeObject *self,
5729 PyObject *args)
5730{
5731 PyUnicodeObject *substring;
5732 int start = 0;
5733 int end = INT_MAX;
5734 PyObject *result;
5735
Guido van Rossumb8872e62000-05-09 14:14:27 +00005736 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5737 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 return NULL;
5739 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5740 (PyObject *)substring);
5741 if (substring == NULL)
5742 return NULL;
5743
Guido van Rossum77f6a652002-04-03 22:41:51 +00005744 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
5746 Py_DECREF(substring);
5747 return result;
5748}
5749
5750
5751static PyMethodDef unicode_methods[] = {
5752
5753 /* Order is according to common usage: often used methods should
5754 appear first, since lookup is done sequentially. */
5755
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005756 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5757 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5758 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5759 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5760 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5761 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5762 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5763 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5764 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5765 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5766 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5767 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5768 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005769 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005770/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5771 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5772 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5773 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005774 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005775 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005776 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005777 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5778 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5779 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5780 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5781 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5782 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5783 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5784 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5785 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5786 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5787 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5788 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5789 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5790 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005791 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005792#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005793 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794#endif
5795
5796#if 0
5797 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005798 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799#endif
5800
5801 {NULL, NULL}
5802};
5803
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804static PySequenceMethods unicode_as_sequence = {
5805 (inquiry) unicode_length, /* sq_length */
5806 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5807 (intargfunc) unicode_repeat, /* sq_repeat */
5808 (intargfunc) unicode_getitem, /* sq_item */
5809 (intintargfunc) unicode_slice, /* sq_slice */
5810 0, /* sq_ass_item */
5811 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005812 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813};
5814
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005815static PyObject*
5816unicode_subscript(PyUnicodeObject* self, PyObject* item)
5817{
5818 if (PyInt_Check(item)) {
5819 long i = PyInt_AS_LONG(item);
5820 if (i < 0)
5821 i += PyString_GET_SIZE(self);
5822 return unicode_getitem(self, i);
5823 } else if (PyLong_Check(item)) {
5824 long i = PyLong_AsLong(item);
5825 if (i == -1 && PyErr_Occurred())
5826 return NULL;
5827 if (i < 0)
5828 i += PyString_GET_SIZE(self);
5829 return unicode_getitem(self, i);
5830 } else if (PySlice_Check(item)) {
5831 int start, stop, step, slicelength, cur, i;
5832 Py_UNICODE* source_buf;
5833 Py_UNICODE* result_buf;
5834 PyObject* result;
5835
5836 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5837 &start, &stop, &step, &slicelength) < 0) {
5838 return NULL;
5839 }
5840
5841 if (slicelength <= 0) {
5842 return PyUnicode_FromUnicode(NULL, 0);
5843 } else {
5844 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5845 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5846
5847 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5848 result_buf[i] = source_buf[cur];
5849 }
5850
5851 result = PyUnicode_FromUnicode(result_buf, slicelength);
5852 PyMem_FREE(result_buf);
5853 return result;
5854 }
5855 } else {
5856 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5857 return NULL;
5858 }
5859}
5860
5861static PyMappingMethods unicode_as_mapping = {
5862 (inquiry)unicode_length, /* mp_length */
5863 (binaryfunc)unicode_subscript, /* mp_subscript */
5864 (objobjargproc)0, /* mp_ass_subscript */
5865};
5866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867static int
5868unicode_buffer_getreadbuf(PyUnicodeObject *self,
5869 int index,
5870 const void **ptr)
5871{
5872 if (index != 0) {
5873 PyErr_SetString(PyExc_SystemError,
5874 "accessing non-existent unicode segment");
5875 return -1;
5876 }
5877 *ptr = (void *) self->str;
5878 return PyUnicode_GET_DATA_SIZE(self);
5879}
5880
5881static int
5882unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5883 const void **ptr)
5884{
5885 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005886 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 return -1;
5888}
5889
5890static int
5891unicode_buffer_getsegcount(PyUnicodeObject *self,
5892 int *lenp)
5893{
5894 if (lenp)
5895 *lenp = PyUnicode_GET_DATA_SIZE(self);
5896 return 1;
5897}
5898
5899static int
5900unicode_buffer_getcharbuf(PyUnicodeObject *self,
5901 int index,
5902 const void **ptr)
5903{
5904 PyObject *str;
5905
5906 if (index != 0) {
5907 PyErr_SetString(PyExc_SystemError,
5908 "accessing non-existent unicode segment");
5909 return -1;
5910 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005911 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 if (str == NULL)
5913 return -1;
5914 *ptr = (void *) PyString_AS_STRING(str);
5915 return PyString_GET_SIZE(str);
5916}
5917
5918/* Helpers for PyUnicode_Format() */
5919
5920static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005921getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
5923 int argidx = *p_argidx;
5924 if (argidx < arglen) {
5925 (*p_argidx)++;
5926 if (arglen < 0)
5927 return args;
5928 else
5929 return PyTuple_GetItem(args, argidx);
5930 }
5931 PyErr_SetString(PyExc_TypeError,
5932 "not enough arguments for format string");
5933 return NULL;
5934}
5935
/* Bit flags passed to the format helpers in their `flags` argument;
   each one occupies its own bit so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5941
5942static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944{
5945 register int i;
5946 int len;
5947 va_list va;
5948 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
5951 /* First, format the string as char array, then expand to Py_UNICODE
5952 array. */
5953 charbuffer = (char *)buffer;
5954 len = vsprintf(charbuffer, format, va);
5955 for (i = len - 1; i >= 0; i--)
5956 buffer[i] = (Py_UNICODE) charbuffer[i];
5957
5958 va_end(va);
5959 return len;
5960}
5961
Guido van Rossum078151d2002-08-11 04:24:12 +00005962/* XXX To save some code duplication, formatfloat/long/int could have been
5963 shared with stringobject.c, converting from 8-bit to Unicode after the
5964 formatting is done. */
5965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966static int
5967formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005968 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 int flags,
5970 int prec,
5971 int type,
5972 PyObject *v)
5973{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005974 /* fmt = '%#.' + `prec` + `type`
5975 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 char fmt[20];
5977 double x;
5978
5979 x = PyFloat_AsDouble(v);
5980 if (x == -1.0 && PyErr_Occurred())
5981 return -1;
5982 if (prec < 0)
5983 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5985 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005986 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5987 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005988 /* worst case length calc to ensure no buffer overrun:
5989 fmt = %#.<prec>g
5990 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5991 for any double rep.)
5992 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5993 If prec=0 the effective precision is 1 (the leading digit is
5994 always given), therefore increase by one to 10+prec. */
5995 if (buflen <= (size_t)10 + (size_t)prec) {
5996 PyErr_SetString(PyExc_OverflowError,
5997 "formatted float is too long (precision too long?)");
5998 return -1;
5999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 return usprintf(buf, fmt, x);
6001}
6002
Tim Peters38fd5b62000-09-21 05:43:11 +00006003static PyObject*
6004formatlong(PyObject *val, int flags, int prec, int type)
6005{
6006 char *buf;
6007 int i, len;
6008 PyObject *str; /* temporary string object. */
6009 PyUnicodeObject *result;
6010
6011 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6012 if (!str)
6013 return NULL;
6014 result = _PyUnicode_New(len);
6015 for (i = 0; i < len; i++)
6016 result->str[i] = buf[i];
6017 result->str[len] = 0;
6018 Py_DECREF(str);
6019 return (PyObject*)result;
6020}
6021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022static int
6023formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006024 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 int flags,
6026 int prec,
6027 int type,
6028 PyObject *v)
6029{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006030 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006031 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6032 * + 1 + 1
6033 * = 24
6034 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006035 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 long x;
6037
6038 x = PyInt_AsLong(v);
6039 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006040 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006041 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006042 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006043 "%u/%o/%x/%X of negative int will return "
6044 "a signed string in Python 2.4 and up") < 0)
6045 return -1;
6046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006048 prec = 1;
6049
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006050 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006051 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6052 */
6053 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006054 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006055 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006056 return -1;
6057 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006058
6059 if ((flags & F_ALT) &&
6060 (type == 'x' || type == 'X')) {
6061 /* When converting under %#x or %#X, there are a number
6062 * of issues that cause pain:
6063 * - when 0 is being converted, the C standard leaves off
6064 * the '0x' or '0X', which is inconsistent with other
6065 * %#x/%#X conversions and inconsistent with Python's
6066 * hex() function
6067 * - there are platforms that violate the standard and
6068 * convert 0 with the '0x' or '0X'
6069 * (Metrowerks, Compaq Tru64)
6070 * - there are platforms that give '0x' when converting
6071 * under %#X, but convert 0 in accordance with the
6072 * standard (OS/2 EMX)
6073 *
6074 * We can achieve the desired consistency by inserting our
6075 * own '0x' or '0X' prefix, and substituting %x/%X in place
6076 * of %#x/%#X.
6077 *
6078 * Note that this is the same approach as used in
6079 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006080 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006081 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6082 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006083 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006084 else {
6085 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6086 (flags&F_ALT) ? "#" : "",
6087 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 return usprintf(buf, fmt, x);
6090}
6091
6092static int
6093formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006094 size_t buflen,
6095 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006097 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006098 if (PyUnicode_Check(v)) {
6099 if (PyUnicode_GET_SIZE(v) != 1)
6100 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006104 else if (PyString_Check(v)) {
6105 if (PyString_GET_SIZE(v) != 1)
6106 goto onError;
6107 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
6110 else {
6111 /* Integer input truncated to a character */
6112 long x;
6113 x = PyInt_AsLong(v);
6114 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006115 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006116#ifdef Py_UNICODE_WIDE
6117 if (x < 0 || x > 0x10ffff) {
6118 PyErr_SetString(PyExc_ValueError,
6119 "%c arg not in range(0x110000) "
6120 "(wide Python build)");
6121 return -1;
6122 }
6123#else
6124 if (x < 0 || x > 0xffff) {
6125 PyErr_SetString(PyExc_ValueError,
6126 "%c arg not in range(0x10000) "
6127 "(narrow Python build)");
6128 return -1;
6129 }
6130#endif
6131 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
6133 buf[1] = '\0';
6134 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006135
6136 onError:
6137 PyErr_SetString(PyExc_TypeError,
6138 "%c requires int or char");
6139 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140}
6141
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
6151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152PyObject *PyUnicode_Format(PyObject *format,
6153 PyObject *args)
6154{
6155 Py_UNICODE *fmt, *res;
6156 int fmtcnt, rescnt, reslen, arglen, argidx;
6157 int args_owned = 0;
6158 PyUnicodeObject *result = NULL;
6159 PyObject *dict = NULL;
6160 PyObject *uformat;
6161
6162 if (format == NULL || args == NULL) {
6163 PyErr_BadInternalCall();
6164 return NULL;
6165 }
6166 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006167 if (uformat == NULL)
6168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 fmt = PyUnicode_AS_UNICODE(uformat);
6170 fmtcnt = PyUnicode_GET_SIZE(uformat);
6171
6172 reslen = rescnt = fmtcnt + 100;
6173 result = _PyUnicode_New(reslen);
6174 if (result == NULL)
6175 goto onError;
6176 res = PyUnicode_AS_UNICODE(result);
6177
6178 if (PyTuple_Check(args)) {
6179 arglen = PyTuple_Size(args);
6180 argidx = 0;
6181 }
6182 else {
6183 arglen = -1;
6184 argidx = -2;
6185 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006186 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 dict = args;
6188
6189 while (--fmtcnt >= 0) {
6190 if (*fmt != '%') {
6191 if (--rescnt < 0) {
6192 rescnt = fmtcnt + 100;
6193 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006194 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 return NULL;
6196 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6197 --rescnt;
6198 }
6199 *res++ = *fmt++;
6200 }
6201 else {
6202 /* Got a format specifier */
6203 int flags = 0;
6204 int width = -1;
6205 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 Py_UNICODE c = '\0';
6207 Py_UNICODE fill;
6208 PyObject *v = NULL;
6209 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006210 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 Py_UNICODE sign;
6212 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006213 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
6215 fmt++;
6216 if (*fmt == '(') {
6217 Py_UNICODE *keystart;
6218 int keylen;
6219 PyObject *key;
6220 int pcount = 1;
6221
6222 if (dict == NULL) {
6223 PyErr_SetString(PyExc_TypeError,
6224 "format requires a mapping");
6225 goto onError;
6226 }
6227 ++fmt;
6228 --fmtcnt;
6229 keystart = fmt;
6230 /* Skip over balanced parentheses */
6231 while (pcount > 0 && --fmtcnt >= 0) {
6232 if (*fmt == ')')
6233 --pcount;
6234 else if (*fmt == '(')
6235 ++pcount;
6236 fmt++;
6237 }
6238 keylen = fmt - keystart - 1;
6239 if (fmtcnt < 0 || pcount > 0) {
6240 PyErr_SetString(PyExc_ValueError,
6241 "incomplete format key");
6242 goto onError;
6243 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006244#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006245 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 then looked up since Python uses strings to hold
6247 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006248 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 key = PyUnicode_EncodeUTF8(keystart,
6250 keylen,
6251 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006252#else
6253 key = PyUnicode_FromUnicode(keystart, keylen);
6254#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 if (key == NULL)
6256 goto onError;
6257 if (args_owned) {
6258 Py_DECREF(args);
6259 args_owned = 0;
6260 }
6261 args = PyObject_GetItem(dict, key);
6262 Py_DECREF(key);
6263 if (args == NULL) {
6264 goto onError;
6265 }
6266 args_owned = 1;
6267 arglen = -1;
6268 argidx = -2;
6269 }
6270 while (--fmtcnt >= 0) {
6271 switch (c = *fmt++) {
6272 case '-': flags |= F_LJUST; continue;
6273 case '+': flags |= F_SIGN; continue;
6274 case ' ': flags |= F_BLANK; continue;
6275 case '#': flags |= F_ALT; continue;
6276 case '0': flags |= F_ZERO; continue;
6277 }
6278 break;
6279 }
6280 if (c == '*') {
6281 v = getnextarg(args, arglen, &argidx);
6282 if (v == NULL)
6283 goto onError;
6284 if (!PyInt_Check(v)) {
6285 PyErr_SetString(PyExc_TypeError,
6286 "* wants int");
6287 goto onError;
6288 }
6289 width = PyInt_AsLong(v);
6290 if (width < 0) {
6291 flags |= F_LJUST;
6292 width = -width;
6293 }
6294 if (--fmtcnt >= 0)
6295 c = *fmt++;
6296 }
6297 else if (c >= '0' && c <= '9') {
6298 width = c - '0';
6299 while (--fmtcnt >= 0) {
6300 c = *fmt++;
6301 if (c < '0' || c > '9')
6302 break;
6303 if ((width*10) / 10 != width) {
6304 PyErr_SetString(PyExc_ValueError,
6305 "width too big");
6306 goto onError;
6307 }
6308 width = width*10 + (c - '0');
6309 }
6310 }
6311 if (c == '.') {
6312 prec = 0;
6313 if (--fmtcnt >= 0)
6314 c = *fmt++;
6315 if (c == '*') {
6316 v = getnextarg(args, arglen, &argidx);
6317 if (v == NULL)
6318 goto onError;
6319 if (!PyInt_Check(v)) {
6320 PyErr_SetString(PyExc_TypeError,
6321 "* wants int");
6322 goto onError;
6323 }
6324 prec = PyInt_AsLong(v);
6325 if (prec < 0)
6326 prec = 0;
6327 if (--fmtcnt >= 0)
6328 c = *fmt++;
6329 }
6330 else if (c >= '0' && c <= '9') {
6331 prec = c - '0';
6332 while (--fmtcnt >= 0) {
6333 c = Py_CHARMASK(*fmt++);
6334 if (c < '0' || c > '9')
6335 break;
6336 if ((prec*10) / 10 != prec) {
6337 PyErr_SetString(PyExc_ValueError,
6338 "prec too big");
6339 goto onError;
6340 }
6341 prec = prec*10 + (c - '0');
6342 }
6343 }
6344 } /* prec */
6345 if (fmtcnt >= 0) {
6346 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 if (--fmtcnt >= 0)
6348 c = *fmt++;
6349 }
6350 }
6351 if (fmtcnt < 0) {
6352 PyErr_SetString(PyExc_ValueError,
6353 "incomplete format");
6354 goto onError;
6355 }
6356 if (c != '%') {
6357 v = getnextarg(args, arglen, &argidx);
6358 if (v == NULL)
6359 goto onError;
6360 }
6361 sign = 0;
6362 fill = ' ';
6363 switch (c) {
6364
6365 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006366 pbuf = formatbuf;
6367 /* presume that buffer length is at least 1 */
6368 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 len = 1;
6370 break;
6371
6372 case 's':
6373 case 'r':
6374 if (PyUnicode_Check(v) && c == 's') {
6375 temp = v;
6376 Py_INCREF(temp);
6377 }
6378 else {
6379 PyObject *unicode;
6380 if (c == 's')
6381 temp = PyObject_Str(v);
6382 else
6383 temp = PyObject_Repr(v);
6384 if (temp == NULL)
6385 goto onError;
6386 if (!PyString_Check(temp)) {
6387 /* XXX Note: this should never happen, since
6388 PyObject_Repr() and PyObject_Str() assure
6389 this */
6390 Py_DECREF(temp);
6391 PyErr_SetString(PyExc_TypeError,
6392 "%s argument has non-string str()");
6393 goto onError;
6394 }
Fred Drakee4315f52000-05-09 19:53:39 +00006395 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006397 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 "strict");
6399 Py_DECREF(temp);
6400 temp = unicode;
6401 if (temp == NULL)
6402 goto onError;
6403 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006404 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 len = PyUnicode_GET_SIZE(temp);
6406 if (prec >= 0 && len > prec)
6407 len = prec;
6408 break;
6409
6410 case 'i':
6411 case 'd':
6412 case 'u':
6413 case 'o':
6414 case 'x':
6415 case 'X':
6416 if (c == 'i')
6417 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006418 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006419 temp = formatlong(v, flags, prec, c);
6420 if (!temp)
6421 goto onError;
6422 pbuf = PyUnicode_AS_UNICODE(temp);
6423 len = PyUnicode_GET_SIZE(temp);
6424 /* unbounded ints can always produce
6425 a sign character! */
6426 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006428 else {
6429 pbuf = formatbuf;
6430 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6431 flags, prec, c, v);
6432 if (len < 0)
6433 goto onError;
6434 /* only d conversion is signed */
6435 sign = c == 'd';
6436 }
6437 if (flags & F_ZERO)
6438 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 break;
6440
6441 case 'e':
6442 case 'E':
6443 case 'f':
6444 case 'g':
6445 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006446 pbuf = formatbuf;
6447 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6448 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 if (len < 0)
6450 goto onError;
6451 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006452 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 fill = '0';
6454 break;
6455
6456 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006457 pbuf = formatbuf;
6458 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 if (len < 0)
6460 goto onError;
6461 break;
6462
6463 default:
6464 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006465 "unsupported format character '%c' (0x%x) "
6466 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00006467 (31<=c && c<=126) ? c : '?',
Guido van Rossumefc11882002-09-12 14:43:41 +00006468 c,
6469 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 goto onError;
6471 }
6472 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006473 if (*pbuf == '-' || *pbuf == '+') {
6474 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 len--;
6476 }
6477 else if (flags & F_SIGN)
6478 sign = '+';
6479 else if (flags & F_BLANK)
6480 sign = ' ';
6481 else
6482 sign = 0;
6483 }
6484 if (width < len)
6485 width = len;
6486 if (rescnt < width + (sign != 0)) {
6487 reslen -= rescnt;
6488 rescnt = width + fmtcnt + 100;
6489 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006490 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 return NULL;
6492 res = PyUnicode_AS_UNICODE(result)
6493 + reslen - rescnt;
6494 }
6495 if (sign) {
6496 if (fill != ' ')
6497 *res++ = sign;
6498 rescnt--;
6499 if (width > len)
6500 width--;
6501 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006502 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6503 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006504 assert(pbuf[1] == c);
6505 if (fill != ' ') {
6506 *res++ = *pbuf++;
6507 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006508 }
Tim Petersfff53252001-04-12 18:38:48 +00006509 rescnt -= 2;
6510 width -= 2;
6511 if (width < 0)
6512 width = 0;
6513 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 if (width > len && !(flags & F_LJUST)) {
6516 do {
6517 --rescnt;
6518 *res++ = fill;
6519 } while (--width > len);
6520 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006521 if (fill == ' ') {
6522 if (sign)
6523 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006524 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006525 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006526 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006527 *res++ = *pbuf++;
6528 *res++ = *pbuf++;
6529 }
6530 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006531 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 res += len;
6533 rescnt -= len;
6534 while (--width >= len) {
6535 --rescnt;
6536 *res++ = ' ';
6537 }
6538 if (dict && (argidx < arglen) && c != '%') {
6539 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006540 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 goto onError;
6542 }
6543 Py_XDECREF(temp);
6544 } /* '%' */
6545 } /* until end */
6546 if (argidx < arglen && !dict) {
6547 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006548 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 goto onError;
6550 }
6551
6552 if (args_owned) {
6553 Py_DECREF(args);
6554 }
6555 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006556 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006557 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 return (PyObject *)result;
6559
6560 onError:
6561 Py_XDECREF(result);
6562 Py_DECREF(uformat);
6563 if (args_owned) {
6564 Py_DECREF(args);
6565 }
6566 return NULL;
6567}
6568
6569static PyBufferProcs unicode_as_buffer = {
6570 (getreadbufferproc) unicode_buffer_getreadbuf,
6571 (getwritebufferproc) unicode_buffer_getwritebuf,
6572 (getsegcountproc) unicode_buffer_getsegcount,
6573 (getcharbufferproc) unicode_buffer_getcharbuf,
6574};
6575
Jeremy Hylton938ace62002-07-17 16:30:39 +00006576static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006577unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6578
Tim Peters6d6c1a32001-08-02 04:15:00 +00006579static PyObject *
6580unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6581{
6582 PyObject *x = NULL;
6583 static char *kwlist[] = {"string", "encoding", "errors", 0};
6584 char *encoding = NULL;
6585 char *errors = NULL;
6586
Guido van Rossume023fe02001-08-30 03:12:59 +00006587 if (type != &PyUnicode_Type)
6588 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006589 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6590 kwlist, &x, &encoding, &errors))
6591 return NULL;
6592 if (x == NULL)
6593 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006594 if (encoding == NULL && errors == NULL)
6595 return PyObject_Unicode(x);
6596 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006597 return PyUnicode_FromEncodedObject(x, encoding, errors);
6598}
6599
Guido van Rossume023fe02001-08-30 03:12:59 +00006600static PyObject *
6601unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6602{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006603 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006604 int n;
6605
6606 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6607 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6608 if (tmp == NULL)
6609 return NULL;
6610 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006611 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6612 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00006613 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00006614 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6615 if (pnew->str == NULL) {
6616 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006617 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00006618 return NULL;
6619 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006620 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6621 pnew->length = n;
6622 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006623 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006624 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006625}
6626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006627PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006628"unicode(string [, encoding[, errors]]) -> object\n\
6629\n\
6630Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006631encoding defaults to the current default string encoding.\n\
6632errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006633
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634PyTypeObject PyUnicode_Type = {
6635 PyObject_HEAD_INIT(&PyType_Type)
6636 0, /* ob_size */
6637 "unicode", /* tp_name */
6638 sizeof(PyUnicodeObject), /* tp_size */
6639 0, /* tp_itemsize */
6640 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006641 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006643 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 0, /* tp_setattr */
6645 (cmpfunc) unicode_compare, /* tp_compare */
6646 (reprfunc) unicode_repr, /* tp_repr */
6647 0, /* tp_as_number */
6648 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006649 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 (hashfunc) unicode_hash, /* tp_hash*/
6651 0, /* tp_call*/
6652 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006653 PyObject_GenericGetAttr, /* tp_getattro */
6654 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00006656 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006657 unicode_doc, /* tp_doc */
6658 0, /* tp_traverse */
6659 0, /* tp_clear */
6660 0, /* tp_richcompare */
6661 0, /* tp_weaklistoffset */
6662 0, /* tp_iter */
6663 0, /* tp_iternext */
6664 unicode_methods, /* tp_methods */
6665 0, /* tp_members */
6666 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006667 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006668 0, /* tp_dict */
6669 0, /* tp_descr_get */
6670 0, /* tp_descr_set */
6671 0, /* tp_dictoffset */
6672 0, /* tp_init */
6673 0, /* tp_alloc */
6674 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006675 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676};
6677
6678/* Initialize the Unicode implementation */
6679
Thomas Wouters78890102000-07-22 19:25:51 +00006680void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006682 int i;
6683
Fred Drakee4315f52000-05-09 19:53:39 +00006684 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006685 unicode_freelist = NULL;
6686 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006688 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006689 for (i = 0; i < 256; i++)
6690 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006691 if (PyType_Ready(&PyUnicode_Type) < 0)
6692 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693}
6694
6695/* Finalize the Unicode implementation */
6696
6697void
Thomas Wouters78890102000-07-22 19:25:51 +00006698_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006700 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006701 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006703 Py_XDECREF(unicode_empty);
6704 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006705
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006706 for (i = 0; i < 256; i++) {
6707 if (unicode_latin1[i]) {
6708 Py_DECREF(unicode_latin1[i]);
6709 unicode_latin1[i] = NULL;
6710 }
6711 }
6712
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006713 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 PyUnicodeObject *v = u;
6715 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006716 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006717 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006718 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006719 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006721 unicode_freelist = NULL;
6722 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723}