blob: cf730e5626d83cc5fae6b96d83b521f15ac99e55 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Nicholas Bastin1ce9e4c2004-06-17 18:27:18 +000044#ifdef __SUNPRO_C
45#pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED)
46#endif
47
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list whose length is below this limit keep
   their character buffer allocated, which reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Guido van Rossumd57fd912000-03-10 22:53:23 +000090/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000091static PyUnicodeObject *unicode_freelist;
92static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000093
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000094/* The empty Unicode object is shared to improve performance. */
95static PyUnicodeObject *unicode_empty;
96
97/* Single character Unicode strings in the Latin-1 range are being
98 shared as well. */
99static PyUnicodeObject *unicode_latin1[256];
100
Fred Drakee4315f52000-05-09 19:53:39 +0000101/* Default encoding to use and assume when NULL is passed as encoding
102 parameter; it is initialized by _PyUnicode_Init().
103
104 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000105 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000106
107*/
Fred Drakee4315f52000-05-09 19:53:39 +0000108static char unicode_default_encoding[100];
109
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000111PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000112{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000113#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000114 return 0x10FFFF;
115#else
116 /* This is actually an illegal character, so it should
117 not be passed to unichr. */
118 return 0xFFFF;
119#endif
120}
121
/* --- Unicode Object ----------------------------------------------------- */
123
124static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000125int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000126 int length)
127{
128 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000129
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000132 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000133
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000134 /* Resizing shared object (unicode_empty or single character
135 objects) in-place is not allowed. Use PyUnicode_Resize()
136 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000137 if (unicode == unicode_empty ||
138 (unicode->length == 1 &&
139 /* MvL said unicode->str[] may be signed. Python generally assumes
140 * an int contains at least 32 bits, and we don't use more than
141 * 32 bits even in a UCS4 build, so casting to unsigned int should
142 * be correct.
143 */
144 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000145 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000147 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000148 return -1;
149 }
150
151 /* We allocate one more byte to make sure the string is
152 Ux0000 terminated -- XXX is this needed ? */
153 oldstr = unicode->str;
154 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
155 if (!unicode->str) {
156 unicode->str = oldstr;
157 PyErr_NoMemory();
158 return -1;
159 }
160 unicode->str[length] = 0;
161 unicode->length = length;
162
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000163 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000165 if (unicode->defenc) {
166 Py_DECREF(unicode->defenc);
167 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 }
169 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000170
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 return 0;
172}
173
174/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000175 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176
177 XXX This allocator could further be enhanced by assuring that the
178 free list never reduces its size below 1.
179
180*/
181
182static
183PyUnicodeObject *_PyUnicode_New(int length)
184{
185 register PyUnicodeObject *unicode;
186
Tim Petersced69f82003-09-16 20:30:58 +0000187 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 if (length == 0 && unicode_empty != NULL) {
189 Py_INCREF(unicode_empty);
190 return unicode_empty;
191 }
192
193 /* Unicode freelist & memory allocation */
194 if (unicode_freelist) {
195 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000196 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000197 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000199 /* Keep-Alive optimization: we only upsize the buffer,
200 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000201 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000202 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000203 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 }
206 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000207 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000209 }
210 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 }
212 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000213 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 if (unicode == NULL)
215 return NULL;
216 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
217 }
218
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 if (!unicode->str) {
220 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000221 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000222 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000223 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000224 * the caller fails before initializing str -- unicode_resize()
225 * reads str[0], and the Keep-Alive optimization can keep memory
226 * allocated for str alive across a call to unicode_dealloc(unicode).
227 * We don't want unicode_resize to read uninitialized memory in
228 * that case.
229 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000230 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 unicode->str[length] = 0;
232 unicode->length = length;
233 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000234 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000235 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236
237 onError:
238 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000239 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241}
242
243static
Guido van Rossum9475a232001-10-05 20:51:39 +0000244void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000246 if (PyUnicode_CheckExact(unicode) &&
247 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization */
249 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode->str = NULL;
252 unicode->length = 0;
253 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000254 if (unicode->defenc) {
255 Py_DECREF(unicode->defenc);
256 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 }
258 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 *(PyUnicodeObject **)unicode = unicode_freelist;
260 unicode_freelist = unicode;
261 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000264 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000266 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268}
269
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000270int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000271{
272 register PyUnicodeObject *v;
273
274 /* Argument checks */
275 if (unicode == NULL) {
276 PyErr_BadInternalCall();
277 return -1;
278 }
279 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000280 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281 PyErr_BadInternalCall();
282 return -1;
283 }
284
285 /* Resizing unicode_empty and single character objects is not
286 possible since these are being shared. We simply return a fresh
287 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000288 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000289 (v == unicode_empty || v->length == 1)) {
290 PyUnicodeObject *w = _PyUnicode_New(length);
291 if (w == NULL)
292 return -1;
293 Py_UNICODE_COPY(w->str, v->str,
294 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000295 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 *unicode = (PyObject *)w;
297 return 0;
298 }
299
300 /* Note that we don't have to modify *unicode for unshared Unicode
301 objects, since we can modify them in-place. */
302 return unicode_resize(v, length);
303}
304
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting the address of a Unicode
   object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                       \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
308
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
310 int size)
311{
312 PyUnicodeObject *unicode;
313
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 /* If the Unicode data is known at construction time, we can apply
315 some optimizations which share commonly used objects. */
316 if (u != NULL) {
317
318 /* Optimization for empty strings */
319 if (size == 0 && unicode_empty != NULL) {
320 Py_INCREF(unicode_empty);
321 return (PyObject *)unicode_empty;
322 }
323
324 /* Single character Unicode objects in the Latin-1 range are
325 shared when using this constructor */
326 if (size == 1 && *u < 256) {
327 unicode = unicode_latin1[*u];
328 if (!unicode) {
329 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 if (!unicode)
331 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000332 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000333 unicode_latin1[*u] = unicode;
334 }
335 Py_INCREF(unicode);
336 return (PyObject *)unicode;
337 }
338 }
Tim Petersced69f82003-09-16 20:30:58 +0000339
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 unicode = _PyUnicode_New(size);
341 if (!unicode)
342 return NULL;
343
344 /* Copy the Unicode data into the new object */
345 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347
348 return (PyObject *)unicode;
349}
350
351#ifdef HAVE_WCHAR_H
352
353PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
354 int size)
355{
356 PyUnicodeObject *unicode;
357
358 if (w == NULL) {
359 PyErr_BadInternalCall();
360 return NULL;
361 }
362
363 unicode = _PyUnicode_New(size);
364 if (!unicode)
365 return NULL;
366
367 /* Copy the wchar_t data into the new object */
368#ifdef HAVE_USABLE_WCHAR_T
369 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000370#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 {
372 register Py_UNICODE *u;
373 register int i;
374 u = PyUnicode_AS_UNICODE(unicode);
375 for (i = size; i >= 0; i--)
376 *u++ = *w++;
377 }
378#endif
379
380 return (PyObject *)unicode;
381}
382
383int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
384 register wchar_t *w,
385 int size)
386{
387 if (unicode == NULL) {
388 PyErr_BadInternalCall();
389 return -1;
390 }
391 if (size > PyUnicode_GET_SIZE(unicode))
392 size = PyUnicode_GET_SIZE(unicode);
393#ifdef HAVE_USABLE_WCHAR_T
394 memcpy(w, unicode->str, size * sizeof(wchar_t));
395#else
396 {
397 register Py_UNICODE *u;
398 register int i;
399 u = PyUnicode_AS_UNICODE(unicode);
400 for (i = size; i >= 0; i--)
401 *w++ = *u++;
402 }
403#endif
404
405 return size;
406}
407
408#endif
409
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000410PyObject *PyUnicode_FromOrdinal(int ordinal)
411{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000412 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000413
414#ifdef Py_UNICODE_WIDE
415 if (ordinal < 0 || ordinal > 0x10ffff) {
416 PyErr_SetString(PyExc_ValueError,
417 "unichr() arg not in range(0x110000) "
418 "(wide Python build)");
419 return NULL;
420 }
421#else
422 if (ordinal < 0 || ordinal > 0xffff) {
423 PyErr_SetString(PyExc_ValueError,
424 "unichr() arg not in range(0x10000) "
425 "(narrow Python build)");
426 return NULL;
427 }
428#endif
429
Hye-Shik Chang40574832004-04-06 07:24:51 +0000430 s[0] = (Py_UNICODE)ordinal;
431 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000432}
433
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434PyObject *PyUnicode_FromObject(register PyObject *obj)
435{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000436 /* XXX Perhaps we should make this API an alias of
437 PyObject_Unicode() instead ?! */
438 if (PyUnicode_CheckExact(obj)) {
439 Py_INCREF(obj);
440 return obj;
441 }
442 if (PyUnicode_Check(obj)) {
443 /* For a Unicode subtype that's not a Unicode object,
444 return a true Unicode object with the same data. */
445 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
446 PyUnicode_GET_SIZE(obj));
447 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
449}
450
451PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
452 const char *encoding,
453 const char *errors)
454{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000455 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 if (obj == NULL) {
460 PyErr_BadInternalCall();
461 return NULL;
462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000464#if 0
465 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000466 that no encodings is given and then redirect to
467 PyObject_Unicode() which then applies the additional logic for
468 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000469
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000470 NOTE: This API should really only be used for object which
471 represent *encoded* Unicode !
472
473 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000474 if (PyUnicode_Check(obj)) {
475 if (encoding) {
476 PyErr_SetString(PyExc_TypeError,
477 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000479 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482#else
483 if (PyUnicode_Check(obj)) {
484 PyErr_SetString(PyExc_TypeError,
485 "decoding Unicode is not supported");
486 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000487 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000488#endif
489
490 /* Coerce object */
491 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 s = PyString_AS_STRING(obj);
493 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000495 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
496 /* Overwrite the error message with something more useful in
497 case of a TypeError. */
498 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000499 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000500 "coercing to Unicode: need string or buffer, "
501 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000502 obj->ob_type->tp_name);
503 goto onError;
504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000506 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 if (len == 0) {
508 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 }
Tim Petersced69f82003-09-16 20:30:58 +0000511 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000513
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 return v;
515
516 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518}
519
520PyObject *PyUnicode_Decode(const char *s,
521 int size,
522 const char *encoding,
523 const char *errors)
524{
525 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000526
527 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000528 encoding = PyUnicode_GetDefaultEncoding();
529
530 /* Shortcuts for common default encodings */
531 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000535#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
536 else if (strcmp(encoding, "mbcs") == 0)
537 return PyUnicode_DecodeMBCS(s, size, errors);
538#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000539 else if (strcmp(encoding, "ascii") == 0)
540 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541
542 /* Decode via the codec registry */
543 buffer = PyBuffer_FromMemory((void *)s, size);
544 if (buffer == NULL)
545 goto onError;
546 unicode = PyCodec_Decode(buffer, encoding, errors);
547 if (unicode == NULL)
548 goto onError;
549 if (!PyUnicode_Check(unicode)) {
550 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000551 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552 unicode->ob_type->tp_name);
553 Py_DECREF(unicode);
554 goto onError;
555 }
556 Py_DECREF(buffer);
557 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 onError:
560 Py_XDECREF(buffer);
561 return NULL;
562}
563
564PyObject *PyUnicode_Encode(const Py_UNICODE *s,
565 int size,
566 const char *encoding,
567 const char *errors)
568{
569 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000570
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 unicode = PyUnicode_FromUnicode(s, size);
572 if (unicode == NULL)
573 return NULL;
574 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
575 Py_DECREF(unicode);
576 return v;
577}
578
579PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
580 const char *encoding,
581 const char *errors)
582{
583 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000584
Guido van Rossumd57fd912000-03-10 22:53:23 +0000585 if (!PyUnicode_Check(unicode)) {
586 PyErr_BadArgument();
587 goto onError;
588 }
Fred Drakee4315f52000-05-09 19:53:39 +0000589
Tim Petersced69f82003-09-16 20:30:58 +0000590 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000591 encoding = PyUnicode_GetDefaultEncoding();
592
593 /* Shortcuts for common default encodings */
594 if (errors == NULL) {
595 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000596 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000597 else if (strcmp(encoding, "latin-1") == 0)
598 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000599#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
600 else if (strcmp(encoding, "mbcs") == 0)
601 return PyUnicode_AsMBCSString(unicode);
602#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000603 else if (strcmp(encoding, "ascii") == 0)
604 return PyUnicode_AsASCIIString(unicode);
605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606
607 /* Encode via the codec registry */
608 v = PyCodec_Encode(unicode, encoding, errors);
609 if (v == NULL)
610 goto onError;
611 /* XXX Should we really enforce this ? */
612 if (!PyString_Check(v)) {
613 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000614 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 v->ob_type->tp_name);
616 Py_DECREF(v);
617 goto onError;
618 }
619 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000620
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 onError:
622 return NULL;
623}
624
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000625PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
626 const char *errors)
627{
628 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
629
630 if (v)
631 return v;
632 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
633 if (v && errors == NULL)
634 ((PyUnicodeObject *)unicode)->defenc = v;
635 return v;
636}
637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
639{
640 if (!PyUnicode_Check(unicode)) {
641 PyErr_BadArgument();
642 goto onError;
643 }
644 return PyUnicode_AS_UNICODE(unicode);
645
646 onError:
647 return NULL;
648}
649
650int PyUnicode_GetSize(PyObject *unicode)
651{
652 if (!PyUnicode_Check(unicode)) {
653 PyErr_BadArgument();
654 goto onError;
655 }
656 return PyUnicode_GET_SIZE(unicode);
657
658 onError:
659 return -1;
660}
661
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000662int PyUnicode_GetWidth(PyObject *unicode)
663{
664 const Py_UNICODE *p, *e;
665 int width;
666
667 if (!PyUnicode_Check(unicode)) {
668 PyErr_BadArgument();
669 return -1;
670 }
671
672 p = PyUnicode_AS_UNICODE(unicode);
673 e = p + PyUnicode_GET_SIZE(unicode);
674 for (width = 0; p < e; p++)
675 if (Py_UNICODE_ISWIDE(*p))
676 width += 2;
677 else
678 width++;
679
680 return width;
681}
682
Thomas Wouters78890102000-07-22 19:25:51 +0000683const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000684{
685 return unicode_default_encoding;
686}
687
688int PyUnicode_SetDefaultEncoding(const char *encoding)
689{
690 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000691
Fred Drakee4315f52000-05-09 19:53:39 +0000692 /* Make sure the encoding is valid. As side effect, this also
693 loads the encoding into the codec registry cache. */
694 v = _PyCodec_Lookup(encoding);
695 if (v == NULL)
696 goto onError;
697 Py_DECREF(v);
698 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000699 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000700 sizeof(unicode_default_encoding));
701 return 0;
702
703 onError:
704 return -1;
705}
706
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000707/* error handling callback helper:
708 build arguments, call the callback and check the arguments,
709 if no exception occured, copy the replacement to the output
710 and adjust various state variables.
711 return 0 on success, -1 on error
712*/
713
714static
715int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
716 const char *encoding, const char *reason,
717 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
718 PyObject **output, int *outpos, Py_UNICODE **outptr)
719{
720 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
721
722 PyObject *restuple = NULL;
723 PyObject *repunicode = NULL;
724 int outsize = PyUnicode_GET_SIZE(*output);
725 int requiredsize;
726 int newpos;
727 Py_UNICODE *repptr;
728 int repsize;
729 int res = -1;
730
731 if (*errorHandler == NULL) {
732 *errorHandler = PyCodec_LookupError(errors);
733 if (*errorHandler == NULL)
734 goto onError;
735 }
736
737 if (*exceptionObject == NULL) {
738 *exceptionObject = PyUnicodeDecodeError_Create(
739 encoding, input, insize, *startinpos, *endinpos, reason);
740 if (*exceptionObject == NULL)
741 goto onError;
742 }
743 else {
744 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
745 goto onError;
746 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
747 goto onError;
748 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
749 goto onError;
750 }
751
752 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
753 if (restuple == NULL)
754 goto onError;
755 if (!PyTuple_Check(restuple)) {
756 PyErr_Format(PyExc_TypeError, &argparse[4]);
757 goto onError;
758 }
759 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
760 goto onError;
761 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000762 newpos = insize+newpos;
763 if (newpos<0 || newpos>insize) {
764 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
765 goto onError;
766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000767
768 /* need more space? (at least enough for what we
769 have+the replacement+the rest of the string (starting
770 at the new input position), so we won't have to check space
771 when there are no errors in the rest of the string) */
772 repptr = PyUnicode_AS_UNICODE(repunicode);
773 repsize = PyUnicode_GET_SIZE(repunicode);
774 requiredsize = *outpos + repsize + insize-newpos;
775 if (requiredsize > outsize) {
776 if (requiredsize<2*outsize)
777 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000778 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000779 goto onError;
780 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
781 }
782 *endinpos = newpos;
783 *inptr = input + newpos;
784 Py_UNICODE_COPY(*outptr, repptr, repsize);
785 *outptr += repsize;
786 *outpos += repsize;
787 /* we made it! */
788 res = 0;
789
790 onError:
791 Py_XDECREF(restuple);
792 return res;
793}
794
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000795/* --- UTF-7 Codec -------------------------------------------------------- */
796
797/* see RFC2152 for details */
798
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is indexed by character code and is only ever read, hence
   const.  Entry values:
       0 - not special, may be written directly
       1 - special, cannot be written directly
       2 - whitespace (special only when whitespace encoding is requested)
       3 - RFC2152 Set O (special only when Set O encoding is requested) */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
817
/* Helper macros for the UTF-7 codec.  They are #undef'ed again after the
   codec functions. */

/* Nonzero if character c has to be base64-encoded instead of being written
   directly.  Anything outside 0..127 is always special; the comparison is
   done on an unsigned value so that a negative c (possible when the
   character type is signed) can never be used as a table index.  encodeO
   and encodeWS select whether table classes 3 and 2 count as special. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((unsigned long)(c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* Nonzero if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (or EOF) and is locale dependent above
   127, so it is only consulted for ASCII values. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(): value of base64 digit c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit as many complete base64 digits as the bit buffer ch (holding
   `bits` valid low bits) allows; updates out and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract as many complete 16-bit units as the bit buffer ch (holding
   `bits` valid low bits) allows and append them to out.  Relies on the
   enclosing function providing an `errmsg` variable and a `utf7Error`
   label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
852
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000853PyObject *PyUnicode_DecodeUTF7(const char *s,
854 int size,
855 const char *errors)
856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857 const char *starts = s;
858 int startinpos;
859 int endinpos;
860 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861 const char *e;
862 PyUnicodeObject *unicode;
863 Py_UNICODE *p;
864 const char *errmsg = "";
865 int inShift = 0;
866 unsigned int bitsleft = 0;
867 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000868 int surrogate = 0;
869 PyObject *errorHandler = NULL;
870 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000871
872 unicode = _PyUnicode_New(size);
873 if (!unicode)
874 return NULL;
875 if (size == 0)
876 return (PyObject *)unicode;
877
878 p = unicode->str;
879 e = s + size;
880
881 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000882 Py_UNICODE ch;
883 restart:
884 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000885
886 if (inShift) {
887 if ((ch == '-') || !B64CHAR(ch)) {
888 inShift = 0;
889 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000890
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000891 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
892 if (bitsleft >= 6) {
893 /* The shift sequence has a partial character in it. If
894 bitsleft < 6 then we could just classify it as padding
895 but that is not the case here */
896
897 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000898 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000899 }
900 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000901 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000902 here so indicate the potential of a misencoded character. */
903
904 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
905 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
906 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 }
909
910 if (ch == '-') {
911 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000912 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913 inShift = 1;
914 }
915 } else if (SPECIAL(ch,0,0)) {
916 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000917 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000918 } else {
919 *p++ = ch;
920 }
921 } else {
922 charsleft = (charsleft << 6) | UB64(ch);
923 bitsleft += 6;
924 s++;
925 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
926 }
927 }
928 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 s++;
931 if (s < e && *s == '-') {
932 s++;
933 *p++ = '+';
934 } else
935 {
936 inShift = 1;
937 bitsleft = 0;
938 }
939 }
940 else if (SPECIAL(ch,0,0)) {
941 errmsg = "unexpected special character";
942 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000943 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 }
945 else {
946 *p++ = ch;
947 s++;
948 }
949 continue;
950 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 outpos = p-PyUnicode_AS_UNICODE(unicode);
952 endinpos = s-starts;
953 if (unicode_decode_call_errorhandler(
954 errors, &errorHandler,
955 "utf7", errmsg,
956 starts, size, &startinpos, &endinpos, &exc, &s,
957 (PyObject **)&unicode, &outpos, &p))
958 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 }
960
961 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000962 outpos = p-PyUnicode_AS_UNICODE(unicode);
963 endinpos = size;
964 if (unicode_decode_call_errorhandler(
965 errors, &errorHandler,
966 "utf7", "unterminated shift sequence",
967 starts, size, &startinpos, &endinpos, &exc, &s,
968 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000970 if (s < e)
971 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000972 }
973
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000974 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000975 goto onError;
976
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000977 Py_XDECREF(errorHandler);
978 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000979 return (PyObject *)unicode;
980
981onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000982 Py_XDECREF(errorHandler);
983 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000984 Py_DECREF(unicode);
985 return NULL;
986}
987
988
989PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
990 int size,
991 int encodeSetO,
992 int encodeWhiteSpace,
993 const char *errors)
994{
995 PyObject *v;
996 /* It might be possible to tighten this worst case */
997 unsigned int cbAllocated = 5 * size;
998 int inShift = 0;
999 int i = 0;
1000 unsigned int bitsleft = 0;
1001 unsigned long charsleft = 0;
1002 char * out;
1003 char * start;
1004
1005 if (size == 0)
1006 return PyString_FromStringAndSize(NULL, 0);
1007
1008 v = PyString_FromStringAndSize(NULL, cbAllocated);
1009 if (v == NULL)
1010 return NULL;
1011
1012 start = out = PyString_AS_STRING(v);
1013 for (;i < size; ++i) {
1014 Py_UNICODE ch = s[i];
1015
1016 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001017 if (ch == '+') {
1018 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 *out++ = '-';
1020 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 charsleft = ch;
1022 bitsleft = 16;
1023 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001024 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001025 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001026 } else {
1027 *out++ = (char) ch;
1028 }
1029 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001030 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1031 *out++ = B64(charsleft << (6-bitsleft));
1032 charsleft = 0;
1033 bitsleft = 0;
1034 /* Characters not in the BASE64 set implicitly unshift the sequence
1035 so no '-' is required, except if the character is itself a '-' */
1036 if (B64CHAR(ch) || ch == '-') {
1037 *out++ = '-';
1038 }
1039 inShift = 0;
1040 *out++ = (char) ch;
1041 } else {
1042 bitsleft += 16;
1043 charsleft = (charsleft << 16) | ch;
1044 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1045
1046 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001047 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 or '-' then the shift sequence will be terminated implicitly and we
1049 don't have to insert a '-'. */
1050
1051 if (bitsleft == 0) {
1052 if (i + 1 < size) {
1053 Py_UNICODE ch2 = s[i+1];
1054
1055 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001056
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 } else if (B64CHAR(ch2) || ch2 == '-') {
1058 *out++ = '-';
1059 inShift = 0;
1060 } else {
1061 inShift = 0;
1062 }
1063
1064 }
1065 else {
1066 *out++ = '-';
1067 inShift = 0;
1068 }
1069 }
Tim Petersced69f82003-09-16 20:30:58 +00001070 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001071 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001072 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 if (bitsleft) {
1074 *out++= B64(charsleft << (6-bitsleft) );
1075 *out++ = '-';
1076 }
1077
Tim Peters5de98422002-04-27 18:44:32 +00001078 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001079 return v;
1080}
1081
1082#undef SPECIAL
1083#undef B64
1084#undef B64CHAR
1085#undef UB64
1086#undef ENCODE
1087#undef DECODE
1088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089/* --- UTF-8 Codec -------------------------------------------------------- */
1090
/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it introduces.  Zero means illegal prefix; see RFC 2279 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113PyObject *PyUnicode_DecodeUTF8(const char *s,
1114 int size,
1115 const char *errors)
1116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001117 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001119 int startinpos;
1120 int endinpos;
1121 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 const char *e;
1123 PyUnicodeObject *unicode;
1124 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001125 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001126 PyObject *errorHandler = NULL;
1127 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128
1129 /* Note: size will always be longer than the resulting Unicode
1130 character count */
1131 unicode = _PyUnicode_New(size);
1132 if (!unicode)
1133 return NULL;
1134 if (size == 0)
1135 return (PyObject *)unicode;
1136
1137 /* Unpack UTF-8 encoded data */
1138 p = unicode->str;
1139 e = s + size;
1140
1141 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001142 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143
1144 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001145 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146 s++;
1147 continue;
1148 }
1149
1150 n = utf8_code_length[ch];
1151
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 if (s + n > e) {
1153 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001154 startinpos = s-starts;
1155 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001156 goto utf8Error;
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158
1159 switch (n) {
1160
1161 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 startinpos = s-starts;
1164 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001165 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166
1167 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001169 startinpos = s-starts;
1170 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172
1173 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 if ((s[1] & 0xc0) != 0x80) {
1175 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001176 startinpos = s-starts;
1177 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 goto utf8Error;
1179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001181 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001182 startinpos = s-starts;
1183 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001184 errmsg = "illegal encoding";
1185 goto utf8Error;
1186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001188 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 break;
1190
1191 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001192 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 (s[2] & 0xc0) != 0x80) {
1194 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001195 startinpos = s-starts;
1196 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001197 goto utf8Error;
1198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001200 if (ch < 0x0800) {
1201 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001202 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001203
1204 XXX For wide builds (UCS-4) we should probably try
1205 to recombine the surrogates into a single code
1206 unit.
1207 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001209 startinpos = s-starts;
1210 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 goto utf8Error;
1212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001214 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001215 break;
1216
1217 case 4:
1218 if ((s[1] & 0xc0) != 0x80 ||
1219 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001220 (s[3] & 0xc0) != 0x80) {
1221 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001222 startinpos = s-starts;
1223 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 goto utf8Error;
1225 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001226 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1227 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1228 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001229 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001230 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001231 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001232 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001233 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001235 startinpos = s-starts;
1236 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001237 goto utf8Error;
1238 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001239#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001240 *p++ = (Py_UNICODE)ch;
1241#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001243
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001244 /* translate from 10000..10FFFF to 0..FFFF */
1245 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001246
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 /* high surrogate = top 10 bits added to D800 */
1248 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001249
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001250 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001251 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001252#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 break;
1254
1255 default:
1256 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001258 startinpos = s-starts;
1259 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001260 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 }
1262 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001264
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001266 outpos = p-PyUnicode_AS_UNICODE(unicode);
1267 if (unicode_decode_call_errorhandler(
1268 errors, &errorHandler,
1269 "utf8", errmsg,
1270 starts, size, &startinpos, &endinpos, &exc, &s,
1271 (PyObject **)&unicode, &outpos, &p))
1272 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 }
1274
1275 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001276 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 goto onError;
1278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001279 Py_XDECREF(errorHandler);
1280 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 return (PyObject *)unicode;
1282
1283onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 Py_XDECREF(errorHandler);
1285 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 Py_DECREF(unicode);
1287 return NULL;
1288}
1289
Tim Peters602f7402002-04-27 18:03:26 +00001290/* Allocation strategy: if the string is short, convert into a stack buffer
1291 and allocate exactly as much space needed at the end. Else allocate the
1292 maximum possible needed (4 result bytes per Unicode character), and return
1293 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001294*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001295PyObject *
1296PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1297 int size,
1298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299{
Tim Peters602f7402002-04-27 18:03:26 +00001300#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 int i; /* index into s of next input byte */
1303 PyObject *v; /* result string object */
1304 char *p; /* next free byte in output buffer */
1305 int nallocated; /* number of result bytes allocated */
1306 int nneeded; /* number of result bytes needed */
1307 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001308
Tim Peters602f7402002-04-27 18:03:26 +00001309 assert(s != NULL);
1310 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311
Tim Peters602f7402002-04-27 18:03:26 +00001312 if (size <= MAX_SHORT_UNICHARS) {
1313 /* Write into the stack buffer; nallocated can't overflow.
1314 * At the end, we'll allocate exactly as much heap space as it
1315 * turns out we need.
1316 */
1317 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1318 v = NULL; /* will allocate after we're done */
1319 p = stackbuf;
1320 }
1321 else {
1322 /* Overallocate on the heap, and give the excess back at the end. */
1323 nallocated = size * 4;
1324 if (nallocated / 4 != size) /* overflow! */
1325 return PyErr_NoMemory();
1326 v = PyString_FromStringAndSize(NULL, nallocated);
1327 if (v == NULL)
1328 return NULL;
1329 p = PyString_AS_STRING(v);
1330 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001331
Tim Peters602f7402002-04-27 18:03:26 +00001332 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001335 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001336 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001338
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001340 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001341 *p++ = (char)(0xc0 | (ch >> 6));
1342 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001343 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001344 else {
Tim Peters602f7402002-04-27 18:03:26 +00001345 /* Encode UCS2 Unicode ordinals */
1346 if (ch < 0x10000) {
1347 /* Special case: check for high surrogate */
1348 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1349 Py_UCS4 ch2 = s[i];
1350 /* Check for low surrogate and combine the two to
1351 form a UCS4 value */
1352 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001353 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001354 i++;
1355 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 }
Tim Peters602f7402002-04-27 18:03:26 +00001357 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001358 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001360 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1361 *p++ = (char)(0x80 | (ch & 0x3f));
1362 continue;
1363 }
1364encodeUCS4:
1365 /* Encode UCS4 Unicode ordinals */
1366 *p++ = (char)(0xf0 | (ch >> 18));
1367 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1368 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1369 *p++ = (char)(0x80 | (ch & 0x3f));
1370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001372
Tim Peters602f7402002-04-27 18:03:26 +00001373 if (v == NULL) {
1374 /* This was stack allocated. */
1375 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1376 assert(nneeded <= nallocated);
1377 v = PyString_FromStringAndSize(stackbuf, nneeded);
1378 }
1379 else {
1380 /* Cut back to size actually needed. */
1381 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1382 assert(nneeded <= nallocated);
1383 _PyString_Resize(&v, nneeded);
1384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001386
Tim Peters602f7402002-04-27 18:03:26 +00001387#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388}
1389
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1391{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 if (!PyUnicode_Check(unicode)) {
1393 PyErr_BadArgument();
1394 return NULL;
1395 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001396 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1397 PyUnicode_GET_SIZE(unicode),
1398 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399}
1400
1401/* --- UTF-16 Codec ------------------------------------------------------- */
1402
Tim Peters772747b2001-08-09 22:21:55 +00001403PyObject *
1404PyUnicode_DecodeUTF16(const char *s,
1405 int size,
1406 const char *errors,
1407 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 const char *starts = s;
1410 int startinpos;
1411 int endinpos;
1412 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413 PyUnicodeObject *unicode;
1414 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001415 const unsigned char *q, *e;
1416 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001417 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001418 /* Offsets from q for retrieving byte pairs in the right order. */
1419#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1420 int ihi = 1, ilo = 0;
1421#else
1422 int ihi = 0, ilo = 1;
1423#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001424 PyObject *errorHandler = NULL;
1425 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426
1427 /* Note: size will always be longer than the resulting Unicode
1428 character count */
1429 unicode = _PyUnicode_New(size);
1430 if (!unicode)
1431 return NULL;
1432 if (size == 0)
1433 return (PyObject *)unicode;
1434
1435 /* Unpack UTF-16 encoded data */
1436 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001437 q = (unsigned char *)s;
1438 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439
1440 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001441 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001443 /* Check for BOM marks (U+FEFF) in the input and adjust current
1444 byte order setting accordingly. In native mode, the leading BOM
1445 mark is skipped, in all other modes, it is copied to the output
1446 stream as-is (giving a ZWNBSP character). */
1447 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001448 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001450 if (bom == 0xFEFF) {
1451 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001452 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001453 }
1454 else if (bom == 0xFFFE) {
1455 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001456 bo = 1;
1457 }
Tim Petersced69f82003-09-16 20:30:58 +00001458#else
Tim Peters772747b2001-08-09 22:21:55 +00001459 if (bom == 0xFEFF) {
1460 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001461 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001462 }
1463 else if (bom == 0xFFFE) {
1464 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001465 bo = -1;
1466 }
1467#endif
1468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469
Tim Peters772747b2001-08-09 22:21:55 +00001470 if (bo == -1) {
1471 /* force LE */
1472 ihi = 1;
1473 ilo = 0;
1474 }
1475 else if (bo == 1) {
1476 /* force BE */
1477 ihi = 0;
1478 ilo = 1;
1479 }
1480
1481 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 Py_UNICODE ch;
1483 /* remaing bytes at the end? (size should be even) */
1484 if (e-q<2) {
1485 errmsg = "truncated data";
1486 startinpos = ((const char *)q)-starts;
1487 endinpos = ((const char *)e)-starts;
1488 goto utf16Error;
1489 /* The remaining input chars are ignored if the callback
1490 chooses to skip the input */
1491 }
1492 ch = (q[ihi] << 8) | q[ilo];
1493
Tim Peters772747b2001-08-09 22:21:55 +00001494 q += 2;
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 if (ch < 0xD800 || ch > 0xDFFF) {
1497 *p++ = ch;
1498 continue;
1499 }
1500
1501 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001502 if (q >= e) {
1503 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 startinpos = (((const char *)q)-2)-starts;
1505 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001506 goto utf16Error;
1507 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001508 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001509 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1510 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001511 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001512#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001513 *p++ = ch;
1514 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001515#else
1516 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001517#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001518 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001519 }
1520 else {
1521 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001522 startinpos = (((const char *)q)-4)-starts;
1523 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001524 goto utf16Error;
1525 }
1526
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001528 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 startinpos = (((const char *)q)-2)-starts;
1530 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001531 /* Fall through to report the error */
1532
1533 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 outpos = p-PyUnicode_AS_UNICODE(unicode);
1535 if (unicode_decode_call_errorhandler(
1536 errors, &errorHandler,
1537 "utf16", errmsg,
1538 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1539 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
1542
1543 if (byteorder)
1544 *byteorder = bo;
1545
1546 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001547 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 goto onError;
1549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 Py_XDECREF(errorHandler);
1551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552 return (PyObject *)unicode;
1553
1554onError:
1555 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 Py_XDECREF(errorHandler);
1557 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 return NULL;
1559}
1560
Tim Peters772747b2001-08-09 22:21:55 +00001561PyObject *
1562PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1563 int size,
1564 const char *errors,
1565 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566{
1567 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001568 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001569#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001570 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001571#else
1572 const int pairs = 0;
1573#endif
Tim Peters772747b2001-08-09 22:21:55 +00001574 /* Offsets from p for storing byte pairs in the right order. */
1575#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1576 int ihi = 1, ilo = 0;
1577#else
1578 int ihi = 0, ilo = 1;
1579#endif
1580
1581#define STORECHAR(CH) \
1582 do { \
1583 p[ihi] = ((CH) >> 8) & 0xff; \
1584 p[ilo] = (CH) & 0xff; \
1585 p += 2; \
1586 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001588#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001589 for (i = pairs = 0; i < size; i++)
1590 if (s[i] >= 0x10000)
1591 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001592#endif
Tim Petersced69f82003-09-16 20:30:58 +00001593 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001594 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 if (v == NULL)
1596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597
Tim Peters772747b2001-08-09 22:21:55 +00001598 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001600 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001601 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001602 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001603
1604 if (byteorder == -1) {
1605 /* force LE */
1606 ihi = 1;
1607 ilo = 0;
1608 }
1609 else if (byteorder == 1) {
1610 /* force BE */
1611 ihi = 0;
1612 ilo = 1;
1613 }
1614
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001615 while (size-- > 0) {
1616 Py_UNICODE ch = *s++;
1617 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001618#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001619 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001620 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1621 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001623#endif
Tim Peters772747b2001-08-09 22:21:55 +00001624 STORECHAR(ch);
1625 if (ch2)
1626 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001629#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630}
1631
1632PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1633{
1634 if (!PyUnicode_Check(unicode)) {
1635 PyErr_BadArgument();
1636 return NULL;
1637 }
1638 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1639 PyUnicode_GET_SIZE(unicode),
1640 NULL,
1641 0);
1642}
1643
1644/* --- Unicode Escape Codec ----------------------------------------------- */
1645
Fredrik Lundh06d12682001-01-24 07:59:11 +00001646static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001647
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1649 int size,
1650 const char *errors)
1651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 const char *starts = s;
1653 int startinpos;
1654 int endinpos;
1655 int outpos;
1656 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660 char* message;
1661 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 PyObject *errorHandler = NULL;
1663 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001664
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 /* Escaped strings will always be longer than the resulting
1666 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 length after conversion to the true value.
1668 (but if the error callback returns a long replacement string
1669 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 v = _PyUnicode_New(size);
1671 if (v == NULL)
1672 goto onError;
1673 if (size == 0)
1674 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001678
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 while (s < end) {
1680 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001681 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683
1684 /* Non-escape characters are interpreted as Unicode ordinals */
1685 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 continue;
1688 }
1689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001690 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691 /* \ - Escapes */
1692 s++;
1693 switch (*s++) {
1694
1695 /* \x escapes */
1696 case '\n': break;
1697 case '\\': *p++ = '\\'; break;
1698 case '\'': *p++ = '\''; break;
1699 case '\"': *p++ = '\"'; break;
1700 case 'b': *p++ = '\b'; break;
1701 case 'f': *p++ = '\014'; break; /* FF */
1702 case 't': *p++ = '\t'; break;
1703 case 'n': *p++ = '\n'; break;
1704 case 'r': *p++ = '\r'; break;
1705 case 'v': *p++ = '\013'; break; /* VT */
1706 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1707
1708 /* \OOO (octal) escapes */
1709 case '0': case '1': case '2': case '3':
1710 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001711 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001713 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001715 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001717 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 break;
1719
Fredrik Lundhccc74732001-02-18 22:13:49 +00001720 /* hex escapes */
1721 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001723 digits = 2;
1724 message = "truncated \\xXX escape";
1725 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
Fredrik Lundhccc74732001-02-18 22:13:49 +00001727 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001729 digits = 4;
1730 message = "truncated \\uXXXX escape";
1731 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001735 digits = 8;
1736 message = "truncated \\UXXXXXXXX escape";
1737 hexescape:
1738 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 outpos = p-PyUnicode_AS_UNICODE(v);
1740 if (s+digits>end) {
1741 endinpos = size;
1742 if (unicode_decode_call_errorhandler(
1743 errors, &errorHandler,
1744 "unicodeescape", "end of string in escape sequence",
1745 starts, size, &startinpos, &endinpos, &exc, &s,
1746 (PyObject **)&v, &outpos, &p))
1747 goto onError;
1748 goto nextByte;
1749 }
1750 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001751 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001752 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 endinpos = (s+i+1)-starts;
1754 if (unicode_decode_call_errorhandler(
1755 errors, &errorHandler,
1756 "unicodeescape", message,
1757 starts, size, &startinpos, &endinpos, &exc, &s,
1758 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001759 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001761 }
1762 chr = (chr<<4) & ~0xF;
1763 if (c >= '0' && c <= '9')
1764 chr += c - '0';
1765 else if (c >= 'a' && c <= 'f')
1766 chr += 10 + c - 'a';
1767 else
1768 chr += 10 + c - 'A';
1769 }
1770 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001771 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 /* _decoding_error will have already written into the
1773 target buffer. */
1774 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001775 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001776 /* when we get here, chr is a 32-bit unicode character */
1777 if (chr <= 0xffff)
1778 /* UCS-2 character */
1779 *p++ = (Py_UNICODE) chr;
1780 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001781 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001782 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001783#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001784 *p++ = chr;
1785#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001786 chr -= 0x10000L;
1787 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001788 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001789#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001790 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 endinpos = s-starts;
1792 outpos = p-PyUnicode_AS_UNICODE(v);
1793 if (unicode_decode_call_errorhandler(
1794 errors, &errorHandler,
1795 "unicodeescape", "illegal Unicode character",
1796 starts, size, &startinpos, &endinpos, &exc, &s,
1797 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001798 goto onError;
1799 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800 break;
1801
1802 /* \N{name} */
1803 case 'N':
1804 message = "malformed \\N character escape";
1805 if (ucnhash_CAPI == NULL) {
1806 /* load the unicode data module */
1807 PyObject *m, *v;
1808 m = PyImport_ImportModule("unicodedata");
1809 if (m == NULL)
1810 goto ucnhashError;
1811 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1812 Py_DECREF(m);
1813 if (v == NULL)
1814 goto ucnhashError;
1815 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1816 Py_DECREF(v);
1817 if (ucnhash_CAPI == NULL)
1818 goto ucnhashError;
1819 }
1820 if (*s == '{') {
1821 const char *start = s+1;
1822 /* look for the closing brace */
1823 while (*s != '}' && s < end)
1824 s++;
1825 if (s > start && s < end && *s == '}') {
1826 /* found a name. look it up in the unicode database */
1827 message = "unknown Unicode character name";
1828 s++;
1829 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1830 goto store;
1831 }
1832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 endinpos = s-starts;
1834 outpos = p-PyUnicode_AS_UNICODE(v);
1835 if (unicode_decode_call_errorhandler(
1836 errors, &errorHandler,
1837 "unicodeescape", message,
1838 starts, size, &startinpos, &endinpos, &exc, &s,
1839 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001840 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001841 break;
1842
1843 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001844 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 message = "\\ at end of string";
1846 s--;
1847 endinpos = s-starts;
1848 outpos = p-PyUnicode_AS_UNICODE(v);
1849 if (unicode_decode_call_errorhandler(
1850 errors, &errorHandler,
1851 "unicodeescape", message,
1852 starts, size, &startinpos, &endinpos, &exc, &s,
1853 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001854 goto onError;
1855 }
1856 else {
1857 *p++ = '\\';
1858 *p++ = (unsigned char)s[-1];
1859 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001860 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 nextByte:
1863 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001865 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001867 Py_XDECREF(errorHandler);
1868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001870
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001872 PyErr_SetString(
1873 PyExc_UnicodeError,
1874 "\\N escapes not supported (can't load unicodedata module)"
1875 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 Py_XDECREF(errorHandler);
1877 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001878 return NULL;
1879
Fredrik Lundhccc74732001-02-18 22:13:49 +00001880onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001881 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001882 Py_XDECREF(errorHandler);
1883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 return NULL;
1885}
1886
1887/* Return a Unicode-Escape string version of the Unicode object.
1888
1889 If quotes is true, the string is enclosed in u"" or u'' quotes as
1890 appropriate.
1891
1892*/
1893
Barry Warsaw51ac5802000-03-20 16:36:48 +00001894static const Py_UNICODE *findchar(const Py_UNICODE *s,
1895 int size,
1896 Py_UNICODE ch);
1897
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898static
1899PyObject *unicodeescape_string(const Py_UNICODE *s,
1900 int size,
1901 int quotes)
1902{
1903 PyObject *repr;
1904 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001906 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907
1908 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1909 if (repr == NULL)
1910 return NULL;
1911
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001912 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
1914 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001916 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 !findchar(s, size, '"')) ? '"' : '\'';
1918 }
1919 while (size-- > 0) {
1920 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001923 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001924 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 *p++ = '\\';
1926 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001927 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001928 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001929
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001930#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001931 /* Map 21-bit characters to '\U00xxxxxx' */
1932 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001933 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001934
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001935 /* Resize the string if necessary */
1936 if (offset + 12 > PyString_GET_SIZE(repr)) {
1937 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001938 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001939 p = PyString_AS_STRING(repr) + offset;
1940 }
1941
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001942 *p++ = '\\';
1943 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001944 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1945 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1946 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1947 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1948 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1949 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1950 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001951 *p++ = hexdigit[ch & 0x0000000F];
1952 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001953 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001954#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001955 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1956 else if (ch >= 0xD800 && ch < 0xDC00) {
1957 Py_UNICODE ch2;
1958 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001959
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001960 ch2 = *s++;
1961 size--;
1962 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1963 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1964 *p++ = '\\';
1965 *p++ = 'U';
1966 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1967 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1968 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1969 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1970 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1971 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1972 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1973 *p++ = hexdigit[ucs & 0x0000000F];
1974 continue;
1975 }
1976 /* Fall through: isolated surrogates are copied as-is */
1977 s--;
1978 size++;
1979 }
1980
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001982 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 *p++ = '\\';
1984 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001985 *p++ = hexdigit[(ch >> 12) & 0x000F];
1986 *p++ = hexdigit[(ch >> 8) & 0x000F];
1987 *p++ = hexdigit[(ch >> 4) & 0x000F];
1988 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001991 /* Map special whitespace to '\t', \n', '\r' */
1992 else if (ch == '\t') {
1993 *p++ = '\\';
1994 *p++ = 't';
1995 }
1996 else if (ch == '\n') {
1997 *p++ = '\\';
1998 *p++ = 'n';
1999 }
2000 else if (ch == '\r') {
2001 *p++ = '\\';
2002 *p++ = 'r';
2003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002004
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002005 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002006 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002008 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002009 *p++ = hexdigit[(ch >> 4) & 0x000F];
2010 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002012
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 /* Copy everything else as-is */
2014 else
2015 *p++ = (char) ch;
2016 }
2017 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002018 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002021 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 return repr;
2023}
2024
2025PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2026 int size)
2027{
2028 return unicodeescape_string(s, size, 0);
2029}
2030
2031PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2032{
2033 if (!PyUnicode_Check(unicode)) {
2034 PyErr_BadArgument();
2035 return NULL;
2036 }
2037 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2038 PyUnicode_GET_SIZE(unicode));
2039}
2040
2041/* --- Raw Unicode Escape Codec ------------------------------------------- */
2042
2043PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2044 int size,
2045 const char *errors)
2046{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 const char *starts = s;
2048 int startinpos;
2049 int endinpos;
2050 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002052 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 const char *end;
2054 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002055 PyObject *errorHandler = NULL;
2056 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002057
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 /* Escaped strings will always be longer than the resulting
2059 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 length after conversion to the true value. (But decoding error
2061 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 v = _PyUnicode_New(size);
2063 if (v == NULL)
2064 goto onError;
2065 if (size == 0)
2066 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 end = s + size;
2069 while (s < end) {
2070 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002071 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002073 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074
2075 /* Non-escape characters are interpreted as Unicode ordinals */
2076 if (*s != '\\') {
2077 *p++ = (unsigned char)*s++;
2078 continue;
2079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081
2082 /* \u-escapes are only interpreted iff the number of leading
2083 backslashes if odd */
2084 bs = s;
2085 for (;s < end;) {
2086 if (*s != '\\')
2087 break;
2088 *p++ = (unsigned char)*s++;
2089 }
2090 if (((s - bs) & 1) == 0 ||
2091 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002092 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 continue;
2094 }
2095 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002096 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 s++;
2098
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002099 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002101 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002104 endinpos = s-starts;
2105 if (unicode_decode_call_errorhandler(
2106 errors, &errorHandler,
2107 "rawunicodeescape", "truncated \\uXXXX",
2108 starts, size, &startinpos, &endinpos, &exc, &s,
2109 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 }
2113 x = (x<<4) & ~0xF;
2114 if (c >= '0' && c <= '9')
2115 x += c - '0';
2116 else if (c >= 'a' && c <= 'f')
2117 x += 10 + c - 'a';
2118 else
2119 x += 10 + c - 'A';
2120 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002121#ifndef Py_UNICODE_WIDE
2122 if (x > 0x10000) {
2123 if (unicode_decode_call_errorhandler(
2124 errors, &errorHandler,
2125 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2126 starts, size, &startinpos, &endinpos, &exc, &s,
2127 (PyObject **)&v, &outpos, &p))
2128 goto onError;
2129 }
2130#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 *p++ = x;
2132 nextByte:
2133 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002135 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002136 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 Py_XDECREF(errorHandler);
2138 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002140
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 onError:
2142 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 Py_XDECREF(errorHandler);
2144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 return NULL;
2146}
2147
2148PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2149 int size)
2150{
2151 PyObject *repr;
2152 char *p;
2153 char *q;
2154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002155 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002157#ifdef Py_UNICODE_WIDE
2158 repr = PyString_FromStringAndSize(NULL, 10 * size);
2159#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002161#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 if (repr == NULL)
2163 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002164 if (size == 0)
2165 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166
2167 p = q = PyString_AS_STRING(repr);
2168 while (size-- > 0) {
2169 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002170#ifdef Py_UNICODE_WIDE
2171 /* Map 32-bit characters to '\Uxxxxxxxx' */
2172 if (ch >= 0x10000) {
2173 *p++ = '\\';
2174 *p++ = 'U';
2175 *p++ = hexdigit[(ch >> 28) & 0xf];
2176 *p++ = hexdigit[(ch >> 24) & 0xf];
2177 *p++ = hexdigit[(ch >> 20) & 0xf];
2178 *p++ = hexdigit[(ch >> 16) & 0xf];
2179 *p++ = hexdigit[(ch >> 12) & 0xf];
2180 *p++ = hexdigit[(ch >> 8) & 0xf];
2181 *p++ = hexdigit[(ch >> 4) & 0xf];
2182 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002183 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002184 else
2185#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 /* Map 16-bit characters to '\uxxxx' */
2187 if (ch >= 256) {
2188 *p++ = '\\';
2189 *p++ = 'u';
2190 *p++ = hexdigit[(ch >> 12) & 0xf];
2191 *p++ = hexdigit[(ch >> 8) & 0xf];
2192 *p++ = hexdigit[(ch >> 4) & 0xf];
2193 *p++ = hexdigit[ch & 15];
2194 }
2195 /* Copy everything else as-is */
2196 else
2197 *p++ = (char) ch;
2198 }
2199 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002200 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 return repr;
2202}
2203
2204PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2205{
2206 if (!PyUnicode_Check(unicode)) {
2207 PyErr_BadArgument();
2208 return NULL;
2209 }
2210 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2211 PyUnicode_GET_SIZE(unicode));
2212}
2213
2214/* --- Latin-1 Codec ------------------------------------------------------ */
2215
2216PyObject *PyUnicode_DecodeLatin1(const char *s,
2217 int size,
2218 const char *errors)
2219{
2220 PyUnicodeObject *v;
2221 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002222
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002224 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002225 Py_UNICODE r = *(unsigned char*)s;
2226 return PyUnicode_FromUnicode(&r, 1);
2227 }
2228
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 v = _PyUnicode_New(size);
2230 if (v == NULL)
2231 goto onError;
2232 if (size == 0)
2233 return (PyObject *)v;
2234 p = PyUnicode_AS_UNICODE(v);
2235 while (size-- > 0)
2236 *p++ = (unsigned char)*s++;
2237 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002238
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 onError:
2240 Py_XDECREF(v);
2241 return NULL;
2242}
2243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244/* create or adjust a UnicodeEncodeError */
2245static void make_encode_exception(PyObject **exceptionObject,
2246 const char *encoding,
2247 const Py_UNICODE *unicode, int size,
2248 int startpos, int endpos,
2249 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 if (*exceptionObject == NULL) {
2252 *exceptionObject = PyUnicodeEncodeError_Create(
2253 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 }
2255 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2257 goto onError;
2258 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2259 goto onError;
2260 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2261 goto onError;
2262 return;
2263 onError:
2264 Py_DECREF(*exceptionObject);
2265 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 }
2267}
2268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002269/* raises a UnicodeEncodeError */
2270static void raise_encode_exception(PyObject **exceptionObject,
2271 const char *encoding,
2272 const Py_UNICODE *unicode, int size,
2273 int startpos, int endpos,
2274 const char *reason)
2275{
2276 make_encode_exception(exceptionObject,
2277 encoding, unicode, size, startpos, endpos, reason);
2278 if (*exceptionObject != NULL)
2279 PyCodec_StrictErrors(*exceptionObject);
2280}
2281
2282/* error handling callback helper:
2283 build arguments, call the callback and check the arguments,
2284 put the result into newpos and return the replacement string, which
2285 has to be freed by the caller */
2286static PyObject *unicode_encode_call_errorhandler(const char *errors,
2287 PyObject **errorHandler,
2288 const char *encoding, const char *reason,
2289 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2290 int startpos, int endpos,
2291 int *newpos)
2292{
2293 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2294
2295 PyObject *restuple;
2296 PyObject *resunicode;
2297
2298 if (*errorHandler == NULL) {
2299 *errorHandler = PyCodec_LookupError(errors);
2300 if (*errorHandler == NULL)
2301 return NULL;
2302 }
2303
2304 make_encode_exception(exceptionObject,
2305 encoding, unicode, size, startpos, endpos, reason);
2306 if (*exceptionObject == NULL)
2307 return NULL;
2308
2309 restuple = PyObject_CallFunctionObjArgs(
2310 *errorHandler, *exceptionObject, NULL);
2311 if (restuple == NULL)
2312 return NULL;
2313 if (!PyTuple_Check(restuple)) {
2314 PyErr_Format(PyExc_TypeError, &argparse[4]);
2315 Py_DECREF(restuple);
2316 return NULL;
2317 }
2318 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2319 &resunicode, newpos)) {
2320 Py_DECREF(restuple);
2321 return NULL;
2322 }
2323 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002324 *newpos = size+*newpos;
2325 if (*newpos<0 || *newpos>size) {
2326 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2327 Py_DECREF(restuple);
2328 return NULL;
2329 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002330 Py_INCREF(resunicode);
2331 Py_DECREF(restuple);
2332 return resunicode;
2333}
2334
2335static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2336 int size,
2337 const char *errors,
2338 int limit)
2339{
2340 /* output object */
2341 PyObject *res;
2342 /* pointers to the beginning and end+1 of input */
2343 const Py_UNICODE *startp = p;
2344 const Py_UNICODE *endp = p + size;
2345 /* pointer to the beginning of the unencodable characters */
2346 /* const Py_UNICODE *badp = NULL; */
2347 /* pointer into the output */
2348 char *str;
2349 /* current output position */
2350 int respos = 0;
2351 int ressize;
2352 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2353 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2354 PyObject *errorHandler = NULL;
2355 PyObject *exc = NULL;
2356 /* the following variable is used for caching string comparisons
2357 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2358 int known_errorHandler = -1;
2359
2360 /* allocate enough for a simple encoding without
2361 replacements, if we need more, we'll resize */
2362 res = PyString_FromStringAndSize(NULL, size);
2363 if (res == NULL)
2364 goto onError;
2365 if (size == 0)
2366 return res;
2367 str = PyString_AS_STRING(res);
2368 ressize = size;
2369
2370 while (p<endp) {
2371 Py_UNICODE c = *p;
2372
2373 /* can we encode this? */
2374 if (c<limit) {
2375 /* no overflow check, because we know that the space is enough */
2376 *str++ = (char)c;
2377 ++p;
2378 }
2379 else {
2380 int unicodepos = p-startp;
2381 int requiredsize;
2382 PyObject *repunicode;
2383 int repsize;
2384 int newpos;
2385 int respos;
2386 Py_UNICODE *uni2;
2387 /* startpos for collecting unencodable chars */
2388 const Py_UNICODE *collstart = p;
2389 const Py_UNICODE *collend = p;
2390 /* find all unecodable characters */
2391 while ((collend < endp) && ((*collend)>=limit))
2392 ++collend;
2393 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2394 if (known_errorHandler==-1) {
2395 if ((errors==NULL) || (!strcmp(errors, "strict")))
2396 known_errorHandler = 1;
2397 else if (!strcmp(errors, "replace"))
2398 known_errorHandler = 2;
2399 else if (!strcmp(errors, "ignore"))
2400 known_errorHandler = 3;
2401 else if (!strcmp(errors, "xmlcharrefreplace"))
2402 known_errorHandler = 4;
2403 else
2404 known_errorHandler = 0;
2405 }
2406 switch (known_errorHandler) {
2407 case 1: /* strict */
2408 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2409 goto onError;
2410 case 2: /* replace */
2411 while (collstart++<collend)
2412 *str++ = '?'; /* fall through */
2413 case 3: /* ignore */
2414 p = collend;
2415 break;
2416 case 4: /* xmlcharrefreplace */
2417 respos = str-PyString_AS_STRING(res);
2418 /* determine replacement size (temporarily (mis)uses p) */
2419 for (p = collstart, repsize = 0; p < collend; ++p) {
2420 if (*p<10)
2421 repsize += 2+1+1;
2422 else if (*p<100)
2423 repsize += 2+2+1;
2424 else if (*p<1000)
2425 repsize += 2+3+1;
2426 else if (*p<10000)
2427 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002428#ifndef Py_UNICODE_WIDE
2429 else
2430 repsize += 2+5+1;
2431#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432 else if (*p<100000)
2433 repsize += 2+5+1;
2434 else if (*p<1000000)
2435 repsize += 2+6+1;
2436 else
2437 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002438#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002439 }
2440 requiredsize = respos+repsize+(endp-collend);
2441 if (requiredsize > ressize) {
2442 if (requiredsize<2*ressize)
2443 requiredsize = 2*ressize;
2444 if (_PyString_Resize(&res, requiredsize))
2445 goto onError;
2446 str = PyString_AS_STRING(res) + respos;
2447 ressize = requiredsize;
2448 }
2449 /* generate replacement (temporarily (mis)uses p) */
2450 for (p = collstart; p < collend; ++p) {
2451 str += sprintf(str, "&#%d;", (int)*p);
2452 }
2453 p = collend;
2454 break;
2455 default:
2456 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2457 encoding, reason, startp, size, &exc,
2458 collstart-startp, collend-startp, &newpos);
2459 if (repunicode == NULL)
2460 goto onError;
2461 /* need more space? (at least enough for what we
2462 have+the replacement+the rest of the string, so
2463 we won't have to check space for encodable characters) */
2464 respos = str-PyString_AS_STRING(res);
2465 repsize = PyUnicode_GET_SIZE(repunicode);
2466 requiredsize = respos+repsize+(endp-collend);
2467 if (requiredsize > ressize) {
2468 if (requiredsize<2*ressize)
2469 requiredsize = 2*ressize;
2470 if (_PyString_Resize(&res, requiredsize)) {
2471 Py_DECREF(repunicode);
2472 goto onError;
2473 }
2474 str = PyString_AS_STRING(res) + respos;
2475 ressize = requiredsize;
2476 }
2477 /* check if there is anything unencodable in the replacement
2478 and copy it to the output */
2479 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2480 c = *uni2;
2481 if (c >= limit) {
2482 raise_encode_exception(&exc, encoding, startp, size,
2483 unicodepos, unicodepos+1, reason);
2484 Py_DECREF(repunicode);
2485 goto onError;
2486 }
2487 *str = (char)c;
2488 }
2489 p = startp + newpos;
2490 Py_DECREF(repunicode);
2491 }
2492 }
2493 }
2494 /* Resize if we allocated to much */
2495 respos = str-PyString_AS_STRING(res);
2496 if (respos<ressize)
2497 /* If this falls res will be NULL */
2498 _PyString_Resize(&res, respos);
2499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
2501 return res;
2502
2503 onError:
2504 Py_XDECREF(res);
2505 Py_XDECREF(errorHandler);
2506 Py_XDECREF(exc);
2507 return NULL;
2508}
2509
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2511 int size,
2512 const char *errors)
2513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515}
2516
2517PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2518{
2519 if (!PyUnicode_Check(unicode)) {
2520 PyErr_BadArgument();
2521 return NULL;
2522 }
2523 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2524 PyUnicode_GET_SIZE(unicode),
2525 NULL);
2526}
2527
2528/* --- 7-bit ASCII Codec -------------------------------------------------- */
2529
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530PyObject *PyUnicode_DecodeASCII(const char *s,
2531 int size,
2532 const char *errors)
2533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 PyUnicodeObject *v;
2536 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 int startinpos;
2538 int endinpos;
2539 int outpos;
2540 const char *e;
2541 PyObject *errorHandler = NULL;
2542 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002545 if (size == 1 && *(unsigned char*)s < 128) {
2546 Py_UNICODE r = *(unsigned char*)s;
2547 return PyUnicode_FromUnicode(&r, 1);
2548 }
Tim Petersced69f82003-09-16 20:30:58 +00002549
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 v = _PyUnicode_New(size);
2551 if (v == NULL)
2552 goto onError;
2553 if (size == 0)
2554 return (PyObject *)v;
2555 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 e = s + size;
2557 while (s < e) {
2558 register unsigned char c = (unsigned char)*s;
2559 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 ++s;
2562 }
2563 else {
2564 startinpos = s-starts;
2565 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002566 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 if (unicode_decode_call_errorhandler(
2568 errors, &errorHandler,
2569 "ascii", "ordinal not in range(128)",
2570 starts, size, &startinpos, &endinpos, &exc, &s,
2571 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002575 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002576 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002577 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 Py_XDECREF(errorHandler);
2579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002581
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 onError:
2583 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 Py_XDECREF(errorHandler);
2585 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 return NULL;
2587}
2588
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2590 int size,
2591 const char *errors)
2592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594}
2595
2596PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2597{
2598 if (!PyUnicode_Check(unicode)) {
2599 PyErr_BadArgument();
2600 return NULL;
2601 }
2602 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2603 PyUnicode_GET_SIZE(unicode),
2604 NULL);
2605}
2606
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002607#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002608
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002609/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002610
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002611PyObject *PyUnicode_DecodeMBCS(const char *s,
2612 int size,
2613 const char *errors)
2614{
2615 PyUnicodeObject *v;
2616 Py_UNICODE *p;
2617
2618 /* First get the size of the result */
2619 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002620 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002621 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2622
2623 v = _PyUnicode_New(usize);
2624 if (v == NULL)
2625 return NULL;
2626 if (usize == 0)
2627 return (PyObject *)v;
2628 p = PyUnicode_AS_UNICODE(v);
2629 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2630 Py_DECREF(v);
2631 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2632 }
2633
2634 return (PyObject *)v;
2635}
2636
2637PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2638 int size,
2639 const char *errors)
2640{
2641 PyObject *repr;
2642 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002643 DWORD mbcssize;
2644
2645 /* If there are no characters, bail now! */
2646 if (size==0)
2647 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002648
2649 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002650 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002651 if (mbcssize==0)
2652 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2653
2654 repr = PyString_FromStringAndSize(NULL, mbcssize);
2655 if (repr == NULL)
2656 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002657 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658 return repr;
2659
2660 /* Do the conversion */
2661 s = PyString_AS_STRING(repr);
2662 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2663 Py_DECREF(repr);
2664 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2665 }
2666 return repr;
2667}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002668
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002669PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2670{
2671 if (!PyUnicode_Check(unicode)) {
2672 PyErr_BadArgument();
2673 return NULL;
2674 }
2675 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2676 PyUnicode_GET_SIZE(unicode),
2677 NULL);
2678}
2679
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002680#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002681
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682/* --- Character Mapping Codec -------------------------------------------- */
2683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684PyObject *PyUnicode_DecodeCharmap(const char *s,
2685 int size,
2686 PyObject *mapping,
2687 const char *errors)
2688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *starts = s;
2690 int startinpos;
2691 int endinpos;
2692 int outpos;
2693 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 PyUnicodeObject *v;
2695 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002696 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 PyObject *errorHandler = NULL;
2698 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002699
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 /* Default to Latin-1 */
2701 if (mapping == NULL)
2702 return PyUnicode_DecodeLatin1(s, size, errors);
2703
2704 v = _PyUnicode_New(size);
2705 if (v == NULL)
2706 goto onError;
2707 if (size == 0)
2708 return (PyObject *)v;
2709 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 e = s + size;
2711 while (s < e) {
2712 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 PyObject *w, *x;
2714
2715 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2716 w = PyInt_FromLong((long)ch);
2717 if (w == NULL)
2718 goto onError;
2719 x = PyObject_GetItem(mapping, w);
2720 Py_DECREF(w);
2721 if (x == NULL) {
2722 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002723 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002725 x = Py_None;
2726 Py_INCREF(x);
2727 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002728 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 }
2730
2731 /* Apply mapping */
2732 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002733 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 if (value < 0 || value > 65535) {
2735 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002736 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 Py_DECREF(x);
2738 goto onError;
2739 }
2740 *p++ = (Py_UNICODE)value;
2741 }
2742 else if (x == Py_None) {
2743 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 outpos = p-PyUnicode_AS_UNICODE(v);
2745 startinpos = s-starts;
2746 endinpos = startinpos+1;
2747 if (unicode_decode_call_errorhandler(
2748 errors, &errorHandler,
2749 "charmap", "character maps to <undefined>",
2750 starts, size, &startinpos, &endinpos, &exc, &s,
2751 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 Py_DECREF(x);
2753 goto onError;
2754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 }
2757 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002758 int targetsize = PyUnicode_GET_SIZE(x);
2759
2760 if (targetsize == 1)
2761 /* 1-1 mapping */
2762 *p++ = *PyUnicode_AS_UNICODE(x);
2763
2764 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002766 if (targetsize > extrachars) {
2767 /* resize first */
2768 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2769 int needed = (targetsize - extrachars) + \
2770 (targetsize << 2);
2771 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002772 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002773 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002774 Py_DECREF(x);
2775 goto onError;
2776 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002777 p = PyUnicode_AS_UNICODE(v) + oldpos;
2778 }
2779 Py_UNICODE_COPY(p,
2780 PyUnicode_AS_UNICODE(x),
2781 targetsize);
2782 p += targetsize;
2783 extrachars -= targetsize;
2784 }
2785 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 }
2787 else {
2788 /* wrong return value */
2789 PyErr_SetString(PyExc_TypeError,
2790 "character mapping must return integer, None or unicode");
2791 Py_DECREF(x);
2792 goto onError;
2793 }
2794 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002795 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 }
2797 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002798 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 Py_XDECREF(errorHandler);
2801 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002803
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 Py_XDECREF(errorHandler);
2806 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 Py_XDECREF(v);
2808 return NULL;
2809}
2810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811/* Lookup the character ch in the mapping. If the character
2812 can't be found, Py_None is returned (or NULL, if another
2813 error occured). */
2814static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 PyObject *w = PyInt_FromLong((long)c);
2817 PyObject *x;
2818
2819 if (w == NULL)
2820 return NULL;
2821 x = PyObject_GetItem(mapping, w);
2822 Py_DECREF(w);
2823 if (x == NULL) {
2824 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2825 /* No mapping found means: mapping is undefined. */
2826 PyErr_Clear();
2827 x = Py_None;
2828 Py_INCREF(x);
2829 return x;
2830 } else
2831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002833 else if (x == Py_None)
2834 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 else if (PyInt_Check(x)) {
2836 long value = PyInt_AS_LONG(x);
2837 if (value < 0 || value > 255) {
2838 PyErr_SetString(PyExc_TypeError,
2839 "character mapping must be in range(256)");
2840 Py_DECREF(x);
2841 return NULL;
2842 }
2843 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 else if (PyString_Check(x))
2846 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 /* wrong return value */
2849 PyErr_SetString(PyExc_TypeError,
2850 "character mapping must return integer, None or str");
2851 Py_DECREF(x);
2852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 }
2854}
2855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856/* lookup the character, put the result in the output string and adjust
2857 various state variables. Reallocate the output string if not enough
2858 space is available. Return a new reference to the object that
2859 was put in the output buffer, or Py_None, if the mapping was undefined
2860 (in which case no character was written) or NULL, if a
2861 reallocation error ocurred. The called must decref the result */
2862static
2863PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2864 PyObject **outobj, int *outpos)
2865{
2866 PyObject *rep = charmapencode_lookup(c, mapping);
2867
2868 if (rep==NULL)
2869 return NULL;
2870 else if (rep==Py_None)
2871 return rep;
2872 else {
2873 char *outstart = PyString_AS_STRING(*outobj);
2874 int outsize = PyString_GET_SIZE(*outobj);
2875 if (PyInt_Check(rep)) {
2876 int requiredsize = *outpos+1;
2877 if (outsize<requiredsize) {
2878 /* exponentially overallocate to minimize reallocations */
2879 if (requiredsize < 2*outsize)
2880 requiredsize = 2*outsize;
2881 if (_PyString_Resize(outobj, requiredsize)) {
2882 Py_DECREF(rep);
2883 return NULL;
2884 }
2885 outstart = PyString_AS_STRING(*outobj);
2886 }
2887 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2888 }
2889 else {
2890 const char *repchars = PyString_AS_STRING(rep);
2891 int repsize = PyString_GET_SIZE(rep);
2892 int requiredsize = *outpos+repsize;
2893 if (outsize<requiredsize) {
2894 /* exponentially overallocate to minimize reallocations */
2895 if (requiredsize < 2*outsize)
2896 requiredsize = 2*outsize;
2897 if (_PyString_Resize(outobj, requiredsize)) {
2898 Py_DECREF(rep);
2899 return NULL;
2900 }
2901 outstart = PyString_AS_STRING(*outobj);
2902 }
2903 memcpy(outstart + *outpos, repchars, repsize);
2904 *outpos += repsize;
2905 }
2906 }
2907 return rep;
2908}
2909
2910/* handle an error in PyUnicode_EncodeCharmap
2911 Return 0 on success, -1 on error */
2912static
2913int charmap_encoding_error(
2914 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2915 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002916 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 PyObject **res, int *respos)
2918{
2919 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2920 int repsize;
2921 int newpos;
2922 Py_UNICODE *uni2;
2923 /* startpos for collecting unencodable chars */
2924 int collstartpos = *inpos;
2925 int collendpos = *inpos+1;
2926 int collpos;
2927 char *encoding = "charmap";
2928 char *reason = "character maps to <undefined>";
2929
2930 PyObject *x;
2931 /* find all unencodable characters */
2932 while (collendpos < size) {
2933 x = charmapencode_lookup(p[collendpos], mapping);
2934 if (x==NULL)
2935 return -1;
2936 else if (x!=Py_None) {
2937 Py_DECREF(x);
2938 break;
2939 }
2940 Py_DECREF(x);
2941 ++collendpos;
2942 }
2943 /* cache callback name lookup
2944 * (if not done yet, i.e. it's the first error) */
2945 if (*known_errorHandler==-1) {
2946 if ((errors==NULL) || (!strcmp(errors, "strict")))
2947 *known_errorHandler = 1;
2948 else if (!strcmp(errors, "replace"))
2949 *known_errorHandler = 2;
2950 else if (!strcmp(errors, "ignore"))
2951 *known_errorHandler = 3;
2952 else if (!strcmp(errors, "xmlcharrefreplace"))
2953 *known_errorHandler = 4;
2954 else
2955 *known_errorHandler = 0;
2956 }
2957 switch (*known_errorHandler) {
2958 case 1: /* strict */
2959 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2960 return -1;
2961 case 2: /* replace */
2962 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2963 x = charmapencode_output('?', mapping, res, respos);
2964 if (x==NULL) {
2965 return -1;
2966 }
2967 else if (x==Py_None) {
2968 Py_DECREF(x);
2969 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2970 return -1;
2971 }
2972 Py_DECREF(x);
2973 }
2974 /* fall through */
2975 case 3: /* ignore */
2976 *inpos = collendpos;
2977 break;
2978 case 4: /* xmlcharrefreplace */
2979 /* generate replacement (temporarily (mis)uses p) */
2980 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2981 char buffer[2+29+1+1];
2982 char *cp;
2983 sprintf(buffer, "&#%d;", (int)p[collpos]);
2984 for (cp = buffer; *cp; ++cp) {
2985 x = charmapencode_output(*cp, mapping, res, respos);
2986 if (x==NULL)
2987 return -1;
2988 else if (x==Py_None) {
2989 Py_DECREF(x);
2990 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2991 return -1;
2992 }
2993 Py_DECREF(x);
2994 }
2995 }
2996 *inpos = collendpos;
2997 break;
2998 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002999 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 encoding, reason, p, size, exceptionObject,
3001 collstartpos, collendpos, &newpos);
3002 if (repunicode == NULL)
3003 return -1;
3004 /* generate replacement */
3005 repsize = PyUnicode_GET_SIZE(repunicode);
3006 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3007 x = charmapencode_output(*uni2, mapping, res, respos);
3008 if (x==NULL) {
3009 Py_DECREF(repunicode);
3010 return -1;
3011 }
3012 else if (x==Py_None) {
3013 Py_DECREF(repunicode);
3014 Py_DECREF(x);
3015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3016 return -1;
3017 }
3018 Py_DECREF(x);
3019 }
3020 *inpos = newpos;
3021 Py_DECREF(repunicode);
3022 }
3023 return 0;
3024}
3025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3027 int size,
3028 PyObject *mapping,
3029 const char *errors)
3030{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 /* output object */
3032 PyObject *res = NULL;
3033 /* current input position */
3034 int inpos = 0;
3035 /* current output position */
3036 int respos = 0;
3037 PyObject *errorHandler = NULL;
3038 PyObject *exc = NULL;
3039 /* the following variable is used for caching string comparisons
3040 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3041 * 3=ignore, 4=xmlcharrefreplace */
3042 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043
3044 /* Default to Latin-1 */
3045 if (mapping == NULL)
3046 return PyUnicode_EncodeLatin1(p, size, errors);
3047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 /* allocate enough for a simple encoding without
3049 replacements, if we need more, we'll resize */
3050 res = PyString_FromStringAndSize(NULL, size);
3051 if (res == NULL)
3052 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003053 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 while (inpos<size) {
3057 /* try to encode it */
3058 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3059 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 if (x==Py_None) { /* unencodable character */
3062 if (charmap_encoding_error(p, size, &inpos, mapping,
3063 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003064 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003065 &res, &respos)) {
3066 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003067 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 else
3071 /* done with this character => adjust input position */
3072 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 Py_DECREF(x);
3074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 /* Resize if we allocated to much */
3077 if (respos<PyString_GET_SIZE(res)) {
3078 if (_PyString_Resize(&res, respos))
3079 goto onError;
3080 }
3081 Py_XDECREF(exc);
3082 Py_XDECREF(errorHandler);
3083 return res;
3084
3085 onError:
3086 Py_XDECREF(res);
3087 Py_XDECREF(exc);
3088 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 return NULL;
3090}
3091
3092PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3093 PyObject *mapping)
3094{
3095 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3096 PyErr_BadArgument();
3097 return NULL;
3098 }
3099 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3100 PyUnicode_GET_SIZE(unicode),
3101 mapping,
3102 NULL);
3103}
3104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105/* create or adjust a UnicodeTranslateError */
3106static void make_translate_exception(PyObject **exceptionObject,
3107 const Py_UNICODE *unicode, int size,
3108 int startpos, int endpos,
3109 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 if (*exceptionObject == NULL) {
3112 *exceptionObject = PyUnicodeTranslateError_Create(
3113 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 }
3115 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3117 goto onError;
3118 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3119 goto onError;
3120 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3121 goto onError;
3122 return;
3123 onError:
3124 Py_DECREF(*exceptionObject);
3125 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 }
3127}
3128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129/* raises a UnicodeTranslateError */
3130static void raise_translate_exception(PyObject **exceptionObject,
3131 const Py_UNICODE *unicode, int size,
3132 int startpos, int endpos,
3133 const char *reason)
3134{
3135 make_translate_exception(exceptionObject,
3136 unicode, size, startpos, endpos, reason);
3137 if (*exceptionObject != NULL)
3138 PyCodec_StrictErrors(*exceptionObject);
3139}
3140
3141/* error handling callback helper:
3142 build arguments, call the callback and check the arguments,
3143 put the result into newpos and return the replacement string, which
3144 has to be freed by the caller */
3145static PyObject *unicode_translate_call_errorhandler(const char *errors,
3146 PyObject **errorHandler,
3147 const char *reason,
3148 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3149 int startpos, int endpos,
3150 int *newpos)
3151{
3152 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3153
3154 PyObject *restuple;
3155 PyObject *resunicode;
3156
3157 if (*errorHandler == NULL) {
3158 *errorHandler = PyCodec_LookupError(errors);
3159 if (*errorHandler == NULL)
3160 return NULL;
3161 }
3162
3163 make_translate_exception(exceptionObject,
3164 unicode, size, startpos, endpos, reason);
3165 if (*exceptionObject == NULL)
3166 return NULL;
3167
3168 restuple = PyObject_CallFunctionObjArgs(
3169 *errorHandler, *exceptionObject, NULL);
3170 if (restuple == NULL)
3171 return NULL;
3172 if (!PyTuple_Check(restuple)) {
3173 PyErr_Format(PyExc_TypeError, &argparse[4]);
3174 Py_DECREF(restuple);
3175 return NULL;
3176 }
3177 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3178 &resunicode, newpos)) {
3179 Py_DECREF(restuple);
3180 return NULL;
3181 }
3182 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003183 *newpos = size+*newpos;
3184 if (*newpos<0 || *newpos>size) {
3185 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3186 Py_DECREF(restuple);
3187 return NULL;
3188 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189 Py_INCREF(resunicode);
3190 Py_DECREF(restuple);
3191 return resunicode;
3192}
3193
3194/* Lookup the character ch in the mapping and put the result in result,
3195 which must be decrefed by the caller.
3196 Return 0 on success, -1 on error */
3197static
3198int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3199{
3200 PyObject *w = PyInt_FromLong((long)c);
3201 PyObject *x;
3202
3203 if (w == NULL)
3204 return -1;
3205 x = PyObject_GetItem(mapping, w);
3206 Py_DECREF(w);
3207 if (x == NULL) {
3208 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3209 /* No mapping found means: use 1:1 mapping. */
3210 PyErr_Clear();
3211 *result = NULL;
3212 return 0;
3213 } else
3214 return -1;
3215 }
3216 else if (x == Py_None) {
3217 *result = x;
3218 return 0;
3219 }
3220 else if (PyInt_Check(x)) {
3221 long value = PyInt_AS_LONG(x);
3222 long max = PyUnicode_GetMax();
3223 if (value < 0 || value > max) {
3224 PyErr_Format(PyExc_TypeError,
3225 "character mapping must be in range(0x%lx)", max+1);
3226 Py_DECREF(x);
3227 return -1;
3228 }
3229 *result = x;
3230 return 0;
3231 }
3232 else if (PyUnicode_Check(x)) {
3233 *result = x;
3234 return 0;
3235 }
3236 else {
3237 /* wrong return value */
3238 PyErr_SetString(PyExc_TypeError,
3239 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003240 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 return -1;
3242 }
3243}
3244/* ensure that *outobj is at least requiredsize characters long,
3245if not reallocate and adjust various state variables.
3246Return 0 on success, -1 on error */
3247static
Walter Dörwald4894c302003-10-24 14:25:28 +00003248int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 int requiredsize)
3250{
Walter Dörwald4894c302003-10-24 14:25:28 +00003251 int oldsize = PyUnicode_GET_SIZE(*outobj);
3252 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 /* remember old output position */
3254 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3255 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003256 if (requiredsize < 2 * oldsize)
3257 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003258 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259 return -1;
3260 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 }
3262 return 0;
3263}
3264/* lookup the character, put the result in the output string and adjust
3265 various state variables. Return a new reference to the object that
3266 was put in the output buffer in *result, or Py_None, if the mapping was
3267 undefined (in which case no character was written).
3268 The called must decref result.
3269 Return 0 on success, -1 on error. */
3270static
Walter Dörwald4894c302003-10-24 14:25:28 +00003271int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3272 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3273 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274{
Walter Dörwald4894c302003-10-24 14:25:28 +00003275 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 return -1;
3277 if (*res==NULL) {
3278 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003279 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 }
3281 else if (*res==Py_None)
3282 ;
3283 else if (PyInt_Check(*res)) {
3284 /* no overflow check, because we know that the space is enough */
3285 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3286 }
3287 else if (PyUnicode_Check(*res)) {
3288 int repsize = PyUnicode_GET_SIZE(*res);
3289 if (repsize==1) {
3290 /* no overflow check, because we know that the space is enough */
3291 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3292 }
3293 else if (repsize!=0) {
3294 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003295 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003296 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003297 repsize - 1;
3298 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 return -1;
3300 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3301 *outp += repsize;
3302 }
3303 }
3304 else
3305 return -1;
3306 return 0;
3307}
3308
3309PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 int size,
3311 PyObject *mapping,
3312 const char *errors)
3313{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314 /* output object */
3315 PyObject *res = NULL;
3316 /* pointers to the beginning and end+1 of input */
3317 const Py_UNICODE *startp = p;
3318 const Py_UNICODE *endp = p + size;
3319 /* pointer into the output */
3320 Py_UNICODE *str;
3321 /* current output position */
3322 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 char *reason = "character maps to <undefined>";
3324 PyObject *errorHandler = NULL;
3325 PyObject *exc = NULL;
3326 /* the following variable is used for caching string comparisons
3327 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3328 * 3=ignore, 4=xmlcharrefreplace */
3329 int known_errorHandler = -1;
3330
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 if (mapping == NULL) {
3332 PyErr_BadArgument();
3333 return NULL;
3334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335
3336 /* allocate enough for a simple 1:1 translation without
3337 replacements, if we need more, we'll resize */
3338 res = PyUnicode_FromUnicode(NULL, size);
3339 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 return res;
3343 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 while (p<endp) {
3346 /* try to encode it */
3347 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003348 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 goto onError;
3351 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003352 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 if (x!=Py_None) /* it worked => adjust input pointer */
3354 ++p;
3355 else { /* untranslatable character */
3356 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3357 int repsize;
3358 int newpos;
3359 Py_UNICODE *uni2;
3360 /* startpos for collecting untranslatable chars */
3361 const Py_UNICODE *collstart = p;
3362 const Py_UNICODE *collend = p+1;
3363 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 /* find all untranslatable characters */
3366 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003367 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 goto onError;
3369 Py_XDECREF(x);
3370 if (x!=Py_None)
3371 break;
3372 ++collend;
3373 }
3374 /* cache callback name lookup
3375 * (if not done yet, i.e. it's the first error) */
3376 if (known_errorHandler==-1) {
3377 if ((errors==NULL) || (!strcmp(errors, "strict")))
3378 known_errorHandler = 1;
3379 else if (!strcmp(errors, "replace"))
3380 known_errorHandler = 2;
3381 else if (!strcmp(errors, "ignore"))
3382 known_errorHandler = 3;
3383 else if (!strcmp(errors, "xmlcharrefreplace"))
3384 known_errorHandler = 4;
3385 else
3386 known_errorHandler = 0;
3387 }
3388 switch (known_errorHandler) {
3389 case 1: /* strict */
3390 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3391 goto onError;
3392 case 2: /* replace */
3393 /* No need to check for space, this is a 1:1 replacement */
3394 for (coll = collstart; coll<collend; ++coll)
3395 *str++ = '?';
3396 /* fall through */
3397 case 3: /* ignore */
3398 p = collend;
3399 break;
3400 case 4: /* xmlcharrefreplace */
3401 /* generate replacement (temporarily (mis)uses p) */
3402 for (p = collstart; p < collend; ++p) {
3403 char buffer[2+29+1+1];
3404 char *cp;
3405 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003406 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3408 goto onError;
3409 for (cp = buffer; *cp; ++cp)
3410 *str++ = *cp;
3411 }
3412 p = collend;
3413 break;
3414 default:
3415 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3416 reason, startp, size, &exc,
3417 collstart-startp, collend-startp, &newpos);
3418 if (repunicode == NULL)
3419 goto onError;
3420 /* generate replacement */
3421 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003422 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3424 Py_DECREF(repunicode);
3425 goto onError;
3426 }
3427 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3428 *str++ = *uni2;
3429 p = startp + newpos;
3430 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431 }
3432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 /* Resize if we allocated to much */
3435 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003436 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003437 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003438 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 }
3440 Py_XDECREF(exc);
3441 Py_XDECREF(errorHandler);
3442 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 onError:
3445 Py_XDECREF(res);
3446 Py_XDECREF(exc);
3447 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 return NULL;
3449}
3450
3451PyObject *PyUnicode_Translate(PyObject *str,
3452 PyObject *mapping,
3453 const char *errors)
3454{
3455 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003456
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 str = PyUnicode_FromObject(str);
3458 if (str == NULL)
3459 goto onError;
3460 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3461 PyUnicode_GET_SIZE(str),
3462 mapping,
3463 errors);
3464 Py_DECREF(str);
3465 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003466
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 onError:
3468 Py_XDECREF(str);
3469 return NULL;
3470}
Tim Petersced69f82003-09-16 20:30:58 +00003471
Guido van Rossum9e896b32000-04-05 20:11:21 +00003472/* --- Decimal Encoder ---------------------------------------------------- */
3473
3474int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3475 int length,
3476 char *output,
3477 const char *errors)
3478{
3479 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 PyObject *errorHandler = NULL;
3481 PyObject *exc = NULL;
3482 const char *encoding = "decimal";
3483 const char *reason = "invalid decimal Unicode string";
3484 /* the following variable is used for caching string comparisons
3485 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3486 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003487
3488 if (output == NULL) {
3489 PyErr_BadArgument();
3490 return -1;
3491 }
3492
3493 p = s;
3494 end = s + length;
3495 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003497 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 PyObject *repunicode;
3499 int repsize;
3500 int newpos;
3501 Py_UNICODE *uni2;
3502 Py_UNICODE *collstart;
3503 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003504
Guido van Rossum9e896b32000-04-05 20:11:21 +00003505 if (Py_UNICODE_ISSPACE(ch)) {
3506 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003508 continue;
3509 }
3510 decimal = Py_UNICODE_TODECIMAL(ch);
3511 if (decimal >= 0) {
3512 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003514 continue;
3515 }
Guido van Rossumba477042000-04-06 18:18:10 +00003516 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003517 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519 continue;
3520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 /* All other characters are considered unencodable */
3522 collstart = p;
3523 collend = p+1;
3524 while (collend < end) {
3525 if ((0 < *collend && *collend < 256) ||
3526 !Py_UNICODE_ISSPACE(*collend) ||
3527 Py_UNICODE_TODECIMAL(*collend))
3528 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 /* cache callback name lookup
3531 * (if not done yet, i.e. it's the first error) */
3532 if (known_errorHandler==-1) {
3533 if ((errors==NULL) || (!strcmp(errors, "strict")))
3534 known_errorHandler = 1;
3535 else if (!strcmp(errors, "replace"))
3536 known_errorHandler = 2;
3537 else if (!strcmp(errors, "ignore"))
3538 known_errorHandler = 3;
3539 else if (!strcmp(errors, "xmlcharrefreplace"))
3540 known_errorHandler = 4;
3541 else
3542 known_errorHandler = 0;
3543 }
3544 switch (known_errorHandler) {
3545 case 1: /* strict */
3546 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3547 goto onError;
3548 case 2: /* replace */
3549 for (p = collstart; p < collend; ++p)
3550 *output++ = '?';
3551 /* fall through */
3552 case 3: /* ignore */
3553 p = collend;
3554 break;
3555 case 4: /* xmlcharrefreplace */
3556 /* generate replacement (temporarily (mis)uses p) */
3557 for (p = collstart; p < collend; ++p)
3558 output += sprintf(output, "&#%d;", (int)*p);
3559 p = collend;
3560 break;
3561 default:
3562 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3563 encoding, reason, s, length, &exc,
3564 collstart-s, collend-s, &newpos);
3565 if (repunicode == NULL)
3566 goto onError;
3567 /* generate replacement */
3568 repsize = PyUnicode_GET_SIZE(repunicode);
3569 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3570 Py_UNICODE ch = *uni2;
3571 if (Py_UNICODE_ISSPACE(ch))
3572 *output++ = ' ';
3573 else {
3574 decimal = Py_UNICODE_TODECIMAL(ch);
3575 if (decimal >= 0)
3576 *output++ = '0' + decimal;
3577 else if (0 < ch && ch < 256)
3578 *output++ = (char)ch;
3579 else {
3580 Py_DECREF(repunicode);
3581 raise_encode_exception(&exc, encoding,
3582 s, length, collstart-s, collend-s, reason);
3583 goto onError;
3584 }
3585 }
3586 }
3587 p = s + newpos;
3588 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003589 }
3590 }
3591 /* 0-terminate the output string */
3592 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 Py_XDECREF(exc);
3594 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003595 return 0;
3596
3597 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 Py_XDECREF(exc);
3599 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003600 return -1;
3601}
3602
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603/* --- Helpers ------------------------------------------------------------ */
3604
Tim Petersced69f82003-09-16 20:30:58 +00003605static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606int count(PyUnicodeObject *self,
3607 int start,
3608 int end,
3609 PyUnicodeObject *substring)
3610{
3611 int count = 0;
3612
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003613 if (start < 0)
3614 start += self->length;
3615 if (start < 0)
3616 start = 0;
3617 if (end > self->length)
3618 end = self->length;
3619 if (end < 0)
3620 end += self->length;
3621 if (end < 0)
3622 end = 0;
3623
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003624 if (substring->length == 0)
3625 return (end - start + 1);
3626
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 end -= substring->length;
3628
3629 while (start <= end)
3630 if (Py_UNICODE_MATCH(self, start, substring)) {
3631 count++;
3632 start += substring->length;
3633 } else
3634 start++;
3635
3636 return count;
3637}
3638
3639int PyUnicode_Count(PyObject *str,
3640 PyObject *substr,
3641 int start,
3642 int end)
3643{
3644 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 str = PyUnicode_FromObject(str);
3647 if (str == NULL)
3648 return -1;
3649 substr = PyUnicode_FromObject(substr);
3650 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003651 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 return -1;
3653 }
Tim Petersced69f82003-09-16 20:30:58 +00003654
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 result = count((PyUnicodeObject *)str,
3656 start, end,
3657 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 Py_DECREF(str);
3660 Py_DECREF(substr);
3661 return result;
3662}
3663
Tim Petersced69f82003-09-16 20:30:58 +00003664static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665int findstring(PyUnicodeObject *self,
3666 PyUnicodeObject *substring,
3667 int start,
3668 int end,
3669 int direction)
3670{
3671 if (start < 0)
3672 start += self->length;
3673 if (start < 0)
3674 start = 0;
3675
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 if (end > self->length)
3677 end = self->length;
3678 if (end < 0)
3679 end += self->length;
3680 if (end < 0)
3681 end = 0;
3682
Guido van Rossum76afbd92002-08-20 17:29:29 +00003683 if (substring->length == 0)
3684 return (direction > 0) ? start : end;
3685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 end -= substring->length;
3687
3688 if (direction < 0) {
3689 for (; end >= start; end--)
3690 if (Py_UNICODE_MATCH(self, end, substring))
3691 return end;
3692 } else {
3693 for (; start <= end; start++)
3694 if (Py_UNICODE_MATCH(self, start, substring))
3695 return start;
3696 }
3697
3698 return -1;
3699}
3700
3701int PyUnicode_Find(PyObject *str,
3702 PyObject *substr,
3703 int start,
3704 int end,
3705 int direction)
3706{
3707 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003708
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 str = PyUnicode_FromObject(str);
3710 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003711 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 substr = PyUnicode_FromObject(substr);
3713 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003714 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003715 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 }
Tim Petersced69f82003-09-16 20:30:58 +00003717
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 result = findstring((PyUnicodeObject *)str,
3719 (PyUnicodeObject *)substr,
3720 start, end, direction);
3721 Py_DECREF(str);
3722 Py_DECREF(substr);
3723 return result;
3724}
3725
Tim Petersced69f82003-09-16 20:30:58 +00003726static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727int tailmatch(PyUnicodeObject *self,
3728 PyUnicodeObject *substring,
3729 int start,
3730 int end,
3731 int direction)
3732{
3733 if (start < 0)
3734 start += self->length;
3735 if (start < 0)
3736 start = 0;
3737
3738 if (substring->length == 0)
3739 return 1;
3740
3741 if (end > self->length)
3742 end = self->length;
3743 if (end < 0)
3744 end += self->length;
3745 if (end < 0)
3746 end = 0;
3747
3748 end -= substring->length;
3749 if (end < start)
3750 return 0;
3751
3752 if (direction > 0) {
3753 if (Py_UNICODE_MATCH(self, end, substring))
3754 return 1;
3755 } else {
3756 if (Py_UNICODE_MATCH(self, start, substring))
3757 return 1;
3758 }
3759
3760 return 0;
3761}
3762
3763int PyUnicode_Tailmatch(PyObject *str,
3764 PyObject *substr,
3765 int start,
3766 int end,
3767 int direction)
3768{
3769 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 str = PyUnicode_FromObject(str);
3772 if (str == NULL)
3773 return -1;
3774 substr = PyUnicode_FromObject(substr);
3775 if (substr == NULL) {
3776 Py_DECREF(substr);
3777 return -1;
3778 }
Tim Petersced69f82003-09-16 20:30:58 +00003779
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 result = tailmatch((PyUnicodeObject *)str,
3781 (PyUnicodeObject *)substr,
3782 start, end, direction);
3783 Py_DECREF(str);
3784 Py_DECREF(substr);
3785 return result;
3786}
3787
Tim Petersced69f82003-09-16 20:30:58 +00003788static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789const Py_UNICODE *findchar(const Py_UNICODE *s,
3790 int size,
3791 Py_UNICODE ch)
3792{
3793 /* like wcschr, but doesn't stop at NULL characters */
3794
3795 while (size-- > 0) {
3796 if (*s == ch)
3797 return s;
3798 s++;
3799 }
3800
3801 return NULL;
3802}
3803
3804/* Apply fixfct filter to the Unicode object self and return a
3805 reference to the modified object */
3806
Tim Petersced69f82003-09-16 20:30:58 +00003807static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808PyObject *fixup(PyUnicodeObject *self,
3809 int (*fixfct)(PyUnicodeObject *s))
3810{
3811
3812 PyUnicodeObject *u;
3813
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003814 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 if (u == NULL)
3816 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003817
3818 Py_UNICODE_COPY(u->str, self->str, self->length);
3819
Tim Peters7a29bd52001-09-12 03:03:31 +00003820 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 /* fixfct should return TRUE if it modified the buffer. If
3822 FALSE, return a reference to the original buffer instead
3823 (to save space, not time) */
3824 Py_INCREF(self);
3825 Py_DECREF(u);
3826 return (PyObject*) self;
3827 }
3828 return (PyObject*) u;
3829}
3830
Tim Petersced69f82003-09-16 20:30:58 +00003831static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832int fixupper(PyUnicodeObject *self)
3833{
3834 int len = self->length;
3835 Py_UNICODE *s = self->str;
3836 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 while (len-- > 0) {
3839 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 ch = Py_UNICODE_TOUPPER(*s);
3842 if (ch != *s) {
3843 status = 1;
3844 *s = ch;
3845 }
3846 s++;
3847 }
3848
3849 return status;
3850}
3851
Tim Petersced69f82003-09-16 20:30:58 +00003852static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853int fixlower(PyUnicodeObject *self)
3854{
3855 int len = self->length;
3856 Py_UNICODE *s = self->str;
3857 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003858
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 while (len-- > 0) {
3860 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003861
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 ch = Py_UNICODE_TOLOWER(*s);
3863 if (ch != *s) {
3864 status = 1;
3865 *s = ch;
3866 }
3867 s++;
3868 }
3869
3870 return status;
3871}
3872
Tim Petersced69f82003-09-16 20:30:58 +00003873static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874int fixswapcase(PyUnicodeObject *self)
3875{
3876 int len = self->length;
3877 Py_UNICODE *s = self->str;
3878 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003879
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 while (len-- > 0) {
3881 if (Py_UNICODE_ISUPPER(*s)) {
3882 *s = Py_UNICODE_TOLOWER(*s);
3883 status = 1;
3884 } else if (Py_UNICODE_ISLOWER(*s)) {
3885 *s = Py_UNICODE_TOUPPER(*s);
3886 status = 1;
3887 }
3888 s++;
3889 }
3890
3891 return status;
3892}
3893
Tim Petersced69f82003-09-16 20:30:58 +00003894static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895int fixcapitalize(PyUnicodeObject *self)
3896{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003897 int len = self->length;
3898 Py_UNICODE *s = self->str;
3899 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003900
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003901 if (len == 0)
3902 return 0;
3903 if (Py_UNICODE_ISLOWER(*s)) {
3904 *s = Py_UNICODE_TOUPPER(*s);
3905 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003907 s++;
3908 while (--len > 0) {
3909 if (Py_UNICODE_ISUPPER(*s)) {
3910 *s = Py_UNICODE_TOLOWER(*s);
3911 status = 1;
3912 }
3913 s++;
3914 }
3915 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916}
3917
3918static
3919int fixtitle(PyUnicodeObject *self)
3920{
3921 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3922 register Py_UNICODE *e;
3923 int previous_is_cased;
3924
3925 /* Shortcut for single character strings */
3926 if (PyUnicode_GET_SIZE(self) == 1) {
3927 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3928 if (*p != ch) {
3929 *p = ch;
3930 return 1;
3931 }
3932 else
3933 return 0;
3934 }
Tim Petersced69f82003-09-16 20:30:58 +00003935
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 e = p + PyUnicode_GET_SIZE(self);
3937 previous_is_cased = 0;
3938 for (; p < e; p++) {
3939 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003940
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 if (previous_is_cased)
3942 *p = Py_UNICODE_TOLOWER(ch);
3943 else
3944 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003945
3946 if (Py_UNICODE_ISLOWER(ch) ||
3947 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 Py_UNICODE_ISTITLE(ch))
3949 previous_is_cased = 1;
3950 else
3951 previous_is_cased = 0;
3952 }
3953 return 1;
3954}
3955
3956PyObject *PyUnicode_Join(PyObject *separator,
3957 PyObject *seq)
3958{
3959 Py_UNICODE *sep;
3960 int seplen;
3961 PyUnicodeObject *res = NULL;
3962 int reslen = 0;
3963 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 int sz = 100;
3965 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003966 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967
Tim Peters2cfe3682001-05-05 05:36:48 +00003968 it = PyObject_GetIter(seq);
3969 if (it == NULL)
3970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971
3972 if (separator == NULL) {
3973 Py_UNICODE blank = ' ';
3974 sep = &blank;
3975 seplen = 1;
3976 }
3977 else {
3978 separator = PyUnicode_FromObject(separator);
3979 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003980 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 sep = PyUnicode_AS_UNICODE(separator);
3982 seplen = PyUnicode_GET_SIZE(separator);
3983 }
Tim Petersced69f82003-09-16 20:30:58 +00003984
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 res = _PyUnicode_New(sz);
3986 if (res == NULL)
3987 goto onError;
3988 p = PyUnicode_AS_UNICODE(res);
3989 reslen = 0;
3990
Tim Peters2cfe3682001-05-05 05:36:48 +00003991 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003993 PyObject *item = PyIter_Next(it);
3994 if (item == NULL) {
3995 if (PyErr_Occurred())
3996 goto onError;
3997 break;
3998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 if (!PyUnicode_Check(item)) {
4000 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004001 if (!PyString_Check(item)) {
4002 PyErr_Format(PyExc_TypeError,
4003 "sequence item %i: expected string or Unicode,"
4004 " %.80s found",
4005 i, item->ob_type->tp_name);
4006 Py_DECREF(item);
4007 goto onError;
4008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 v = PyUnicode_FromObject(item);
4010 Py_DECREF(item);
4011 item = v;
4012 if (item == NULL)
4013 goto onError;
4014 }
4015 itemlen = PyUnicode_GET_SIZE(item);
4016 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004017 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004018 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 sz *= 2;
4022 p = PyUnicode_AS_UNICODE(res) + reslen;
4023 }
4024 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004025 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 p += seplen;
4027 reslen += seplen;
4028 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004029 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 p += itemlen;
4031 reslen += itemlen;
4032 Py_DECREF(item);
4033 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004034 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 goto onError;
4036
4037 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004038 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 return (PyObject *)res;
4040
4041 onError:
4042 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004043 Py_XDECREF(res);
4044 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 return NULL;
4046}
4047
Tim Petersced69f82003-09-16 20:30:58 +00004048static
4049PyUnicodeObject *pad(PyUnicodeObject *self,
4050 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 int right,
4052 Py_UNICODE fill)
4053{
4054 PyUnicodeObject *u;
4055
4056 if (left < 0)
4057 left = 0;
4058 if (right < 0)
4059 right = 0;
4060
Tim Peters7a29bd52001-09-12 03:03:31 +00004061 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 Py_INCREF(self);
4063 return self;
4064 }
4065
4066 u = _PyUnicode_New(left + self->length + right);
4067 if (u) {
4068 if (left)
4069 Py_UNICODE_FILL(u->str, fill, left);
4070 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4071 if (right)
4072 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4073 }
4074
4075 return u;
4076}
4077
/* Append the slice data[left:right] as a new unicode object to list.

   The macro relies on the expanding function to provide a
   "PyObject *str" scratch variable, the "list" being built and an
   "onError" label, which is jumped to if the slice cannot be created or
   appended.  It expands to several statements, so it must be used as a
   statement of its own inside a braced block. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
4088
/* Insert the slice data[left:right] as a new unicode object at the
   front of list.

   The macro relies on the expanding function to provide a
   "PyObject *str" scratch variable, the "list" being built and an
   "onError" label, which is jumped to if the slice cannot be created or
   inserted.  It expands to several statements, so it must be used as a
   statement of its own inside a braced block. */
#define SPLIT_INSERT(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Insert(list, 0, str) != 0) {                             \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
4099
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100static
4101PyObject *split_whitespace(PyUnicodeObject *self,
4102 PyObject *list,
4103 int maxcount)
4104{
4105 register int i;
4106 register int j;
4107 int len = self->length;
4108 PyObject *str;
4109
4110 for (i = j = 0; i < len; ) {
4111 /* find a token */
4112 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4113 i++;
4114 j = i;
4115 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4116 i++;
4117 if (j < i) {
4118 if (maxcount-- <= 0)
4119 break;
4120 SPLIT_APPEND(self->str, j, i);
4121 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4122 i++;
4123 j = i;
4124 }
4125 }
4126 if (j < len) {
4127 SPLIT_APPEND(self->str, j, len);
4128 }
4129 return list;
4130
4131 onError:
4132 Py_DECREF(list);
4133 return NULL;
4134}
4135
4136PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004137 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138{
4139 register int i;
4140 register int j;
4141 int len;
4142 PyObject *list;
4143 PyObject *str;
4144 Py_UNICODE *data;
4145
4146 string = PyUnicode_FromObject(string);
4147 if (string == NULL)
4148 return NULL;
4149 data = PyUnicode_AS_UNICODE(string);
4150 len = PyUnicode_GET_SIZE(string);
4151
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 list = PyList_New(0);
4153 if (!list)
4154 goto onError;
4155
4156 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004157 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 /* Find a line and append it */
4160 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4161 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162
4163 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004164 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 if (i < len) {
4166 if (data[i] == '\r' && i + 1 < len &&
4167 data[i+1] == '\n')
4168 i += 2;
4169 else
4170 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004171 if (keepends)
4172 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 }
Guido van Rossum86662912000-04-11 15:38:46 +00004174 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 j = i;
4176 }
4177 if (j < len) {
4178 SPLIT_APPEND(data, j, len);
4179 }
4180
4181 Py_DECREF(string);
4182 return list;
4183
4184 onError:
4185 Py_DECREF(list);
4186 Py_DECREF(string);
4187 return NULL;
4188}
4189
Tim Petersced69f82003-09-16 20:30:58 +00004190static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191PyObject *split_char(PyUnicodeObject *self,
4192 PyObject *list,
4193 Py_UNICODE ch,
4194 int maxcount)
4195{
4196 register int i;
4197 register int j;
4198 int len = self->length;
4199 PyObject *str;
4200
4201 for (i = j = 0; i < len; ) {
4202 if (self->str[i] == ch) {
4203 if (maxcount-- <= 0)
4204 break;
4205 SPLIT_APPEND(self->str, j, i);
4206 i = j = i + 1;
4207 } else
4208 i++;
4209 }
4210 if (j <= len) {
4211 SPLIT_APPEND(self->str, j, len);
4212 }
4213 return list;
4214
4215 onError:
4216 Py_DECREF(list);
4217 return NULL;
4218}
4219
Tim Petersced69f82003-09-16 20:30:58 +00004220static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221PyObject *split_substring(PyUnicodeObject *self,
4222 PyObject *list,
4223 PyUnicodeObject *substring,
4224 int maxcount)
4225{
4226 register int i;
4227 register int j;
4228 int len = self->length;
4229 int sublen = substring->length;
4230 PyObject *str;
4231
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004232 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 if (Py_UNICODE_MATCH(self, i, substring)) {
4234 if (maxcount-- <= 0)
4235 break;
4236 SPLIT_APPEND(self->str, j, i);
4237 i = j = i + sublen;
4238 } else
4239 i++;
4240 }
4241 if (j <= len) {
4242 SPLIT_APPEND(self->str, j, len);
4243 }
4244 return list;
4245
4246 onError:
4247 Py_DECREF(list);
4248 return NULL;
4249}
4250
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004251static
4252PyObject *rsplit_whitespace(PyUnicodeObject *self,
4253 PyObject *list,
4254 int maxcount)
4255{
4256 register int i;
4257 register int j;
4258 int len = self->length;
4259 PyObject *str;
4260
4261 for (i = j = len - 1; i >= 0; ) {
4262 /* find a token */
4263 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4264 i--;
4265 j = i;
4266 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4267 i--;
4268 if (j > i) {
4269 if (maxcount-- <= 0)
4270 break;
4271 SPLIT_INSERT(self->str, i + 1, j + 1);
4272 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4273 i--;
4274 j = i;
4275 }
4276 }
4277 if (j >= 0) {
4278 SPLIT_INSERT(self->str, 0, j + 1);
4279 }
4280 return list;
4281
4282 onError:
4283 Py_DECREF(list);
4284 return NULL;
4285}
4286
4287static
4288PyObject *rsplit_char(PyUnicodeObject *self,
4289 PyObject *list,
4290 Py_UNICODE ch,
4291 int maxcount)
4292{
4293 register int i;
4294 register int j;
4295 int len = self->length;
4296 PyObject *str;
4297
4298 for (i = j = len - 1; i >= 0; ) {
4299 if (self->str[i] == ch) {
4300 if (maxcount-- <= 0)
4301 break;
4302 SPLIT_INSERT(self->str, i + 1, j + 1);
4303 j = i = i - 1;
4304 } else
4305 i--;
4306 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004307 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004308 SPLIT_INSERT(self->str, 0, j + 1);
4309 }
4310 return list;
4311
4312 onError:
4313 Py_DECREF(list);
4314 return NULL;
4315}
4316
4317static
4318PyObject *rsplit_substring(PyUnicodeObject *self,
4319 PyObject *list,
4320 PyUnicodeObject *substring,
4321 int maxcount)
4322{
4323 register int i;
4324 register int j;
4325 int len = self->length;
4326 int sublen = substring->length;
4327 PyObject *str;
4328
4329 for (i = len - sublen, j = len; i >= 0; ) {
4330 if (Py_UNICODE_MATCH(self, i, substring)) {
4331 if (maxcount-- <= 0)
4332 break;
4333 SPLIT_INSERT(self->str, i + sublen, j);
4334 j = i;
4335 i -= sublen;
4336 } else
4337 i--;
4338 }
4339 if (j >= 0) {
4340 SPLIT_INSERT(self->str, 0, j);
4341 }
4342 return list;
4343
4344 onError:
4345 Py_DECREF(list);
4346 return NULL;
4347}
4348
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004350#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351
4352static
4353PyObject *split(PyUnicodeObject *self,
4354 PyUnicodeObject *substring,
4355 int maxcount)
4356{
4357 PyObject *list;
4358
4359 if (maxcount < 0)
4360 maxcount = INT_MAX;
4361
4362 list = PyList_New(0);
4363 if (!list)
4364 return NULL;
4365
4366 if (substring == NULL)
4367 return split_whitespace(self,list,maxcount);
4368
4369 else if (substring->length == 1)
4370 return split_char(self,list,substring->str[0],maxcount);
4371
4372 else if (substring->length == 0) {
4373 Py_DECREF(list);
4374 PyErr_SetString(PyExc_ValueError, "empty separator");
4375 return NULL;
4376 }
4377 else
4378 return split_substring(self,list,substring,maxcount);
4379}
4380
Tim Petersced69f82003-09-16 20:30:58 +00004381static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004382PyObject *rsplit(PyUnicodeObject *self,
4383 PyUnicodeObject *substring,
4384 int maxcount)
4385{
4386 PyObject *list;
4387
4388 if (maxcount < 0)
4389 maxcount = INT_MAX;
4390
4391 list = PyList_New(0);
4392 if (!list)
4393 return NULL;
4394
4395 if (substring == NULL)
4396 return rsplit_whitespace(self,list,maxcount);
4397
4398 else if (substring->length == 1)
4399 return rsplit_char(self,list,substring->str[0],maxcount);
4400
4401 else if (substring->length == 0) {
4402 Py_DECREF(list);
4403 PyErr_SetString(PyExc_ValueError, "empty separator");
4404 return NULL;
4405 }
4406 else
4407 return rsplit_substring(self,list,substring,maxcount);
4408}
4409
4410static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411PyObject *replace(PyUnicodeObject *self,
4412 PyUnicodeObject *str1,
4413 PyUnicodeObject *str2,
4414 int maxcount)
4415{
4416 PyUnicodeObject *u;
4417
4418 if (maxcount < 0)
4419 maxcount = INT_MAX;
4420
4421 if (str1->length == 1 && str2->length == 1) {
4422 int i;
4423
4424 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004425 if (!findchar(self->str, self->length, str1->str[0]) &&
4426 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 /* nothing to replace, return original string */
4428 Py_INCREF(self);
4429 u = self;
4430 } else {
4431 Py_UNICODE u1 = str1->str[0];
4432 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004433
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004435 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 self->length
4437 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004438 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004439 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004440 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 for (i = 0; i < u->length; i++)
4442 if (u->str[i] == u1) {
4443 if (--maxcount < 0)
4444 break;
4445 u->str[i] = u2;
4446 }
4447 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449
4450 } else {
4451 int n, i;
4452 Py_UNICODE *p;
4453
4454 /* replace strings */
4455 n = count(self, 0, self->length, str1);
4456 if (n > maxcount)
4457 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004458 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004460 if (PyUnicode_CheckExact(self)) {
4461 Py_INCREF(self);
4462 u = self;
4463 }
4464 else {
4465 u = (PyUnicodeObject *)
4466 PyUnicode_FromUnicode(self->str, self->length);
4467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 } else {
4469 u = _PyUnicode_New(
4470 self->length + n * (str2->length - str1->length));
4471 if (u) {
4472 i = 0;
4473 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004474 if (str1->length > 0) {
4475 while (i <= self->length - str1->length)
4476 if (Py_UNICODE_MATCH(self, i, str1)) {
4477 /* replace string segment */
4478 Py_UNICODE_COPY(p, str2->str, str2->length);
4479 p += str2->length;
4480 i += str1->length;
4481 if (--n <= 0) {
4482 /* copy remaining part */
4483 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4484 break;
4485 }
4486 } else
4487 *p++ = self->str[i++];
4488 } else {
4489 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 Py_UNICODE_COPY(p, str2->str, str2->length);
4491 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004492 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004495 }
4496 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 }
4499 }
4500 }
Tim Petersced69f82003-09-16 20:30:58 +00004501
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 return (PyObject *) u;
4503}
4504
4505/* --- Unicode Object Methods --------------------------------------------- */
4506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004507PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508"S.title() -> unicode\n\
4509\n\
4510Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004511characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
4513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004514unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 return fixup(self, fixtitle);
4517}
4518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004519PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520"S.capitalize() -> unicode\n\
4521\n\
4522Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004523have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524
4525static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004526unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 return fixup(self, fixcapitalize);
4529}
4530
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run
   every word through fixup() with fixcapitalize, then join the words
   again with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4568
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004569/* Argument converter. Coerces to a single unicode character */
4570
4571static int
4572convert_uc(PyObject *obj, void *addr)
4573{
4574 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4575 PyObject *uniobj;
4576 Py_UNICODE *unistr;
4577
4578 uniobj = PyUnicode_FromObject(obj);
4579 if (uniobj == NULL) {
4580 PyErr_SetString(PyExc_TypeError,
4581 "The fill character cannot be converted to Unicode");
4582 return 0;
4583 }
4584 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4585 PyErr_SetString(PyExc_TypeError,
4586 "The fill character must be exactly one character long");
4587 Py_DECREF(uniobj);
4588 return 0;
4589 }
4590 unistr = PyUnicode_AS_UNICODE(uniobj);
4591 *fillcharloc = unistr[0];
4592 Py_DECREF(uniobj);
4593 return 1;
4594}
4595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004596PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004597"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004599Return S centered in a Unicode string of length width. Padding is\n\
4600done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601
4602static PyObject *
4603unicode_center(PyUnicodeObject *self, PyObject *args)
4604{
4605 int marg, left;
4606 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004607 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004609 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 return NULL;
4611
Tim Peters7a29bd52001-09-12 03:03:31 +00004612 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 Py_INCREF(self);
4614 return (PyObject*) self;
4615 }
4616
4617 marg = width - self->length;
4618 left = marg / 2 + (marg & width & 1);
4619
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004620 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621}
4622
Marc-André Lemburge5034372000-08-08 08:04:29 +00004623#if 0
4624
4625/* This code should go into some future Unicode collation support
4626 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004627 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004628
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004629/* speedy UTF-16 code point order comparison */
4630/* gleaned from: */
4631/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4632
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004633static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004634{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004635 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004636 0, 0, 0, 0, 0, 0, 0, 0,
4637 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004638 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004639};
4640
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641static int
4642unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4643{
4644 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004645
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 Py_UNICODE *s1 = str1->str;
4647 Py_UNICODE *s2 = str2->str;
4648
4649 len1 = str1->length;
4650 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004651
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004653 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004654
4655 c1 = *s1++;
4656 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004657
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004658 if (c1 > (1<<11) * 26)
4659 c1 += utf16Fixup[c1>>11];
4660 if (c2 > (1<<11) * 26)
4661 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004662 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004663
4664 if (c1 != c2)
4665 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004666
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004667 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 }
4669
4670 return (len1 < len2) ? -1 : (len1 != len2);
4671}
4672
Marc-André Lemburge5034372000-08-08 08:04:29 +00004673#else
4674
4675static int
4676unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4677{
4678 register int len1, len2;
4679
4680 Py_UNICODE *s1 = str1->str;
4681 Py_UNICODE *s2 = str2->str;
4682
4683 len1 = str1->length;
4684 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004685
Marc-André Lemburge5034372000-08-08 08:04:29 +00004686 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004687 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004688
Fredrik Lundh45714e92001-06-26 16:39:36 +00004689 c1 = *s1++;
4690 c2 = *s2++;
4691
4692 if (c1 != c2)
4693 return (c1 < c2) ? -1 : 1;
4694
Marc-André Lemburge5034372000-08-08 08:04:29 +00004695 len1--; len2--;
4696 }
4697
4698 return (len1 < len2) ? -1 : (len1 != len2);
4699}
4700
4701#endif
4702
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703int PyUnicode_Compare(PyObject *left,
4704 PyObject *right)
4705{
4706 PyUnicodeObject *u = NULL, *v = NULL;
4707 int result;
4708
4709 /* Coerce the two arguments */
4710 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4711 if (u == NULL)
4712 goto onError;
4713 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4714 if (v == NULL)
4715 goto onError;
4716
Thomas Wouters7e474022000-07-16 12:04:32 +00004717 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 if (v == u) {
4719 Py_DECREF(u);
4720 Py_DECREF(v);
4721 return 0;
4722 }
4723
4724 result = unicode_compare(u, v);
4725
4726 Py_DECREF(u);
4727 Py_DECREF(v);
4728 return result;
4729
4730onError:
4731 Py_XDECREF(u);
4732 Py_XDECREF(v);
4733 return -1;
4734}
4735
Guido van Rossum403d68b2000-03-13 15:55:09 +00004736int PyUnicode_Contains(PyObject *container,
4737 PyObject *element)
4738{
4739 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004740 int result, size;
4741 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004742
4743 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004744 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004745 if (v == NULL) {
4746 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004747 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004748 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004749 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004750 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004751 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004752 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004753
Barry Warsaw817918c2002-08-06 16:58:21 +00004754 size = PyUnicode_GET_SIZE(v);
4755 rhs = PyUnicode_AS_UNICODE(v);
4756 lhs = PyUnicode_AS_UNICODE(u);
4757
Guido van Rossum403d68b2000-03-13 15:55:09 +00004758 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004759 if (size == 1) {
4760 end = lhs + PyUnicode_GET_SIZE(u);
4761 while (lhs < end) {
4762 if (*lhs++ == *rhs) {
4763 result = 1;
4764 break;
4765 }
4766 }
4767 }
4768 else {
4769 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4770 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004771 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004772 result = 1;
4773 break;
4774 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004775 }
4776 }
4777
4778 Py_DECREF(u);
4779 Py_DECREF(v);
4780 return result;
4781
4782onError:
4783 Py_XDECREF(u);
4784 Py_XDECREF(v);
4785 return -1;
4786}
4787
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788/* Concat to string or Unicode object giving a new Unicode object. */
4789
4790PyObject *PyUnicode_Concat(PyObject *left,
4791 PyObject *right)
4792{
4793 PyUnicodeObject *u = NULL, *v = NULL, *w;
4794
4795 /* Coerce the two arguments */
4796 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4797 if (u == NULL)
4798 goto onError;
4799 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4800 if (v == NULL)
4801 goto onError;
4802
4803 /* Shortcuts */
4804 if (v == unicode_empty) {
4805 Py_DECREF(v);
4806 return (PyObject *)u;
4807 }
4808 if (u == unicode_empty) {
4809 Py_DECREF(u);
4810 return (PyObject *)v;
4811 }
4812
4813 /* Concat the two Unicode strings */
4814 w = _PyUnicode_New(u->length + v->length);
4815 if (w == NULL)
4816 goto onError;
4817 Py_UNICODE_COPY(w->str, u->str, u->length);
4818 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4819
4820 Py_DECREF(u);
4821 Py_DECREF(v);
4822 return (PyObject *)w;
4823
4824onError:
4825 Py_XDECREF(u);
4826 Py_XDECREF(v);
4827 return NULL;
4828}
4829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004830PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831"S.count(sub[, start[, end]]) -> int\n\
4832\n\
4833Return the number of occurrences of substring sub in Unicode string\n\
4834S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004835interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836
4837static PyObject *
4838unicode_count(PyUnicodeObject *self, PyObject *args)
4839{
4840 PyUnicodeObject *substring;
4841 int start = 0;
4842 int end = INT_MAX;
4843 PyObject *result;
4844
Guido van Rossumb8872e62000-05-09 14:14:27 +00004845 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4846 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return NULL;
4848
4849 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4850 (PyObject *)substring);
4851 if (substring == NULL)
4852 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 if (start < 0)
4855 start += self->length;
4856 if (start < 0)
4857 start = 0;
4858 if (end > self->length)
4859 end = self->length;
4860 if (end < 0)
4861 end += self->length;
4862 if (end < 0)
4863 end = 0;
4864
4865 result = PyInt_FromLong((long) count(self, start, end, substring));
4866
4867 Py_DECREF(substring);
4868 return result;
4869}
4870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004871PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872"S.encode([encoding[,errors]]) -> string\n\
4873\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004874Return an encoded string version of S. Default encoding is the current\n\
4875default string encoding. errors may be given to set a different error\n\
4876handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4878'xmlcharrefreplace' as well as any other name registered with\n\
4879codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880
4881static PyObject *
4882unicode_encode(PyUnicodeObject *self, PyObject *args)
4883{
4884 char *encoding = NULL;
4885 char *errors = NULL;
4886 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4887 return NULL;
4888 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4889}
4890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004891PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892"S.expandtabs([tabsize]) -> unicode\n\
4893\n\
4894Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004895If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896
4897static PyObject*
4898unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4899{
4900 Py_UNICODE *e;
4901 Py_UNICODE *p;
4902 Py_UNICODE *q;
4903 int i, j;
4904 PyUnicodeObject *u;
4905 int tabsize = 8;
4906
4907 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4908 return NULL;
4909
Thomas Wouters7e474022000-07-16 12:04:32 +00004910 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 i = j = 0;
4912 e = self->str + self->length;
4913 for (p = self->str; p < e; p++)
4914 if (*p == '\t') {
4915 if (tabsize > 0)
4916 j += tabsize - (j % tabsize);
4917 }
4918 else {
4919 j++;
4920 if (*p == '\n' || *p == '\r') {
4921 i += j;
4922 j = 0;
4923 }
4924 }
4925
4926 /* Second pass: create output string and fill it */
4927 u = _PyUnicode_New(i + j);
4928 if (!u)
4929 return NULL;
4930
4931 j = 0;
4932 q = u->str;
4933
4934 for (p = self->str; p < e; p++)
4935 if (*p == '\t') {
4936 if (tabsize > 0) {
4937 i = tabsize - (j % tabsize);
4938 j += i;
4939 while (i--)
4940 *q++ = ' ';
4941 }
4942 }
4943 else {
4944 j++;
4945 *q++ = *p;
4946 if (*p == '\n' || *p == '\r')
4947 j = 0;
4948 }
4949
4950 return (PyObject*) u;
4951}
4952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004953PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954"S.find(sub [,start [,end]]) -> int\n\
4955\n\
4956Return the lowest index in S where substring sub is found,\n\
4957such that sub is contained within s[start,end]. Optional\n\
4958arguments start and end are interpreted as in slice notation.\n\
4959\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004960Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961
4962static PyObject *
4963unicode_find(PyUnicodeObject *self, PyObject *args)
4964{
4965 PyUnicodeObject *substring;
4966 int start = 0;
4967 int end = INT_MAX;
4968 PyObject *result;
4969
Guido van Rossumb8872e62000-05-09 14:14:27 +00004970 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4971 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 return NULL;
4973 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4974 (PyObject *)substring);
4975 if (substring == NULL)
4976 return NULL;
4977
4978 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4979
4980 Py_DECREF(substring);
4981 return result;
4982}
4983
4984static PyObject *
4985unicode_getitem(PyUnicodeObject *self, int index)
4986{
4987 if (index < 0 || index >= self->length) {
4988 PyErr_SetString(PyExc_IndexError, "string index out of range");
4989 return NULL;
4990 }
4991
4992 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4993}
4994
4995static long
4996unicode_hash(PyUnicodeObject *self)
4997{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004998 /* Since Unicode objects compare equal to their ASCII string
4999 counterparts, they should use the individual character values
5000 as basis for their hash value. This is needed to assure that
5001 strings and Unicode objects behave in the same way as
5002 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003
Fredrik Lundhdde61642000-07-10 18:27:47 +00005004 register int len;
5005 register Py_UNICODE *p;
5006 register long x;
5007
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008 if (self->hash != -1)
5009 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005010 len = PyUnicode_GET_SIZE(self);
5011 p = PyUnicode_AS_UNICODE(self);
5012 x = *p << 7;
5013 while (--len >= 0)
5014 x = (1000003*x) ^ *p++;
5015 x ^= PyUnicode_GET_SIZE(self);
5016 if (x == -1)
5017 x = -2;
5018 self->hash = x;
5019 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020}
5021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005022PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023"S.index(sub [,start [,end]]) -> int\n\
5024\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005025Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026
5027static PyObject *
5028unicode_index(PyUnicodeObject *self, PyObject *args)
5029{
5030 int result;
5031 PyUnicodeObject *substring;
5032 int start = 0;
5033 int end = INT_MAX;
5034
Guido van Rossumb8872e62000-05-09 14:14:27 +00005035 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5036 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005038
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5040 (PyObject *)substring);
5041 if (substring == NULL)
5042 return NULL;
5043
5044 result = findstring(self, substring, start, end, 1);
5045
5046 Py_DECREF(substring);
5047 if (result < 0) {
5048 PyErr_SetString(PyExc_ValueError, "substring not found");
5049 return NULL;
5050 }
5051 return PyInt_FromLong(result);
5052}
5053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005054PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005055"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005057Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005058at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059
5060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005061unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062{
5063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5064 register const Py_UNICODE *e;
5065 int cased;
5066
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 /* Shortcut for single character strings */
5068 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005069 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005071 /* Special case for empty strings */
5072 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005073 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005074
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 e = p + PyUnicode_GET_SIZE(self);
5076 cased = 0;
5077 for (; p < e; p++) {
5078 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005079
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005081 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 else if (!cased && Py_UNICODE_ISLOWER(ch))
5083 cased = 1;
5084 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005085 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086}
5087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005088PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005089"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005091Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005092at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093
5094static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005095unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096{
5097 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5098 register const Py_UNICODE *e;
5099 int cased;
5100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 /* Shortcut for single character strings */
5102 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005103 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005105 /* Special case for empty strings */
5106 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005107 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005108
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 e = p + PyUnicode_GET_SIZE(self);
5110 cased = 0;
5111 for (; p < e; p++) {
5112 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005115 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 else if (!cased && Py_UNICODE_ISUPPER(ch))
5117 cased = 1;
5118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005119 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120}
5121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005122PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005123"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005125Return True if S is a titlecased string and there is at least one\n\
5126character in S, i.e. upper- and titlecase characters may only\n\
5127follow uncased characters and lowercase characters only cased ones.\n\
5128Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005131unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132{
5133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5134 register const Py_UNICODE *e;
5135 int cased, previous_is_cased;
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 /* Shortcut for single character strings */
5138 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005139 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5140 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005142 /* Special case for empty strings */
5143 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005144 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 e = p + PyUnicode_GET_SIZE(self);
5147 cased = 0;
5148 previous_is_cased = 0;
5149 for (; p < e; p++) {
5150 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005151
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5153 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005154 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 previous_is_cased = 1;
5156 cased = 1;
5157 }
5158 else if (Py_UNICODE_ISLOWER(ch)) {
5159 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005160 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 previous_is_cased = 1;
5162 cased = 1;
5163 }
5164 else
5165 previous_is_cased = 0;
5166 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005167 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168}
5169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005170PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005171"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005173Return True if all characters in S are whitespace\n\
5174and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
5176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005177unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178{
5179 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5180 register const Py_UNICODE *e;
5181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 /* Shortcut for single character strings */
5183 if (PyUnicode_GET_SIZE(self) == 1 &&
5184 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005185 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005187 /* Special case for empty strings */
5188 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005189 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 e = p + PyUnicode_GET_SIZE(self);
5192 for (; p < e; p++) {
5193 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005194 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005196 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197}
5198
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005199PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005200"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005201\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005202Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005203and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005204
5205static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005206unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005207{
5208 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5209 register const Py_UNICODE *e;
5210
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005211 /* Shortcut for single character strings */
5212 if (PyUnicode_GET_SIZE(self) == 1 &&
5213 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005214 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005215
5216 /* Special case for empty strings */
5217 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005218 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005219
5220 e = p + PyUnicode_GET_SIZE(self);
5221 for (; p < e; p++) {
5222 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005223 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005224 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005225 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005226}
5227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005228PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005229"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005230\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005231Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005232and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005233
5234static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005235unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005236{
5237 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5238 register const Py_UNICODE *e;
5239
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005240 /* Shortcut for single character strings */
5241 if (PyUnicode_GET_SIZE(self) == 1 &&
5242 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005244
5245 /* Special case for empty strings */
5246 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005247 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005248
5249 e = p + PyUnicode_GET_SIZE(self);
5250 for (; p < e; p++) {
5251 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005252 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005254 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005255}
5256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005257PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005258"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005260Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
5263static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005264unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265{
5266 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5267 register const Py_UNICODE *e;
5268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 /* Shortcut for single character strings */
5270 if (PyUnicode_GET_SIZE(self) == 1 &&
5271 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005272 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005274 /* Special case for empty strings */
5275 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005276 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 e = p + PyUnicode_GET_SIZE(self);
5279 for (; p < e; p++) {
5280 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005281 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005283 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284}
5285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005286PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005287"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005289Return True if all characters in S are digits\n\
5290and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291
5292static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005293unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294{
5295 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5296 register const Py_UNICODE *e;
5297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 /* Shortcut for single character strings */
5299 if (PyUnicode_GET_SIZE(self) == 1 &&
5300 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005301 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005303 /* Special case for empty strings */
5304 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005305 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005306
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 e = p + PyUnicode_GET_SIZE(self);
5308 for (; p < e; p++) {
5309 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005310 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005312 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313}
5314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005315PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005316"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005318Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005319False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
5321static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005322unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
5324 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5325 register const Py_UNICODE *e;
5326
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 /* Shortcut for single character strings */
5328 if (PyUnicode_GET_SIZE(self) == 1 &&
5329 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005330 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005332 /* Special case for empty strings */
5333 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005334 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 e = p + PyUnicode_GET_SIZE(self);
5337 for (; p < e; p++) {
5338 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005339 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005341 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342}
5343
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005344PyDoc_STRVAR(iswide__doc__,
5345"S.iswide() -> bool\n\
5346\n\
5347Return True if all characters in S are wide width\n\
5348and there is at least one character in S, False otherwise.");
5349
5350static PyObject*
5351unicode_iswide(PyUnicodeObject *self)
5352{
5353 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5354 register const Py_UNICODE *e;
5355
5356 /* Shortcut for single character strings */
5357 if (PyUnicode_GET_SIZE(self) == 1 &&
5358 Py_UNICODE_ISWIDE(*p))
5359 Py_RETURN_TRUE;
5360
5361 /* Special case for empty strings */
5362 if (PyString_GET_SIZE(self) == 0)
5363 Py_RETURN_FALSE;
5364
5365 e = p + PyUnicode_GET_SIZE(self);
5366 for (; p < e; p++) {
5367 if (!Py_UNICODE_ISWIDE(*p))
5368 Py_RETURN_FALSE;
5369 }
5370 Py_RETURN_TRUE;
5371}
5372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005373PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374"S.join(sequence) -> unicode\n\
5375\n\
5376Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005377sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378
5379static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005380unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005382 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383}
5384
5385static int
5386unicode_length(PyUnicodeObject *self)
5387{
5388 return self->length;
5389}
5390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005391PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005392"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393\n\
5394Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005395done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396
5397static PyObject *
5398unicode_ljust(PyUnicodeObject *self, PyObject *args)
5399{
5400 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005401 Py_UNICODE fillchar = ' ';
5402
5403 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 return NULL;
5405
Tim Peters7a29bd52001-09-12 03:03:31 +00005406 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 Py_INCREF(self);
5408 return (PyObject*) self;
5409 }
5410
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005411 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412}
5413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005414PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415"S.lower() -> unicode\n\
5416\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005417Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
5419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005420unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 return fixup(self, fixlower);
5423}
5424
/* Strip modes shared by the strip/lstrip/rstrip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above.
   Both the characters and the table slots are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5433
5434static const Py_UNICODE *
5435unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5436{
Tim Peters030a5ce2002-04-22 19:00:10 +00005437 size_t i;
5438 for (i = 0; i < n; ++i)
5439 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005440 return s+i;
5441 return NULL;
5442}
5443
5444/* externally visible for str.strip(unicode) */
5445PyObject *
5446_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5447{
5448 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5449 int len = PyUnicode_GET_SIZE(self);
5450 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5451 int seplen = PyUnicode_GET_SIZE(sepobj);
5452 int i, j;
5453
5454 i = 0;
5455 if (striptype != RIGHTSTRIP) {
5456 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5457 i++;
5458 }
5459 }
5460
5461 j = len;
5462 if (striptype != LEFTSTRIP) {
5463 do {
5464 j--;
5465 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5466 j++;
5467 }
5468
5469 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5470 Py_INCREF(self);
5471 return (PyObject*)self;
5472 }
5473 else
5474 return PyUnicode_FromUnicode(s+i, j-i);
5475}
5476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
5478static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005479do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005481 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5482 int len = PyUnicode_GET_SIZE(self), i, j;
5483
5484 i = 0;
5485 if (striptype != RIGHTSTRIP) {
5486 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5487 i++;
5488 }
5489 }
5490
5491 j = len;
5492 if (striptype != LEFTSTRIP) {
5493 do {
5494 j--;
5495 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5496 j++;
5497 }
5498
5499 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5500 Py_INCREF(self);
5501 return (PyObject*)self;
5502 }
5503 else
5504 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505}
5506
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005507
5508static PyObject *
5509do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5510{
5511 PyObject *sep = NULL;
5512
5513 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5514 return NULL;
5515
5516 if (sep != NULL && sep != Py_None) {
5517 if (PyUnicode_Check(sep))
5518 return _PyUnicode_XStrip(self, striptype, sep);
5519 else if (PyString_Check(sep)) {
5520 PyObject *res;
5521 sep = PyUnicode_FromObject(sep);
5522 if (sep==NULL)
5523 return NULL;
5524 res = _PyUnicode_XStrip(self, striptype, sep);
5525 Py_DECREF(sep);
5526 return res;
5527 }
5528 else {
5529 PyErr_Format(PyExc_TypeError,
5530 "%s arg must be None, unicode or str",
5531 STRIPNAME(striptype));
5532 return NULL;
5533 }
5534 }
5535
5536 return do_strip(self, striptype);
5537}
5538
5539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005540PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005541"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005542\n\
5543Return a copy of the string S with leading and trailing\n\
5544whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005545If chars is given and not None, remove characters in chars instead.\n\
5546If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005547
5548static PyObject *
5549unicode_strip(PyUnicodeObject *self, PyObject *args)
5550{
5551 if (PyTuple_GET_SIZE(args) == 0)
5552 return do_strip(self, BOTHSTRIP); /* Common case */
5553 else
5554 return do_argstrip(self, BOTHSTRIP, args);
5555}
5556
5557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005558PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005559"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005560\n\
5561Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005562If chars is given and not None, remove characters in chars instead.\n\
5563If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005564
5565static PyObject *
5566unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5567{
5568 if (PyTuple_GET_SIZE(args) == 0)
5569 return do_strip(self, LEFTSTRIP); /* Common case */
5570 else
5571 return do_argstrip(self, LEFTSTRIP, args);
5572}
5573
5574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005575PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005576"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005577\n\
5578Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005579If chars is given and not None, remove characters in chars instead.\n\
5580If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005581
5582static PyObject *
5583unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5584{
5585 if (PyTuple_GET_SIZE(args) == 0)
5586 return do_strip(self, RIGHTSTRIP); /* Common case */
5587 else
5588 return do_argstrip(self, RIGHTSTRIP, args);
5589}
5590
5591
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592static PyObject*
5593unicode_repeat(PyUnicodeObject *str, int len)
5594{
5595 PyUnicodeObject *u;
5596 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005597 int nchars;
5598 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
5600 if (len < 0)
5601 len = 0;
5602
Tim Peters7a29bd52001-09-12 03:03:31 +00005603 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 /* no repeat, return original string */
5605 Py_INCREF(str);
5606 return (PyObject*) str;
5607 }
Tim Peters8f422462000-09-09 06:13:41 +00005608
5609 /* ensure # of chars needed doesn't overflow int and # of bytes
5610 * needed doesn't overflow size_t
5611 */
5612 nchars = len * str->length;
5613 if (len && nchars / len != str->length) {
5614 PyErr_SetString(PyExc_OverflowError,
5615 "repeated string is too long");
5616 return NULL;
5617 }
5618 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5619 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5620 PyErr_SetString(PyExc_OverflowError,
5621 "repeated string is too long");
5622 return NULL;
5623 }
5624 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 if (!u)
5626 return NULL;
5627
5628 p = u->str;
5629
5630 while (len-- > 0) {
5631 Py_UNICODE_COPY(p, str->str, str->length);
5632 p += str->length;
5633 }
5634
5635 return (PyObject*) u;
5636}
5637
5638PyObject *PyUnicode_Replace(PyObject *obj,
5639 PyObject *subobj,
5640 PyObject *replobj,
5641 int maxcount)
5642{
5643 PyObject *self;
5644 PyObject *str1;
5645 PyObject *str2;
5646 PyObject *result;
5647
5648 self = PyUnicode_FromObject(obj);
5649 if (self == NULL)
5650 return NULL;
5651 str1 = PyUnicode_FromObject(subobj);
5652 if (str1 == NULL) {
5653 Py_DECREF(self);
5654 return NULL;
5655 }
5656 str2 = PyUnicode_FromObject(replobj);
5657 if (str2 == NULL) {
5658 Py_DECREF(self);
5659 Py_DECREF(str1);
5660 return NULL;
5661 }
Tim Petersced69f82003-09-16 20:30:58 +00005662 result = replace((PyUnicodeObject *)self,
5663 (PyUnicodeObject *)str1,
5664 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 maxcount);
5666 Py_DECREF(self);
5667 Py_DECREF(str1);
5668 Py_DECREF(str2);
5669 return result;
5670}
5671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005672PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673"S.replace (old, new[, maxsplit]) -> unicode\n\
5674\n\
5675Return a copy of S with all occurrences of substring\n\
5676old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005677given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678
5679static PyObject*
5680unicode_replace(PyUnicodeObject *self, PyObject *args)
5681{
5682 PyUnicodeObject *str1;
5683 PyUnicodeObject *str2;
5684 int maxcount = -1;
5685 PyObject *result;
5686
5687 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5688 return NULL;
5689 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5690 if (str1 == NULL)
5691 return NULL;
5692 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005693 if (str2 == NULL) {
5694 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
5698 result = replace(self, str1, str2, maxcount);
5699
5700 Py_DECREF(str1);
5701 Py_DECREF(str2);
5702 return result;
5703}
5704
5705static
5706PyObject *unicode_repr(PyObject *unicode)
5707{
5708 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5709 PyUnicode_GET_SIZE(unicode),
5710 1);
5711}
5712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005713PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714"S.rfind(sub [,start [,end]]) -> int\n\
5715\n\
5716Return the highest index in S where substring sub is found,\n\
5717such that sub is contained within s[start,end]. Optional\n\
5718arguments start and end are interpreted as in slice notation.\n\
5719\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005720Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
5722static PyObject *
5723unicode_rfind(PyUnicodeObject *self, PyObject *args)
5724{
5725 PyUnicodeObject *substring;
5726 int start = 0;
5727 int end = INT_MAX;
5728 PyObject *result;
5729
Guido van Rossumb8872e62000-05-09 14:14:27 +00005730 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5731 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 return NULL;
5733 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5734 (PyObject *)substring);
5735 if (substring == NULL)
5736 return NULL;
5737
5738 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5739
5740 Py_DECREF(substring);
5741 return result;
5742}
5743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005744PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745"S.rindex(sub [,start [,end]]) -> int\n\
5746\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005747Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
5749static PyObject *
5750unicode_rindex(PyUnicodeObject *self, PyObject *args)
5751{
5752 int result;
5753 PyUnicodeObject *substring;
5754 int start = 0;
5755 int end = INT_MAX;
5756
Guido van Rossumb8872e62000-05-09 14:14:27 +00005757 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5758 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 return NULL;
5760 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5761 (PyObject *)substring);
5762 if (substring == NULL)
5763 return NULL;
5764
5765 result = findstring(self, substring, start, end, -1);
5766
5767 Py_DECREF(substring);
5768 if (result < 0) {
5769 PyErr_SetString(PyExc_ValueError, "substring not found");
5770 return NULL;
5771 }
5772 return PyInt_FromLong(result);
5773}
5774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005775PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005776"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777\n\
5778Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005779done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780
5781static PyObject *
5782unicode_rjust(PyUnicodeObject *self, PyObject *args)
5783{
5784 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005785 Py_UNICODE fillchar = ' ';
5786
5787 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 return NULL;
5789
Tim Peters7a29bd52001-09-12 03:03:31 +00005790 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 Py_INCREF(self);
5792 return (PyObject*) self;
5793 }
5794
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005795 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796}
5797
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798static PyObject*
5799unicode_slice(PyUnicodeObject *self, int start, int end)
5800{
5801 /* standard clamping */
5802 if (start < 0)
5803 start = 0;
5804 if (end < 0)
5805 end = 0;
5806 if (end > self->length)
5807 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005808 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 /* full slice, return original string */
5810 Py_INCREF(self);
5811 return (PyObject*) self;
5812 }
5813 if (start > end)
5814 start = end;
5815 /* copy slice */
5816 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5817 end - start);
5818}
5819
5820PyObject *PyUnicode_Split(PyObject *s,
5821 PyObject *sep,
5822 int maxsplit)
5823{
5824 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005825
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 s = PyUnicode_FromObject(s);
5827 if (s == NULL)
5828 return NULL;
5829 if (sep != NULL) {
5830 sep = PyUnicode_FromObject(sep);
5831 if (sep == NULL) {
5832 Py_DECREF(s);
5833 return NULL;
5834 }
5835 }
5836
5837 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5838
5839 Py_DECREF(s);
5840 Py_XDECREF(sep);
5841 return result;
5842}
5843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005844PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845"S.split([sep [,maxsplit]]) -> list of strings\n\
5846\n\
5847Return a list of the words in S, using sep as the\n\
5848delimiter string. If maxsplit is given, at most maxsplit\n\
5849splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005850is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
5852static PyObject*
5853unicode_split(PyUnicodeObject *self, PyObject *args)
5854{
5855 PyObject *substring = Py_None;
5856 int maxcount = -1;
5857
5858 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5859 return NULL;
5860
5861 if (substring == Py_None)
5862 return split(self, NULL, maxcount);
5863 else if (PyUnicode_Check(substring))
5864 return split(self, (PyUnicodeObject *)substring, maxcount);
5865 else
5866 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5867}
5868
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869PyObject *PyUnicode_RSplit(PyObject *s,
5870 PyObject *sep,
5871 int maxsplit)
5872{
5873 PyObject *result;
5874
5875 s = PyUnicode_FromObject(s);
5876 if (s == NULL)
5877 return NULL;
5878 if (sep != NULL) {
5879 sep = PyUnicode_FromObject(sep);
5880 if (sep == NULL) {
5881 Py_DECREF(s);
5882 return NULL;
5883 }
5884 }
5885
5886 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5887
5888 Py_DECREF(s);
5889 Py_XDECREF(sep);
5890 return result;
5891}
5892
5893PyDoc_STRVAR(rsplit__doc__,
5894"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5895\n\
5896Return a list of the words in S, using sep as the\n\
5897delimiter string, starting at the end of the string and\n\
5898working to the front. If maxsplit is given, at most maxsplit\n\
5899splits are done. If sep is not specified, any whitespace string\n\
5900is a separator.");
5901
5902static PyObject*
5903unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5904{
5905 PyObject *substring = Py_None;
5906 int maxcount = -1;
5907
5908 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
5909 return NULL;
5910
5911 if (substring == Py_None)
5912 return rsplit(self, NULL, maxcount);
5913 else if (PyUnicode_Check(substring))
5914 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
5915 else
5916 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
5917}
5918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005919PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005920"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921\n\
5922Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005923Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005924is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
5926static PyObject*
5927unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5928{
Guido van Rossum86662912000-04-11 15:38:46 +00005929 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Guido van Rossum86662912000-04-11 15:38:46 +00005931 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 return NULL;
5933
Guido van Rossum86662912000-04-11 15:38:46 +00005934 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935}
5936
5937static
5938PyObject *unicode_str(PyUnicodeObject *self)
5939{
Fred Drakee4315f52000-05-09 19:53:39 +00005940 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941}
5942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005943PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944"S.swapcase() -> unicode\n\
5945\n\
5946Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948
5949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005950unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 return fixup(self, fixswapcase);
5953}
5954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005955PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956"S.translate(table) -> unicode\n\
5957\n\
5958Return a copy of the string S, where all characters have been mapped\n\
5959through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005960Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5961Unmapped characters are left untouched. Characters mapped to None\n\
5962are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
5964static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005965unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966{
Tim Petersced69f82003-09-16 20:30:58 +00005967 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005969 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 "ignore");
5971}
5972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005973PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974"S.upper() -> unicode\n\
5975\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005976Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
5978static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005979unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 return fixup(self, fixupper);
5982}
5983
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005984PyDoc_STRVAR(width__doc__,
5985"S.width() -> unicode\n\
5986\n\
5987Return a fixed-width representation length of S.");
5988
5989static PyObject*
5990unicode_width(PyObject *self)
5991{
5992 int width = PyUnicode_GetWidth(self);
5993 if (width == -1)
5994 return NULL;
5995 else
5996 return PyInt_FromLong((long)width);
5997}
5998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005999PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000"S.zfill(width) -> unicode\n\
6001\n\
6002Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006003of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
6005static PyObject *
6006unicode_zfill(PyUnicodeObject *self, PyObject *args)
6007{
6008 int fill;
6009 PyUnicodeObject *u;
6010
6011 int width;
6012 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6013 return NULL;
6014
6015 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006016 if (PyUnicode_CheckExact(self)) {
6017 Py_INCREF(self);
6018 return (PyObject*) self;
6019 }
6020 else
6021 return PyUnicode_FromUnicode(
6022 PyUnicode_AS_UNICODE(self),
6023 PyUnicode_GET_SIZE(self)
6024 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 }
6026
6027 fill = width - self->length;
6028
6029 u = pad(self, fill, 0, '0');
6030
Walter Dörwald068325e2002-04-15 13:36:47 +00006031 if (u == NULL)
6032 return NULL;
6033
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 if (u->str[fill] == '+' || u->str[fill] == '-') {
6035 /* move sign to beginning of string */
6036 u->str[0] = u->str[fill];
6037 u->str[fill] = '0';
6038 }
6039
6040 return (PyObject*) u;
6041}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
#if 0
/* Debugging aid (compiled out): expose unicode_freelist_size as a Python
   int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006051PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006052"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006054Return True if S starts with the specified prefix, False otherwise.\n\
6055With optional start, test S beginning at that position.\n\
6056With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
6058static PyObject *
6059unicode_startswith(PyUnicodeObject *self,
6060 PyObject *args)
6061{
6062 PyUnicodeObject *substring;
6063 int start = 0;
6064 int end = INT_MAX;
6065 PyObject *result;
6066
Guido van Rossumb8872e62000-05-09 14:14:27 +00006067 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6068 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 return NULL;
6070 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6071 (PyObject *)substring);
6072 if (substring == NULL)
6073 return NULL;
6074
Guido van Rossum77f6a652002-04-03 22:41:51 +00006075 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
6077 Py_DECREF(substring);
6078 return result;
6079}
6080
6081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006082PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006083"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006085Return True if S ends with the specified suffix, False otherwise.\n\
6086With optional start, test S beginning at that position.\n\
6087With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
6089static PyObject *
6090unicode_endswith(PyUnicodeObject *self,
6091 PyObject *args)
6092{
6093 PyUnicodeObject *substring;
6094 int start = 0;
6095 int end = INT_MAX;
6096 PyObject *result;
6097
Guido van Rossumb8872e62000-05-09 14:14:27 +00006098 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6099 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 return NULL;
6101 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6102 (PyObject *)substring);
6103 if (substring == NULL)
6104 return NULL;
6105
Guido van Rossum77f6a652002-04-03 22:41:51 +00006106 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107
6108 Py_DECREF(substring);
6109 return result;
6110}
6111
6112
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006113
6114static PyObject *
6115unicode_getnewargs(PyUnicodeObject *v)
6116{
6117 return Py_BuildValue("(u#)", v->str, v->length);
6118}
6119
6120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121static PyMethodDef unicode_methods[] = {
6122
6123 /* Order is according to common usage: often used methods should
6124 appear first, since lookup is done sequentially. */
6125
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006126 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6127 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6128 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006129 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006130 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6131 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6132 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6133 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6134 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6135 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6136 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6137 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6138 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6139 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006140 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006141/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6142 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6143 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6144 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006145 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006146 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006147 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006148 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6149 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6150 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6151 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6152 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6153 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6154 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6155 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6156 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6157 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6158 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6159 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6160 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6161 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006162 {"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
6163 {"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006164 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006165#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006166 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167#endif
6168
6169#if 0
6170 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006171 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172#endif
6173
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006174 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 {NULL, NULL}
6176};
6177
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006178static PyObject *
6179unicode_mod(PyObject *v, PyObject *w)
6180{
6181 if (!PyUnicode_Check(v)) {
6182 Py_INCREF(Py_NotImplemented);
6183 return Py_NotImplemented;
6184 }
6185 return PyUnicode_Format(v, w);
6186}
6187
6188static PyNumberMethods unicode_as_number = {
6189 0, /*nb_add*/
6190 0, /*nb_subtract*/
6191 0, /*nb_multiply*/
6192 0, /*nb_divide*/
6193 unicode_mod, /*nb_remainder*/
6194};
6195
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196static PySequenceMethods unicode_as_sequence = {
6197 (inquiry) unicode_length, /* sq_length */
6198 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6199 (intargfunc) unicode_repeat, /* sq_repeat */
6200 (intargfunc) unicode_getitem, /* sq_item */
6201 (intintargfunc) unicode_slice, /* sq_slice */
6202 0, /* sq_ass_item */
6203 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006204 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205};
6206
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006207static PyObject*
6208unicode_subscript(PyUnicodeObject* self, PyObject* item)
6209{
6210 if (PyInt_Check(item)) {
6211 long i = PyInt_AS_LONG(item);
6212 if (i < 0)
6213 i += PyString_GET_SIZE(self);
6214 return unicode_getitem(self, i);
6215 } else if (PyLong_Check(item)) {
6216 long i = PyLong_AsLong(item);
6217 if (i == -1 && PyErr_Occurred())
6218 return NULL;
6219 if (i < 0)
6220 i += PyString_GET_SIZE(self);
6221 return unicode_getitem(self, i);
6222 } else if (PySlice_Check(item)) {
6223 int start, stop, step, slicelength, cur, i;
6224 Py_UNICODE* source_buf;
6225 Py_UNICODE* result_buf;
6226 PyObject* result;
6227
6228 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6229 &start, &stop, &step, &slicelength) < 0) {
6230 return NULL;
6231 }
6232
6233 if (slicelength <= 0) {
6234 return PyUnicode_FromUnicode(NULL, 0);
6235 } else {
6236 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6237 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6238
6239 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6240 result_buf[i] = source_buf[cur];
6241 }
Tim Petersced69f82003-09-16 20:30:58 +00006242
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006243 result = PyUnicode_FromUnicode(result_buf, slicelength);
6244 PyMem_FREE(result_buf);
6245 return result;
6246 }
6247 } else {
6248 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6249 return NULL;
6250 }
6251}
6252
6253static PyMappingMethods unicode_as_mapping = {
6254 (inquiry)unicode_length, /* mp_length */
6255 (binaryfunc)unicode_subscript, /* mp_subscript */
6256 (objobjargproc)0, /* mp_ass_subscript */
6257};
6258
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259static int
6260unicode_buffer_getreadbuf(PyUnicodeObject *self,
6261 int index,
6262 const void **ptr)
6263{
6264 if (index != 0) {
6265 PyErr_SetString(PyExc_SystemError,
6266 "accessing non-existent unicode segment");
6267 return -1;
6268 }
6269 *ptr = (void *) self->str;
6270 return PyUnicode_GET_DATA_SIZE(self);
6271}
6272
6273static int
6274unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6275 const void **ptr)
6276{
6277 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006278 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 return -1;
6280}
6281
6282static int
6283unicode_buffer_getsegcount(PyUnicodeObject *self,
6284 int *lenp)
6285{
6286 if (lenp)
6287 *lenp = PyUnicode_GET_DATA_SIZE(self);
6288 return 1;
6289}
6290
6291static int
6292unicode_buffer_getcharbuf(PyUnicodeObject *self,
6293 int index,
6294 const void **ptr)
6295{
6296 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 if (index != 0) {
6299 PyErr_SetString(PyExc_SystemError,
6300 "accessing non-existent unicode segment");
6301 return -1;
6302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006303 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 if (str == NULL)
6305 return -1;
6306 *ptr = (void *) PyString_AS_STRING(str);
6307 return PyString_GET_SIZE(str);
6308}
6309
6310/* Helpers for PyUnicode_Format() */
6311
6312static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006313getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314{
6315 int argidx = *p_argidx;
6316 if (argidx < arglen) {
6317 (*p_argidx)++;
6318 if (arglen < 0)
6319 return args;
6320 else
6321 return PyTuple_GetItem(args, argidx);
6322 }
6323 PyErr_SetString(PyExc_TypeError,
6324 "not enough arguments for format string");
6325 return NULL;
6326}
6327
/* Bit flags describing a format specifier.  F_ALT selects the "#"
   alternate form in formatfloat()/formatint(); the other meanings are
   presumably as their names suggest -- confirm in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6333
6334static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336{
6337 register int i;
6338 int len;
6339 va_list va;
6340 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343 /* First, format the string as char array, then expand to Py_UNICODE
6344 array. */
6345 charbuffer = (char *)buffer;
6346 len = vsprintf(charbuffer, format, va);
6347 for (i = len - 1; i >= 0; i--)
6348 buffer[i] = (Py_UNICODE) charbuffer[i];
6349
6350 va_end(va);
6351 return len;
6352}
6353
Guido van Rossum078151d2002-08-11 04:24:12 +00006354/* XXX To save some code duplication, formatfloat/long/int could have been
6355 shared with stringobject.c, converting from 8-bit to Unicode after the
6356 formatting is done. */
6357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358static int
6359formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006360 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 int flags,
6362 int prec,
6363 int type,
6364 PyObject *v)
6365{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006366 /* fmt = '%#.' + `prec` + `type`
6367 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 char fmt[20];
6369 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 x = PyFloat_AsDouble(v);
6372 if (x == -1.0 && PyErr_Occurred())
6373 return -1;
6374 if (prec < 0)
6375 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6377 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006378 /* Worst case length calc to ensure no buffer overrun:
6379
6380 'g' formats:
6381 fmt = %#.<prec>g
6382 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6383 for any double rep.)
6384 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6385
6386 'f' formats:
6387 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6388 len = 1 + 50 + 1 + prec = 52 + prec
6389
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006390 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006391 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006392
6393 */
6394 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6395 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006396 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006397 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006398 return -1;
6399 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006400 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6401 (flags&F_ALT) ? "#" : "",
6402 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 return usprintf(buf, fmt, x);
6404}
6405
Tim Peters38fd5b62000-09-21 05:43:11 +00006406static PyObject*
6407formatlong(PyObject *val, int flags, int prec, int type)
6408{
6409 char *buf;
6410 int i, len;
6411 PyObject *str; /* temporary string object. */
6412 PyUnicodeObject *result;
6413
6414 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6415 if (!str)
6416 return NULL;
6417 result = _PyUnicode_New(len);
6418 for (i = 0; i < len; i++)
6419 result->str[i] = buf[i];
6420 result->str[len] = 0;
6421 Py_DECREF(str);
6422 return (PyObject*)result;
6423}
6424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425static int
6426formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006427 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 int flags,
6429 int prec,
6430 int type,
6431 PyObject *v)
6432{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006433 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006434 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6435 * + 1 + 1
6436 * = 24
6437 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006438 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006439 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 long x;
6441
6442 x = PyInt_AsLong(v);
6443 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006444 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006445 if (x < 0 && type == 'u') {
6446 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006447 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006448 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6449 sign = "-";
6450 else
6451 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006453 prec = 1;
6454
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006455 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6456 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006457 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006458 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006459 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006460 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006461 return -1;
6462 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006463
6464 if ((flags & F_ALT) &&
6465 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006466 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006467 * of issues that cause pain:
6468 * - when 0 is being converted, the C standard leaves off
6469 * the '0x' or '0X', which is inconsistent with other
6470 * %#x/%#X conversions and inconsistent with Python's
6471 * hex() function
6472 * - there are platforms that violate the standard and
6473 * convert 0 with the '0x' or '0X'
6474 * (Metrowerks, Compaq Tru64)
6475 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006476 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006477 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006478 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006479 * We can achieve the desired consistency by inserting our
6480 * own '0x' or '0X' prefix, and substituting %x/%X in place
6481 * of %#x/%#X.
6482 *
6483 * Note that this is the same approach as used in
6484 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006485 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006486 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6487 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006488 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006489 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006490 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6491 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006492 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006493 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006494 if (sign[0])
6495 return usprintf(buf, fmt, -x);
6496 else
6497 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
6500static int
6501formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006502 size_t buflen,
6503 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006505 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006506 if (PyUnicode_Check(v)) {
6507 if (PyUnicode_GET_SIZE(v) != 1)
6508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006512 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006513 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006514 goto onError;
6515 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
6518 else {
6519 /* Integer input truncated to a character */
6520 long x;
6521 x = PyInt_AsLong(v);
6522 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006523 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006524#ifdef Py_UNICODE_WIDE
6525 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006526 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006527 "%c arg not in range(0x110000) "
6528 "(wide Python build)");
6529 return -1;
6530 }
6531#else
6532 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006533 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006534 "%c arg not in range(0x10000) "
6535 "(narrow Python build)");
6536 return -1;
6537 }
6538#endif
6539 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 }
6541 buf[1] = '\0';
6542 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006543
6544 onError:
6545 PyErr_SetString(PyExc_TypeError,
6546 "%c requires int or char");
6547 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548}
6549
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro stays a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
6559
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560PyObject *PyUnicode_Format(PyObject *format,
6561 PyObject *args)
6562{
6563 Py_UNICODE *fmt, *res;
6564 int fmtcnt, rescnt, reslen, arglen, argidx;
6565 int args_owned = 0;
6566 PyUnicodeObject *result = NULL;
6567 PyObject *dict = NULL;
6568 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 if (format == NULL || args == NULL) {
6571 PyErr_BadInternalCall();
6572 return NULL;
6573 }
6574 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006575 if (uformat == NULL)
6576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 fmt = PyUnicode_AS_UNICODE(uformat);
6578 fmtcnt = PyUnicode_GET_SIZE(uformat);
6579
6580 reslen = rescnt = fmtcnt + 100;
6581 result = _PyUnicode_New(reslen);
6582 if (result == NULL)
6583 goto onError;
6584 res = PyUnicode_AS_UNICODE(result);
6585
6586 if (PyTuple_Check(args)) {
6587 arglen = PyTuple_Size(args);
6588 argidx = 0;
6589 }
6590 else {
6591 arglen = -1;
6592 argidx = -2;
6593 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006594 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6595 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 dict = args;
6597
6598 while (--fmtcnt >= 0) {
6599 if (*fmt != '%') {
6600 if (--rescnt < 0) {
6601 rescnt = fmtcnt + 100;
6602 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006603 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 return NULL;
6605 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6606 --rescnt;
6607 }
6608 *res++ = *fmt++;
6609 }
6610 else {
6611 /* Got a format specifier */
6612 int flags = 0;
6613 int width = -1;
6614 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 Py_UNICODE c = '\0';
6616 Py_UNICODE fill;
6617 PyObject *v = NULL;
6618 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006619 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 Py_UNICODE sign;
6621 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006622 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
6624 fmt++;
6625 if (*fmt == '(') {
6626 Py_UNICODE *keystart;
6627 int keylen;
6628 PyObject *key;
6629 int pcount = 1;
6630
6631 if (dict == NULL) {
6632 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006633 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 goto onError;
6635 }
6636 ++fmt;
6637 --fmtcnt;
6638 keystart = fmt;
6639 /* Skip over balanced parentheses */
6640 while (pcount > 0 && --fmtcnt >= 0) {
6641 if (*fmt == ')')
6642 --pcount;
6643 else if (*fmt == '(')
6644 ++pcount;
6645 fmt++;
6646 }
6647 keylen = fmt - keystart - 1;
6648 if (fmtcnt < 0 || pcount > 0) {
6649 PyErr_SetString(PyExc_ValueError,
6650 "incomplete format key");
6651 goto onError;
6652 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006653#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006654 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 then looked up since Python uses strings to hold
6656 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006657 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 key = PyUnicode_EncodeUTF8(keystart,
6659 keylen,
6660 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006661#else
6662 key = PyUnicode_FromUnicode(keystart, keylen);
6663#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 if (key == NULL)
6665 goto onError;
6666 if (args_owned) {
6667 Py_DECREF(args);
6668 args_owned = 0;
6669 }
6670 args = PyObject_GetItem(dict, key);
6671 Py_DECREF(key);
6672 if (args == NULL) {
6673 goto onError;
6674 }
6675 args_owned = 1;
6676 arglen = -1;
6677 argidx = -2;
6678 }
6679 while (--fmtcnt >= 0) {
6680 switch (c = *fmt++) {
6681 case '-': flags |= F_LJUST; continue;
6682 case '+': flags |= F_SIGN; continue;
6683 case ' ': flags |= F_BLANK; continue;
6684 case '#': flags |= F_ALT; continue;
6685 case '0': flags |= F_ZERO; continue;
6686 }
6687 break;
6688 }
6689 if (c == '*') {
6690 v = getnextarg(args, arglen, &argidx);
6691 if (v == NULL)
6692 goto onError;
6693 if (!PyInt_Check(v)) {
6694 PyErr_SetString(PyExc_TypeError,
6695 "* wants int");
6696 goto onError;
6697 }
6698 width = PyInt_AsLong(v);
6699 if (width < 0) {
6700 flags |= F_LJUST;
6701 width = -width;
6702 }
6703 if (--fmtcnt >= 0)
6704 c = *fmt++;
6705 }
6706 else if (c >= '0' && c <= '9') {
6707 width = c - '0';
6708 while (--fmtcnt >= 0) {
6709 c = *fmt++;
6710 if (c < '0' || c > '9')
6711 break;
6712 if ((width*10) / 10 != width) {
6713 PyErr_SetString(PyExc_ValueError,
6714 "width too big");
6715 goto onError;
6716 }
6717 width = width*10 + (c - '0');
6718 }
6719 }
6720 if (c == '.') {
6721 prec = 0;
6722 if (--fmtcnt >= 0)
6723 c = *fmt++;
6724 if (c == '*') {
6725 v = getnextarg(args, arglen, &argidx);
6726 if (v == NULL)
6727 goto onError;
6728 if (!PyInt_Check(v)) {
6729 PyErr_SetString(PyExc_TypeError,
6730 "* wants int");
6731 goto onError;
6732 }
6733 prec = PyInt_AsLong(v);
6734 if (prec < 0)
6735 prec = 0;
6736 if (--fmtcnt >= 0)
6737 c = *fmt++;
6738 }
6739 else if (c >= '0' && c <= '9') {
6740 prec = c - '0';
6741 while (--fmtcnt >= 0) {
6742 c = Py_CHARMASK(*fmt++);
6743 if (c < '0' || c > '9')
6744 break;
6745 if ((prec*10) / 10 != prec) {
6746 PyErr_SetString(PyExc_ValueError,
6747 "prec too big");
6748 goto onError;
6749 }
6750 prec = prec*10 + (c - '0');
6751 }
6752 }
6753 } /* prec */
6754 if (fmtcnt >= 0) {
6755 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (--fmtcnt >= 0)
6757 c = *fmt++;
6758 }
6759 }
6760 if (fmtcnt < 0) {
6761 PyErr_SetString(PyExc_ValueError,
6762 "incomplete format");
6763 goto onError;
6764 }
6765 if (c != '%') {
6766 v = getnextarg(args, arglen, &argidx);
6767 if (v == NULL)
6768 goto onError;
6769 }
6770 sign = 0;
6771 fill = ' ';
6772 switch (c) {
6773
6774 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006775 pbuf = formatbuf;
6776 /* presume that buffer length is at least 1 */
6777 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 len = 1;
6779 break;
6780
6781 case 's':
6782 case 'r':
6783 if (PyUnicode_Check(v) && c == 's') {
6784 temp = v;
6785 Py_INCREF(temp);
6786 }
6787 else {
6788 PyObject *unicode;
6789 if (c == 's')
6790 temp = PyObject_Str(v);
6791 else
6792 temp = PyObject_Repr(v);
6793 if (temp == NULL)
6794 goto onError;
6795 if (!PyString_Check(temp)) {
6796 /* XXX Note: this should never happen, since
6797 PyObject_Repr() and PyObject_Str() assure
6798 this */
6799 Py_DECREF(temp);
6800 PyErr_SetString(PyExc_TypeError,
6801 "%s argument has non-string str()");
6802 goto onError;
6803 }
Fred Drakee4315f52000-05-09 19:53:39 +00006804 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006806 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 "strict");
6808 Py_DECREF(temp);
6809 temp = unicode;
6810 if (temp == NULL)
6811 goto onError;
6812 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006813 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 len = PyUnicode_GET_SIZE(temp);
6815 if (prec >= 0 && len > prec)
6816 len = prec;
6817 break;
6818
6819 case 'i':
6820 case 'd':
6821 case 'u':
6822 case 'o':
6823 case 'x':
6824 case 'X':
6825 if (c == 'i')
6826 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006827 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006828 temp = formatlong(v, flags, prec, c);
6829 if (!temp)
6830 goto onError;
6831 pbuf = PyUnicode_AS_UNICODE(temp);
6832 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006833 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006835 else {
6836 pbuf = formatbuf;
6837 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6838 flags, prec, c, v);
6839 if (len < 0)
6840 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006841 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006842 }
6843 if (flags & F_ZERO)
6844 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 break;
6846
6847 case 'e':
6848 case 'E':
6849 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006850 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 case 'g':
6852 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006853 if (c == 'F')
6854 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006855 pbuf = formatbuf;
6856 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6857 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 if (len < 0)
6859 goto onError;
6860 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006861 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 fill = '0';
6863 break;
6864
6865 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006866 pbuf = formatbuf;
6867 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 if (len < 0)
6869 goto onError;
6870 break;
6871
6872 default:
6873 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006874 "unsupported format character '%c' (0x%x) "
6875 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006876 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006877 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006878 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 goto onError;
6880 }
6881 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006882 if (*pbuf == '-' || *pbuf == '+') {
6883 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 len--;
6885 }
6886 else if (flags & F_SIGN)
6887 sign = '+';
6888 else if (flags & F_BLANK)
6889 sign = ' ';
6890 else
6891 sign = 0;
6892 }
6893 if (width < len)
6894 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006895 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 reslen -= rescnt;
6897 rescnt = width + fmtcnt + 100;
6898 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006899 if (reslen < 0) {
6900 Py_DECREF(result);
6901 return PyErr_NoMemory();
6902 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006903 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 return NULL;
6905 res = PyUnicode_AS_UNICODE(result)
6906 + reslen - rescnt;
6907 }
6908 if (sign) {
6909 if (fill != ' ')
6910 *res++ = sign;
6911 rescnt--;
6912 if (width > len)
6913 width--;
6914 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006915 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6916 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006917 assert(pbuf[1] == c);
6918 if (fill != ' ') {
6919 *res++ = *pbuf++;
6920 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006921 }
Tim Petersfff53252001-04-12 18:38:48 +00006922 rescnt -= 2;
6923 width -= 2;
6924 if (width < 0)
6925 width = 0;
6926 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (width > len && !(flags & F_LJUST)) {
6929 do {
6930 --rescnt;
6931 *res++ = fill;
6932 } while (--width > len);
6933 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006934 if (fill == ' ') {
6935 if (sign)
6936 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006937 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006938 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006939 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006940 *res++ = *pbuf++;
6941 *res++ = *pbuf++;
6942 }
6943 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006944 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 res += len;
6946 rescnt -= len;
6947 while (--width >= len) {
6948 --rescnt;
6949 *res++ = ' ';
6950 }
6951 if (dict && (argidx < arglen) && c != '%') {
6952 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006953 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 goto onError;
6955 }
6956 Py_XDECREF(temp);
6957 } /* '%' */
6958 } /* until end */
6959 if (argidx < arglen && !dict) {
6960 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006961 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 goto onError;
6963 }
6964
6965 if (args_owned) {
6966 Py_DECREF(args);
6967 }
6968 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006969 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006970 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 return (PyObject *)result;
6972
6973 onError:
6974 Py_XDECREF(result);
6975 Py_DECREF(uformat);
6976 if (args_owned) {
6977 Py_DECREF(args);
6978 }
6979 return NULL;
6980}
6981
/* Buffer-interface slot table for the unicode type; each entry is one of
   the unicode_buffer_* functions cast to the corresponding slot type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,      /* bf_getreadbuffer */
    (getwritebufferproc) unicode_buffer_getwritebuf,    /* bf_getwritebuffer */
    (getsegcountproc) unicode_buffer_getsegcount,       /* bf_getsegcount */
    (getcharbufferproc) unicode_buffer_getcharbuf,      /* bf_getcharbuffer */
};
6988
Jeremy Hylton938ace62002-07-17 16:30:39 +00006989static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006990unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6991
Tim Peters6d6c1a32001-08-02 04:15:00 +00006992static PyObject *
6993unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6994{
6995 PyObject *x = NULL;
6996 static char *kwlist[] = {"string", "encoding", "errors", 0};
6997 char *encoding = NULL;
6998 char *errors = NULL;
6999
Guido van Rossume023fe02001-08-30 03:12:59 +00007000 if (type != &PyUnicode_Type)
7001 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007002 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7003 kwlist, &x, &encoding, &errors))
7004 return NULL;
7005 if (x == NULL)
7006 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007007 if (encoding == NULL && errors == NULL)
7008 return PyObject_Unicode(x);
7009 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007010 return PyUnicode_FromEncodedObject(x, encoding, errors);
7011}
7012
Guido van Rossume023fe02001-08-30 03:12:59 +00007013static PyObject *
7014unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7015{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007016 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007017 int n;
7018
7019 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7020 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7021 if (tmp == NULL)
7022 return NULL;
7023 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007024 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007025 if (pnew == NULL) {
7026 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007027 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007028 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007029 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7030 if (pnew->str == NULL) {
7031 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007032 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007033 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007034 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007035 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007036 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7037 pnew->length = n;
7038 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007039 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007040 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007041}
7042
/* Docstring of the unicode type; referenced as the tp_doc entry of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007049
/* Type object for the built-in `unicode` type.  It derives from
   PyBaseString_Type, may itself be subclassed (Py_TPFLAGS_BASETYPE), and
   is finalized by the PyType_Ready() call in _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7094
7095/* Initialize the Unicode implementation */
7096
Thomas Wouters78890102000-07-22 19:25:51 +00007097void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007099 int i;
7100
Fred Drakee4315f52000-05-09 19:53:39 +00007101 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007102 unicode_freelist = NULL;
7103 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007105 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007106 for (i = 0; i < 256; i++)
7107 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007108 if (PyType_Ready(&PyUnicode_Type) < 0)
7109 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
7112/* Finalize the Unicode implementation */
7113
7114void
Thomas Wouters78890102000-07-22 19:25:51 +00007115_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007117 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007118 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007120 Py_XDECREF(unicode_empty);
7121 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007122
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007123 for (i = 0; i < 256; i++) {
7124 if (unicode_latin1[i]) {
7125 Py_DECREF(unicode_latin1[i]);
7126 unicode_latin1[i] = NULL;
7127 }
7128 }
7129
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007130 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 PyUnicodeObject *v = u;
7132 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007133 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007134 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007135 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007136 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007138 unicode_freelist = NULL;
7139 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007141
7142/*
7143Local variables:
7144c-basic-offset: 4
7145indent-tabs-mode: nil
7146End:
7147*/