blob: 9e35b61d8a5c7336e9c52c7ce7fad099dfdfa2d6 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for Unicode objects placed on the free list.

   When an object is put on the free list and its length is below this
   limit, its character buffer is left allocated so it can be reused by
   the next allocation, saving a malloc()/free() pair for small strings.
   Objects at or above the limit have their buffer released.

   In the worst case this pins roughly MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of otherwise unused memory.

   A limit of 0 effectively disables the optimization.

   Note: this is an experimental feature. If Unicode objects cause core
   dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian is assumed unless the build
   defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Thomas Wouters477c8d52006-05-27 19:21:47 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
129/* stuff to implement simple "bloom filters" for Unicode characters.
130 to keep things simple, we use a single bitmask, using the least 5
131 bits from each unicode characters as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type holding the filter bits; only the low 32 bits are used
   because the bit index is taken from the least 5 bits of a character. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is done on an unsigned long: shifting a plain int 1
   by 31 would move a bit into the sign position, which is undefined
   behaviour and sign-extends into the upper mask bits on LP64 targets.
   `mask` is parenthesized so that expression arguments bind correctly. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Fast pre-check against the linebreak mask before the exact (and more
   expensive) Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
166 return 0;
167}
168
/* Membership test: cheap bloom pre-check first, exact scan of the set
   only when the filter bit is present.  The whole expansion is wrapped
   in parentheses so the macro behaves as a single operand (for example
   under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000187
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Convenience wrapper for use inside unicodeobject.c only: lets callers
   pass the address of a PyUnicodeObject * variable without casting. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Per-character UTF-7 classification for the 128 ASCII code points,
   telling whether a character is special, i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
*/
static
char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  SP ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A B C D E F G H I J K L M N O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a b c d e f g h i j k l m n o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p q r s t u v w x y z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true if character c may not be written
   directly: it is outside 1..127, is marked 1 in utf7_special, or is
   marked as whitespace (2) / Set O (3) while the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  The range test comes first so
   that isalnum() is only ever called with a value in 0..127: passing it
   anything that does not fit in an unsigned char is undefined, and values
   0x80..0xff would make the answer locale dependent. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* UB64(c): numeric value (0..63) of base64 digit c; c must satisfy
   B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, write
   the next base64 digit to *out (advancing it) and decrease bits by 6.
   `out` and `bits` must be modifiable lvalues. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are pending
   in ch, extract the next 16-bit unit and append it to *out.  Expects a
   variable `errmsg` and a label `utf7Error` in the expanding function.

   NOTE(review): the range tested below, 0xDC00..0xDFFF, is the low
   surrogate range, while the comments speak of an error already raised for
   the high surrogate -- confirm which range was intended. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence it
   starts.  Zero means the byte is an illegal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Thomas Wouters477c8d52006-05-27 19:21:47 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019 Py_ssize_t size,
2020 Py_UNICODE ch)
2021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044 if (repr == NULL)
2045 return NULL;
2046
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002047 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002051 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 !findchar(s, size, '"')) ? '"' : '\'';
2053 }
2054 while (size-- > 0) {
2055 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002056
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002057 /* Escape quotes and backslashes */
2058 if ((quotes &&
2059 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 *p++ = '\\';
2061 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002062 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002065#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002066 /* Map 21-bit characters to '\U00xxxxxx' */
2067 else if (ch >= 0x10000) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002068 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002069
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002070 /* Resize the string if necessary */
2071 if (offset + 12 > PyString_GET_SIZE(repr)) {
2072 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002073 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074 p = PyString_AS_STRING(repr) + offset;
2075 }
2076
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077 *p++ = '\\';
2078 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002086 *p++ = hexdigit[ch & 0x0000000F];
2087 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002089#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091 else if (ch >= 0xD800 && ch < 0xDC00) {
2092 Py_UNICODE ch2;
2093 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002094
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 ch2 = *s++;
2096 size--;
2097 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099 *p++ = '\\';
2100 *p++ = 'U';
2101 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108 *p++ = hexdigit[ucs & 0x0000000F];
2109 continue;
2110 }
2111 /* Fall through: isolated surrogates are copied as-is */
2112 s--;
2113 size++;
2114 }
2115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 *p++ = '\\';
2119 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002120 *p++ = hexdigit[(ch >> 12) & 0x000F];
2121 *p++ = hexdigit[(ch >> 8) & 0x000F];
2122 *p++ = hexdigit[(ch >> 4) & 0x000F];
2123 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002125
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002126 /* Map special whitespace to '\t', \n', '\r' */
2127 else if (ch == '\t') {
2128 *p++ = '\\';
2129 *p++ = 't';
2130 }
2131 else if (ch == '\n') {
2132 *p++ = '\\';
2133 *p++ = 'n';
2134 }
2135 else if (ch == '\r') {
2136 *p++ = '\\';
2137 *p++ = 'r';
2138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002140 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002141 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144 *p++ = hexdigit[(ch >> 4) & 0x000F];
2145 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Copy everything else as-is */
2149 else
2150 *p++ = (char) ch;
2151 }
2152 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002153 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002156 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 return repr;
2158}
2159
2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162{
2163 return unicodeescape_string(s, size, 0);
2164}
2165
2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167{
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2171 }
2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode));
2174}
2175
2176/* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002179 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 const char *errors)
2181{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002183 Py_ssize_t startinpos;
2184 Py_ssize_t endinpos;
2185 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 const char *end;
2189 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002190 PyObject *errorHandler = NULL;
2191 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002192
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 /* Escaped strings will always be longer than the resulting
2194 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 length after conversion to the true value. (But decoding error
2196 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 v = _PyUnicode_New(size);
2198 if (v == NULL)
2199 goto onError;
2200 if (size == 0)
2201 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 end = s + size;
2204 while (s < end) {
2205 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002206 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002208 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
2210 /* Non-escape characters are interpreted as Unicode ordinals */
2211 if (*s != '\\') {
2212 *p++ = (unsigned char)*s++;
2213 continue;
2214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 /* \u-escapes are only interpreted iff the number of leading
2218 backslashes if odd */
2219 bs = s;
2220 for (;s < end;) {
2221 if (*s != '\\')
2222 break;
2223 *p++ = (unsigned char)*s++;
2224 }
2225 if (((s - bs) & 1) == 0 ||
2226 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 continue;
2229 }
2230 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 s++;
2233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002237 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 endinpos = s-starts;
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "rawunicodeescape", "truncated \\uXXXX",
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 }
2248 x = (x<<4) & ~0xF;
2249 if (c >= '0' && c <= '9')
2250 x += c - '0';
2251 else if (c >= 'a' && c <= 'f')
2252 x += 10 + c - 'a';
2253 else
2254 x += 10 + c - 'A';
2255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256#ifndef Py_UNICODE_WIDE
2257 if (x > 0x10000) {
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
2263 goto onError;
2264 }
2265#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 *p++ = x;
2267 nextByte:
2268 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002270 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 onError:
2277 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 Py_XDECREF(errorHandler);
2279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 return NULL;
2281}
2282
2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
2286 PyObject *repr;
2287 char *p;
2288 char *q;
2289
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002290 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002292#ifdef Py_UNICODE_WIDE
2293 repr = PyString_FromStringAndSize(NULL, 10 * size);
2294#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002296#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 if (repr == NULL)
2298 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002299 if (size == 0)
2300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 p = q = PyString_AS_STRING(repr);
2303 while (size-- > 0) {
2304 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 /* Map 32-bit characters to '\Uxxxxxxxx' */
2307 if (ch >= 0x10000) {
2308 *p++ = '\\';
2309 *p++ = 'U';
2310 *p++ = hexdigit[(ch >> 28) & 0xf];
2311 *p++ = hexdigit[(ch >> 24) & 0xf];
2312 *p++ = hexdigit[(ch >> 20) & 0xf];
2313 *p++ = hexdigit[(ch >> 16) & 0xf];
2314 *p++ = hexdigit[(ch >> 12) & 0xf];
2315 *p++ = hexdigit[(ch >> 8) & 0xf];
2316 *p++ = hexdigit[(ch >> 4) & 0xf];
2317 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002318 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002319 else
2320#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 /* Map 16-bit characters to '\uxxxx' */
2322 if (ch >= 256) {
2323 *p++ = '\\';
2324 *p++ = 'u';
2325 *p++ = hexdigit[(ch >> 12) & 0xf];
2326 *p++ = hexdigit[(ch >> 8) & 0xf];
2327 *p++ = hexdigit[(ch >> 4) & 0xf];
2328 *p++ = hexdigit[ch & 15];
2329 }
2330 /* Copy everything else as-is */
2331 else
2332 *p++ = (char) ch;
2333 }
2334 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002335 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 return repr;
2337}
2338
2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340{
2341 if (!PyUnicode_Check(unicode)) {
2342 PyErr_BadArgument();
2343 return NULL;
2344 }
2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346 PyUnicode_GET_SIZE(unicode));
2347}
2348
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002349/* --- Unicode Internal Codec ------------------------------------------- */
2350
2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 const char *errors)
2354{
2355 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002356 Py_ssize_t startinpos;
2357 Py_ssize_t endinpos;
2358 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002359 PyUnicodeObject *v;
2360 Py_UNICODE *p;
2361 const char *end;
2362 const char *reason;
2363 PyObject *errorHandler = NULL;
2364 PyObject *exc = NULL;
2365
Neal Norwitzd43069c2006-01-08 01:12:10 +00002366#ifdef Py_UNICODE_WIDE
2367 Py_UNICODE unimax = PyUnicode_GetMax();
2368#endif
2369
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371 if (v == NULL)
2372 goto onError;
2373 if (PyUnicode_GetSize((PyObject *)v) == 0)
2374 return (PyObject *)v;
2375 p = PyUnicode_AS_UNICODE(v);
2376 end = s + size;
2377
2378 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002379 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002380 /* We have to sanity check the raw data, otherwise doom looms for
2381 some malformed UCS-4 data. */
2382 if (
2383 #ifdef Py_UNICODE_WIDE
2384 *p > unimax || *p < 0 ||
2385 #endif
2386 end-s < Py_UNICODE_SIZE
2387 )
2388 {
2389 startinpos = s - starts;
2390 if (end-s < Py_UNICODE_SIZE) {
2391 endinpos = end-starts;
2392 reason = "truncated input";
2393 }
2394 else {
2395 endinpos = s - starts + Py_UNICODE_SIZE;
2396 reason = "illegal code point (> 0x10FFFF)";
2397 }
2398 outpos = p - PyUnicode_AS_UNICODE(v);
2399 if (unicode_decode_call_errorhandler(
2400 errors, &errorHandler,
2401 "unicode_internal", reason,
2402 starts, size, &startinpos, &endinpos, &exc, &s,
2403 (PyObject **)&v, &outpos, &p)) {
2404 goto onError;
2405 }
2406 }
2407 else {
2408 p++;
2409 s += Py_UNICODE_SIZE;
2410 }
2411 }
2412
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002413 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002414 goto onError;
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return (PyObject *)v;
2418
2419 onError:
2420 Py_XDECREF(v);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return NULL;
2424}
2425
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426/* --- Latin-1 Codec ------------------------------------------------------ */
2427
2428PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002429 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 const char *errors)
2431{
2432 PyUnicodeObject *v;
2433 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002436 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002437 Py_UNICODE r = *(unsigned char*)s;
2438 return PyUnicode_FromUnicode(&r, 1);
2439 }
2440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 v = _PyUnicode_New(size);
2442 if (v == NULL)
2443 goto onError;
2444 if (size == 0)
2445 return (PyObject *)v;
2446 p = PyUnicode_AS_UNICODE(v);
2447 while (size-- > 0)
2448 *p++ = (unsigned char)*s++;
2449 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 onError:
2452 Py_XDECREF(v);
2453 return NULL;
2454}
2455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002456/* create or adjust a UnicodeEncodeError */
2457static void make_encode_exception(PyObject **exceptionObject,
2458 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 const Py_UNICODE *unicode, Py_ssize_t size,
2460 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 if (*exceptionObject == NULL) {
2464 *exceptionObject = PyUnicodeEncodeError_Create(
2465 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
2467 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2469 goto onError;
2470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2471 goto onError;
2472 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2473 goto onError;
2474 return;
2475 onError:
2476 Py_DECREF(*exceptionObject);
2477 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
2479}
2480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481/* raises a UnicodeEncodeError */
2482static void raise_encode_exception(PyObject **exceptionObject,
2483 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 const Py_UNICODE *unicode, Py_ssize_t size,
2485 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *reason)
2487{
2488 make_encode_exception(exceptionObject,
2489 encoding, unicode, size, startpos, endpos, reason);
2490 if (*exceptionObject != NULL)
2491 PyCodec_StrictErrors(*exceptionObject);
2492}
2493
2494/* error handling callback helper:
2495 build arguments, call the callback and check the arguments,
2496 put the result into newpos and return the replacement string, which
2497 has to be freed by the caller */
2498static PyObject *unicode_encode_call_errorhandler(const char *errors,
2499 PyObject **errorHandler,
2500 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2502 Py_ssize_t startpos, Py_ssize_t endpos,
2503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002506
2507 PyObject *restuple;
2508 PyObject *resunicode;
2509
2510 if (*errorHandler == NULL) {
2511 *errorHandler = PyCodec_LookupError(errors);
2512 if (*errorHandler == NULL)
2513 return NULL;
2514 }
2515
2516 make_encode_exception(exceptionObject,
2517 encoding, unicode, size, startpos, endpos, reason);
2518 if (*exceptionObject == NULL)
2519 return NULL;
2520
2521 restuple = PyObject_CallFunctionObjArgs(
2522 *errorHandler, *exceptionObject, NULL);
2523 if (restuple == NULL)
2524 return NULL;
2525 if (!PyTuple_Check(restuple)) {
2526 PyErr_Format(PyExc_TypeError, &argparse[4]);
2527 Py_DECREF(restuple);
2528 return NULL;
2529 }
2530 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2531 &resunicode, newpos)) {
2532 Py_DECREF(restuple);
2533 return NULL;
2534 }
2535 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002536 *newpos = size+*newpos;
2537 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002538 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002539 Py_DECREF(restuple);
2540 return NULL;
2541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 Py_INCREF(resunicode);
2543 Py_DECREF(restuple);
2544 return resunicode;
2545}
2546
2547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *errors,
2550 int limit)
2551{
2552 /* output object */
2553 PyObject *res;
2554 /* pointers to the beginning and end+1 of input */
2555 const Py_UNICODE *startp = p;
2556 const Py_UNICODE *endp = p + size;
2557 /* pointer to the beginning of the unencodable characters */
2558 /* const Py_UNICODE *badp = NULL; */
2559 /* pointer into the output */
2560 char *str;
2561 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002562 Py_ssize_t respos = 0;
2563 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002564 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2565 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 PyObject *errorHandler = NULL;
2567 PyObject *exc = NULL;
2568 /* the following variable is used for caching string comparisons
2569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570 int known_errorHandler = -1;
2571
2572 /* allocate enough for a simple encoding without
2573 replacements, if we need more, we'll resize */
2574 res = PyString_FromStringAndSize(NULL, size);
2575 if (res == NULL)
2576 goto onError;
2577 if (size == 0)
2578 return res;
2579 str = PyString_AS_STRING(res);
2580 ressize = size;
2581
2582 while (p<endp) {
2583 Py_UNICODE c = *p;
2584
2585 /* can we encode this? */
2586 if (c<limit) {
2587 /* no overflow check, because we know that the space is enough */
2588 *str++ = (char)c;
2589 ++p;
2590 }
2591 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t unicodepos = p-startp;
2593 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002595 Py_ssize_t repsize;
2596 Py_ssize_t newpos;
2597 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002598 Py_UNICODE *uni2;
2599 /* startpos for collecting unencodable chars */
2600 const Py_UNICODE *collstart = p;
2601 const Py_UNICODE *collend = p;
2602 /* find all unecodable characters */
2603 while ((collend < endp) && ((*collend)>=limit))
2604 ++collend;
2605 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606 if (known_errorHandler==-1) {
2607 if ((errors==NULL) || (!strcmp(errors, "strict")))
2608 known_errorHandler = 1;
2609 else if (!strcmp(errors, "replace"))
2610 known_errorHandler = 2;
2611 else if (!strcmp(errors, "ignore"))
2612 known_errorHandler = 3;
2613 else if (!strcmp(errors, "xmlcharrefreplace"))
2614 known_errorHandler = 4;
2615 else
2616 known_errorHandler = 0;
2617 }
2618 switch (known_errorHandler) {
2619 case 1: /* strict */
2620 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2621 goto onError;
2622 case 2: /* replace */
2623 while (collstart++<collend)
2624 *str++ = '?'; /* fall through */
2625 case 3: /* ignore */
2626 p = collend;
2627 break;
2628 case 4: /* xmlcharrefreplace */
2629 respos = str-PyString_AS_STRING(res);
2630 /* determine replacement size (temporarily (mis)uses p) */
2631 for (p = collstart, repsize = 0; p < collend; ++p) {
2632 if (*p<10)
2633 repsize += 2+1+1;
2634 else if (*p<100)
2635 repsize += 2+2+1;
2636 else if (*p<1000)
2637 repsize += 2+3+1;
2638 else if (*p<10000)
2639 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002640#ifndef Py_UNICODE_WIDE
2641 else
2642 repsize += 2+5+1;
2643#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 else if (*p<100000)
2645 repsize += 2+5+1;
2646 else if (*p<1000000)
2647 repsize += 2+6+1;
2648 else
2649 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 }
2652 requiredsize = respos+repsize+(endp-collend);
2653 if (requiredsize > ressize) {
2654 if (requiredsize<2*ressize)
2655 requiredsize = 2*ressize;
2656 if (_PyString_Resize(&res, requiredsize))
2657 goto onError;
2658 str = PyString_AS_STRING(res) + respos;
2659 ressize = requiredsize;
2660 }
2661 /* generate replacement (temporarily (mis)uses p) */
2662 for (p = collstart; p < collend; ++p) {
2663 str += sprintf(str, "&#%d;", (int)*p);
2664 }
2665 p = collend;
2666 break;
2667 default:
2668 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2669 encoding, reason, startp, size, &exc,
2670 collstart-startp, collend-startp, &newpos);
2671 if (repunicode == NULL)
2672 goto onError;
2673 /* need more space? (at least enough for what we
2674 have+the replacement+the rest of the string, so
2675 we won't have to check space for encodable characters) */
2676 respos = str-PyString_AS_STRING(res);
2677 repsize = PyUnicode_GET_SIZE(repunicode);
2678 requiredsize = respos+repsize+(endp-collend);
2679 if (requiredsize > ressize) {
2680 if (requiredsize<2*ressize)
2681 requiredsize = 2*ressize;
2682 if (_PyString_Resize(&res, requiredsize)) {
2683 Py_DECREF(repunicode);
2684 goto onError;
2685 }
2686 str = PyString_AS_STRING(res) + respos;
2687 ressize = requiredsize;
2688 }
2689 /* check if there is anything unencodable in the replacement
2690 and copy it to the output */
2691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2692 c = *uni2;
2693 if (c >= limit) {
2694 raise_encode_exception(&exc, encoding, startp, size,
2695 unicodepos, unicodepos+1, reason);
2696 Py_DECREF(repunicode);
2697 goto onError;
2698 }
2699 *str = (char)c;
2700 }
2701 p = startp + newpos;
2702 Py_DECREF(repunicode);
2703 }
2704 }
2705 }
2706 /* Resize if we allocated to much */
2707 respos = str-PyString_AS_STRING(res);
2708 if (respos<ressize)
2709 /* If this falls res will be NULL */
2710 _PyString_Resize(&res, respos);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return res;
2714
2715 onError:
2716 Py_XDECREF(res);
2717 Py_XDECREF(errorHandler);
2718 Py_XDECREF(exc);
2719 return NULL;
2720}
2721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 const char *errors)
2725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2730{
2731 if (!PyUnicode_Check(unicode)) {
2732 PyErr_BadArgument();
2733 return NULL;
2734 }
2735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2736 PyUnicode_GET_SIZE(unicode),
2737 NULL);
2738}
2739
2740/* --- 7-bit ASCII Codec -------------------------------------------------- */
2741
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 const char *errors)
2745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 PyUnicodeObject *v;
2748 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002749 Py_ssize_t startinpos;
2750 Py_ssize_t endinpos;
2751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 const char *e;
2753 PyObject *errorHandler = NULL;
2754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002757 if (size == 1 && *(unsigned char*)s < 128) {
2758 Py_UNICODE r = *(unsigned char*)s;
2759 return PyUnicode_FromUnicode(&r, 1);
2760 }
Tim Petersced69f82003-09-16 20:30:58 +00002761
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 v = _PyUnicode_New(size);
2763 if (v == NULL)
2764 goto onError;
2765 if (size == 0)
2766 return (PyObject *)v;
2767 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 e = s + size;
2769 while (s < e) {
2770 register unsigned char c = (unsigned char)*s;
2771 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 ++s;
2774 }
2775 else {
2776 startinpos = s-starts;
2777 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002778 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 if (unicode_decode_call_errorhandler(
2780 errors, &errorHandler,
2781 "ascii", "ordinal not in range(128)",
2782 starts, size, &startinpos, &endinpos, &exc, &s,
2783 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002787 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002788 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 onError:
2795 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 return NULL;
2799}
2800
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002802 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 const char *errors)
2804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806}
2807
2808PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2809{
2810 if (!PyUnicode_Check(unicode)) {
2811 PyErr_BadArgument();
2812 return NULL;
2813 }
2814 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2815 PyUnicode_GET_SIZE(unicode),
2816 NULL);
2817}
2818
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002821/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002822
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002823#if SIZEOF_INT < SIZEOF_SSIZE_T
2824#define NEED_RETRY
2825#endif
2826
2827/* XXX This code is limited to "true" double-byte encodings, as
2828 a) it assumes an incomplete character consists of a single byte, and
2829 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2830 encodings, see IsDBCSLeadByteEx documentation. */
2831
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.  Besides IsDBCSLeadByte() on the byte itself, the character
   preceding it (located with CharPrev()) is inspected: the byte counts
   as a lead byte when there is no preceding character, when the
   preceding character does not start with a lead byte, or when the
   preceding character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
2842
2843/*
2844 * Decode MBCS string into unicode object. If 'final' is set, converts
2845 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2846 */
2847static int decode_mbcs(PyUnicodeObject **v,
2848 const char *s, /* MBCS string */
2849 int size, /* sizeof MBCS string */
2850 int final)
2851{
2852 Py_UNICODE *p;
2853 Py_ssize_t n = 0;
2854 int usize = 0;
2855
2856 assert(size >= 0);
2857
2858 /* Skip trailing lead-byte unless 'final' is set */
2859 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2860 --size;
2861
2862 /* First get the size of the result */
2863 if (size > 0) {
2864 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2865 if (usize == 0) {
2866 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2867 return -1;
2868 }
2869 }
2870
2871 if (*v == NULL) {
2872 /* Create unicode object */
2873 *v = _PyUnicode_New(usize);
2874 if (*v == NULL)
2875 return -1;
2876 }
2877 else {
2878 /* Extend unicode object */
2879 n = PyUnicode_GET_SIZE(*v);
2880 if (_PyUnicode_Resize(v, n + usize) < 0)
2881 return -1;
2882 }
2883
2884 /* Do the conversion */
2885 if (size > 0) {
2886 p = PyUnicode_AS_UNICODE(*v) + n;
2887 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2888 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2889 return -1;
2890 }
2891 }
2892
2893 return size;
2894}
2895
2896PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2897 Py_ssize_t size,
2898 const char *errors,
2899 Py_ssize_t *consumed)
2900{
2901 PyUnicodeObject *v = NULL;
2902 int done;
2903
2904 if (consumed)
2905 *consumed = 0;
2906
2907#ifdef NEED_RETRY
2908 retry:
2909 if (size > INT_MAX)
2910 done = decode_mbcs(&v, s, INT_MAX, 0);
2911 else
2912#endif
2913 done = decode_mbcs(&v, s, (int)size, !consumed);
2914
2915 if (done < 0) {
2916 Py_XDECREF(v);
2917 return NULL;
2918 }
2919
2920 if (consumed)
2921 *consumed += done;
2922
2923#ifdef NEED_RETRY
2924 if (size > INT_MAX) {
2925 s += done;
2926 size -= done;
2927 goto retry;
2928 }
2929#endif
2930
2931 return (PyObject *)v;
2932}
2933
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002934PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002935 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002936 const char *errors)
2937{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002938 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2939}
2940
2941/*
2942 * Convert unicode into string object (MBCS).
2943 * Returns 0 if succeed, -1 otherwise.
2944 */
2945static int encode_mbcs(PyObject **repr,
2946 const Py_UNICODE *p, /* unicode */
2947 int size) /* size of unicode */
2948{
2949 int mbcssize = 0;
2950 Py_ssize_t n = 0;
2951
2952 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002953
2954 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002955 if (size > 0) {
2956 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2957 if (mbcssize == 0) {
2958 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2959 return -1;
2960 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002961 }
2962
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002963 if (*repr == NULL) {
2964 /* Create string object */
2965 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2966 if (*repr == NULL)
2967 return -1;
2968 }
2969 else {
2970 /* Extend string object */
2971 n = PyString_Size(*repr);
2972 if (_PyString_Resize(repr, n + mbcssize) < 0)
2973 return -1;
2974 }
2975
2976 /* Do the conversion */
2977 if (size > 0) {
2978 char *s = PyString_AS_STRING(*repr) + n;
2979 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2980 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2981 return -1;
2982 }
2983 }
2984
2985 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002986}
2987
2988PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002989 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002990 const char *errors)
2991{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002992 PyObject *repr = NULL;
2993 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002994
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002995#ifdef NEED_RETRY
2996 retry:
2997 if (size > INT_MAX)
2998 ret = encode_mbcs(&repr, p, INT_MAX);
2999 else
3000#endif
3001 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003002
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003003 if (ret < 0) {
3004 Py_XDECREF(repr);
3005 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003006 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003007
3008#ifdef NEED_RETRY
3009 if (size > INT_MAX) {
3010 p += INT_MAX;
3011 size -= INT_MAX;
3012 goto retry;
3013 }
3014#endif
3015
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003016 return repr;
3017}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003018
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003019PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3020{
3021 if (!PyUnicode_Check(unicode)) {
3022 PyErr_BadArgument();
3023 return NULL;
3024 }
3025 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3026 PyUnicode_GET_SIZE(unicode),
3027 NULL);
3028}
3029
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003030#undef NEED_RETRY
3031
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003032#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003033
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034/* --- Character Mapping Codec -------------------------------------------- */
3035
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003037 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 PyObject *mapping,
3039 const char *errors)
3040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003042 Py_ssize_t startinpos;
3043 Py_ssize_t endinpos;
3044 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 PyUnicodeObject *v;
3047 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003048 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 PyObject *errorHandler = NULL;
3050 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003051 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003052 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 /* Default to Latin-1 */
3055 if (mapping == NULL)
3056 return PyUnicode_DecodeLatin1(s, size, errors);
3057
3058 v = _PyUnicode_New(size);
3059 if (v == NULL)
3060 goto onError;
3061 if (size == 0)
3062 return (PyObject *)v;
3063 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003065 if (PyUnicode_CheckExact(mapping)) {
3066 mapstring = PyUnicode_AS_UNICODE(mapping);
3067 maplen = PyUnicode_GET_SIZE(mapping);
3068 while (s < e) {
3069 unsigned char ch = *s;
3070 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003072 if (ch < maplen)
3073 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003075 if (x == 0xfffe) {
3076 /* undefined mapping */
3077 outpos = p-PyUnicode_AS_UNICODE(v);
3078 startinpos = s-starts;
3079 endinpos = startinpos+1;
3080 if (unicode_decode_call_errorhandler(
3081 errors, &errorHandler,
3082 "charmap", "character maps to <undefined>",
3083 starts, size, &startinpos, &endinpos, &exc, &s,
3084 (PyObject **)&v, &outpos, &p)) {
3085 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003086 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003087 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003088 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003089 *p++ = x;
3090 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003092 }
3093 else {
3094 while (s < e) {
3095 unsigned char ch = *s;
3096 PyObject *w, *x;
3097
3098 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3099 w = PyInt_FromLong((long)ch);
3100 if (w == NULL)
3101 goto onError;
3102 x = PyObject_GetItem(mapping, w);
3103 Py_DECREF(w);
3104 if (x == NULL) {
3105 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3106 /* No mapping found means: mapping is undefined. */
3107 PyErr_Clear();
3108 x = Py_None;
3109 Py_INCREF(x);
3110 } else
3111 goto onError;
3112 }
3113
3114 /* Apply mapping */
3115 if (PyInt_Check(x)) {
3116 long value = PyInt_AS_LONG(x);
3117 if (value < 0 || value > 65535) {
3118 PyErr_SetString(PyExc_TypeError,
3119 "character mapping must be in range(65536)");
3120 Py_DECREF(x);
3121 goto onError;
3122 }
3123 *p++ = (Py_UNICODE)value;
3124 }
3125 else if (x == Py_None) {
3126 /* undefined mapping */
3127 outpos = p-PyUnicode_AS_UNICODE(v);
3128 startinpos = s-starts;
3129 endinpos = startinpos+1;
3130 if (unicode_decode_call_errorhandler(
3131 errors, &errorHandler,
3132 "charmap", "character maps to <undefined>",
3133 starts, size, &startinpos, &endinpos, &exc, &s,
3134 (PyObject **)&v, &outpos, &p)) {
3135 Py_DECREF(x);
3136 goto onError;
3137 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003138 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003139 continue;
3140 }
3141 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003142 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003143
3144 if (targetsize == 1)
3145 /* 1-1 mapping */
3146 *p++ = *PyUnicode_AS_UNICODE(x);
3147
3148 else if (targetsize > 1) {
3149 /* 1-n mapping */
3150 if (targetsize > extrachars) {
3151 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003152 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3153 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003154 (targetsize << 2);
3155 extrachars += needed;
3156 if (_PyUnicode_Resize(&v,
3157 PyUnicode_GET_SIZE(v) + needed) < 0) {
3158 Py_DECREF(x);
3159 goto onError;
3160 }
3161 p = PyUnicode_AS_UNICODE(v) + oldpos;
3162 }
3163 Py_UNICODE_COPY(p,
3164 PyUnicode_AS_UNICODE(x),
3165 targetsize);
3166 p += targetsize;
3167 extrachars -= targetsize;
3168 }
3169 /* 1-0 mapping: skip the character */
3170 }
3171 else {
3172 /* wrong return value */
3173 PyErr_SetString(PyExc_TypeError,
3174 "character mapping must return integer, None or unicode");
3175 Py_DECREF(x);
3176 goto onError;
3177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003179 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181 }
3182 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003183 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 Py_XDECREF(errorHandler);
3186 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003188
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 Py_XDECREF(errorHandler);
3191 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 Py_XDECREF(v);
3193 return NULL;
3194}
3195
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003196/* Charmap encoding: the lookup table */
3197
3198struct encoding_map{
3199 PyObject_HEAD
3200 unsigned char level1[32];
3201 int count2, count3;
3202 unsigned char level23[1];
3203};
3204
3205static PyObject*
3206encoding_map_size(PyObject *obj, PyObject* args)
3207{
3208 struct encoding_map *map = (struct encoding_map*)obj;
3209 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3210 128*map->count3);
3211}
3212
3213static PyMethodDef encoding_map_methods[] = {
3214 {"size", encoding_map_size, METH_NOARGS,
3215 PyDoc_STR("Return the size (in bytes) of this object") },
3216 { 0 }
3217};
3218
3219static void
3220encoding_map_dealloc(PyObject* o)
3221{
3222 PyObject_FREE(o);
3223}
3224
3225static PyTypeObject EncodingMapType = {
3226 PyObject_HEAD_INIT(NULL)
3227 0, /*ob_size*/
3228 "EncodingMap", /*tp_name*/
3229 sizeof(struct encoding_map), /*tp_basicsize*/
3230 0, /*tp_itemsize*/
3231 /* methods */
3232 encoding_map_dealloc, /*tp_dealloc*/
3233 0, /*tp_print*/
3234 0, /*tp_getattr*/
3235 0, /*tp_setattr*/
3236 0, /*tp_compare*/
3237 0, /*tp_repr*/
3238 0, /*tp_as_number*/
3239 0, /*tp_as_sequence*/
3240 0, /*tp_as_mapping*/
3241 0, /*tp_hash*/
3242 0, /*tp_call*/
3243 0, /*tp_str*/
3244 0, /*tp_getattro*/
3245 0, /*tp_setattro*/
3246 0, /*tp_as_buffer*/
3247 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3248 0, /*tp_doc*/
3249 0, /*tp_traverse*/
3250 0, /*tp_clear*/
3251 0, /*tp_richcompare*/
3252 0, /*tp_weaklistoffset*/
3253 0, /*tp_iter*/
3254 0, /*tp_iternext*/
3255 encoding_map_methods, /*tp_methods*/
3256 0, /*tp_members*/
3257 0, /*tp_getset*/
3258 0, /*tp_base*/
3259 0, /*tp_dict*/
3260 0, /*tp_descr_get*/
3261 0, /*tp_descr_set*/
3262 0, /*tp_dictoffset*/
3263 0, /*tp_init*/
3264 0, /*tp_alloc*/
3265 0, /*tp_new*/
3266 0, /*tp_free*/
3267 0, /*tp_is_gc*/
3268};
3269
3270PyObject*
3271PyUnicode_BuildEncodingMap(PyObject* string)
3272{
3273 Py_UNICODE *decode;
3274 PyObject *result;
3275 struct encoding_map *mresult;
3276 int i;
3277 int need_dict = 0;
3278 unsigned char level1[32];
3279 unsigned char level2[512];
3280 unsigned char *mlevel1, *mlevel2, *mlevel3;
3281 int count2 = 0, count3 = 0;
3282
3283 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3284 PyErr_BadArgument();
3285 return NULL;
3286 }
3287 decode = PyUnicode_AS_UNICODE(string);
3288 memset(level1, 0xFF, sizeof level1);
3289 memset(level2, 0xFF, sizeof level2);
3290
3291 /* If there isn't a one-to-one mapping of NULL to \0,
3292 or if there are non-BMP characters, we need to use
3293 a mapping dictionary. */
3294 if (decode[0] != 0)
3295 need_dict = 1;
3296 for (i = 1; i < 256; i++) {
3297 int l1, l2;
3298 if (decode[i] == 0
3299 #ifdef Py_UNICODE_WIDE
3300 || decode[i] > 0xFFFF
3301 #endif
3302 ) {
3303 need_dict = 1;
3304 break;
3305 }
3306 if (decode[i] == 0xFFFE)
3307 /* unmapped character */
3308 continue;
3309 l1 = decode[i] >> 11;
3310 l2 = decode[i] >> 7;
3311 if (level1[l1] == 0xFF)
3312 level1[l1] = count2++;
3313 if (level2[l2] == 0xFF)
3314 level2[l2] = count3++;
3315 }
3316
3317 if (count2 >= 0xFF || count3 >= 0xFF)
3318 need_dict = 1;
3319
3320 if (need_dict) {
3321 PyObject *result = PyDict_New();
3322 PyObject *key, *value;
3323 if (!result)
3324 return NULL;
3325 for (i = 0; i < 256; i++) {
3326 key = value = NULL;
3327 key = PyInt_FromLong(decode[i]);
3328 value = PyInt_FromLong(i);
3329 if (!key || !value)
3330 goto failed1;
3331 if (PyDict_SetItem(result, key, value) == -1)
3332 goto failed1;
3333 Py_DECREF(key);
3334 Py_DECREF(value);
3335 }
3336 return result;
3337 failed1:
3338 Py_XDECREF(key);
3339 Py_XDECREF(value);
3340 Py_DECREF(result);
3341 return NULL;
3342 }
3343
3344 /* Create a three-level trie */
3345 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3346 16*count2 + 128*count3 - 1);
3347 if (!result)
3348 return PyErr_NoMemory();
3349 PyObject_Init(result, &EncodingMapType);
3350 mresult = (struct encoding_map*)result;
3351 mresult->count2 = count2;
3352 mresult->count3 = count3;
3353 mlevel1 = mresult->level1;
3354 mlevel2 = mresult->level23;
3355 mlevel3 = mresult->level23 + 16*count2;
3356 memcpy(mlevel1, level1, 32);
3357 memset(mlevel2, 0xFF, 16*count2);
3358 memset(mlevel3, 0, 128*count3);
3359 count3 = 0;
3360 for (i = 1; i < 256; i++) {
3361 int o1, o2, o3, i2, i3;
3362 if (decode[i] == 0xFFFE)
3363 /* unmapped character */
3364 continue;
3365 o1 = decode[i]>>11;
3366 o2 = (decode[i]>>7) & 0xF;
3367 i2 = 16*mlevel1[o1] + o2;
3368 if (mlevel2[i2] == 0xFF)
3369 mlevel2[i2] = count3++;
3370 o3 = decode[i] & 0x7F;
3371 i3 = 128*mlevel2[i2] + o3;
3372 mlevel3[i3] = i;
3373 }
3374 return result;
3375}
3376
3377static int
3378encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3379{
3380 struct encoding_map *map = (struct encoding_map*)mapping;
3381 int l1 = c>>11;
3382 int l2 = (c>>7) & 0xF;
3383 int l3 = c & 0x7F;
3384 int i;
3385
3386#ifdef Py_UNICODE_WIDE
3387 if (c > 0xFFFF) {
3388 return -1;
3389 }
3390#endif
3391 if (c == 0)
3392 return 0;
3393 /* level 1*/
3394 i = map->level1[l1];
3395 if (i == 0xFF) {
3396 return -1;
3397 }
3398 /* level 2*/
3399 i = map->level23[16*i+l2];
3400 if (i == 0xFF) {
3401 return -1;
3402 }
3403 /* level 3 */
3404 i = map->level23[16*map->count2 + 128*i + l3];
3405 if (i == 0) {
3406 return -1;
3407 }
3408 return i;
3409}
3410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411/* Lookup the character ch in the mapping. If the character
3412 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003413 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 PyObject *w = PyInt_FromLong((long)c);
3417 PyObject *x;
3418
3419 if (w == NULL)
3420 return NULL;
3421 x = PyObject_GetItem(mapping, w);
3422 Py_DECREF(w);
3423 if (x == NULL) {
3424 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3425 /* No mapping found means: mapping is undefined. */
3426 PyErr_Clear();
3427 x = Py_None;
3428 Py_INCREF(x);
3429 return x;
3430 } else
3431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003433 else if (x == Py_None)
3434 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 else if (PyInt_Check(x)) {
3436 long value = PyInt_AS_LONG(x);
3437 if (value < 0 || value > 255) {
3438 PyErr_SetString(PyExc_TypeError,
3439 "character mapping must be in range(256)");
3440 Py_DECREF(x);
3441 return NULL;
3442 }
3443 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 else if (PyString_Check(x))
3446 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 /* wrong return value */
3449 PyErr_SetString(PyExc_TypeError,
3450 "character mapping must return integer, None or str");
3451 Py_DECREF(x);
3452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
3454}
3455
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003456static int
3457charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3458{
3459 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3460 /* exponentially overallocate to minimize reallocations */
3461 if (requiredsize < 2*outsize)
3462 requiredsize = 2*outsize;
3463 if (_PyString_Resize(outobj, requiredsize)) {
3464 return 0;
3465 }
3466 return 1;
3467}
3468
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472/* lookup the character, put the result in the output string and adjust
3473 various state variables. Reallocate the output string if not enough
3474 space is available. Return a new reference to the object that
3475 was put in the output buffer, or Py_None, if the mapping was undefined
3476 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003477 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003479charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003480 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003482 PyObject *rep;
3483 char *outstart;
3484 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003486 if (mapping->ob_type == &EncodingMapType) {
3487 int res = encoding_map_lookup(c, mapping);
3488 Py_ssize_t requiredsize = *outpos+1;
3489 if (res == -1)
3490 return enc_FAILED;
3491 if (outsize<requiredsize)
3492 if (!charmapencode_resize(outobj, outpos, requiredsize))
3493 return enc_EXCEPTION;
3494 outstart = PyString_AS_STRING(*outobj);
3495 outstart[(*outpos)++] = (char)res;
3496 return enc_SUCCESS;
3497 }
3498
3499 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003501 return enc_EXCEPTION;
3502 else if (rep==Py_None) {
3503 Py_DECREF(rep);
3504 return enc_FAILED;
3505 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003507 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003508 if (outsize<requiredsize)
3509 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003511 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003513 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3515 }
3516 else {
3517 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3519 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003520 if (outsize<requiredsize)
3521 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003523 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003525 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 memcpy(outstart + *outpos, repchars, repsize);
3527 *outpos += repsize;
3528 }
3529 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003530 Py_DECREF(rep);
3531 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532}
3533
3534/* handle an error in PyUnicode_EncodeCharmap
3535 Return 0 on success, -1 on error */
3536static
3537int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003538 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003540 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003541 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542{
3543 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003544 Py_ssize_t repsize;
3545 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 Py_UNICODE *uni2;
3547 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003548 Py_ssize_t collstartpos = *inpos;
3549 Py_ssize_t collendpos = *inpos+1;
3550 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 char *encoding = "charmap";
3552 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003553 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 /* find all unencodable characters */
3556 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003557 PyObject *rep;
3558 if (mapping->ob_type == &EncodingMapType) {
3559 int res = encoding_map_lookup(p[collendpos], mapping);
3560 if (res != -1)
3561 break;
3562 ++collendpos;
3563 continue;
3564 }
3565
3566 rep = charmapencode_lookup(p[collendpos], mapping);
3567 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003569 else if (rep!=Py_None) {
3570 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 break;
3572 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003573 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 ++collendpos;
3575 }
3576 /* cache callback name lookup
3577 * (if not done yet, i.e. it's the first error) */
3578 if (*known_errorHandler==-1) {
3579 if ((errors==NULL) || (!strcmp(errors, "strict")))
3580 *known_errorHandler = 1;
3581 else if (!strcmp(errors, "replace"))
3582 *known_errorHandler = 2;
3583 else if (!strcmp(errors, "ignore"))
3584 *known_errorHandler = 3;
3585 else if (!strcmp(errors, "xmlcharrefreplace"))
3586 *known_errorHandler = 4;
3587 else
3588 *known_errorHandler = 0;
3589 }
3590 switch (*known_errorHandler) {
3591 case 1: /* strict */
3592 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3593 return -1;
3594 case 2: /* replace */
3595 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3596 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003597 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 return -1;
3599 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003600 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3602 return -1;
3603 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 }
3605 /* fall through */
3606 case 3: /* ignore */
3607 *inpos = collendpos;
3608 break;
3609 case 4: /* xmlcharrefreplace */
3610 /* generate replacement (temporarily (mis)uses p) */
3611 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3612 char buffer[2+29+1+1];
3613 char *cp;
3614 sprintf(buffer, "&#%d;", (int)p[collpos]);
3615 for (cp = buffer; *cp; ++cp) {
3616 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003617 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003619 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621 return -1;
3622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 }
3624 }
3625 *inpos = collendpos;
3626 break;
3627 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003628 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 encoding, reason, p, size, exceptionObject,
3630 collstartpos, collendpos, &newpos);
3631 if (repunicode == NULL)
3632 return -1;
3633 /* generate replacement */
3634 repsize = PyUnicode_GET_SIZE(repunicode);
3635 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3636 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003637 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 return -1;
3639 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003640 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3643 return -1;
3644 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 }
3646 *inpos = newpos;
3647 Py_DECREF(repunicode);
3648 }
3649 return 0;
3650}
3651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003653 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 PyObject *mapping,
3655 const char *errors)
3656{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 /* output object */
3658 PyObject *res = NULL;
3659 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003660 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003662 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 PyObject *errorHandler = NULL;
3664 PyObject *exc = NULL;
3665 /* the following variable is used for caching string comparisons
3666 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3667 * 3=ignore, 4=xmlcharrefreplace */
3668 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669
3670 /* Default to Latin-1 */
3671 if (mapping == NULL)
3672 return PyUnicode_EncodeLatin1(p, size, errors);
3673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 /* allocate enough for a simple encoding without
3675 replacements, if we need more, we'll resize */
3676 res = PyString_FromStringAndSize(NULL, size);
3677 if (res == NULL)
3678 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003679 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 while (inpos<size) {
3683 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003684 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3685 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003687 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 if (charmap_encoding_error(p, size, &inpos, mapping,
3689 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003690 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003691 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003692 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003693 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 else
3696 /* done with this character => adjust input position */
3697 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 /* Resize if we allocated to much */
3701 if (respos<PyString_GET_SIZE(res)) {
3702 if (_PyString_Resize(&res, respos))
3703 goto onError;
3704 }
3705 Py_XDECREF(exc);
3706 Py_XDECREF(errorHandler);
3707 return res;
3708
3709 onError:
3710 Py_XDECREF(res);
3711 Py_XDECREF(exc);
3712 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 return NULL;
3714}
3715
3716PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3717 PyObject *mapping)
3718{
3719 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3720 PyErr_BadArgument();
3721 return NULL;
3722 }
3723 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3724 PyUnicode_GET_SIZE(unicode),
3725 mapping,
3726 NULL);
3727}
3728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729/* create or adjust a UnicodeTranslateError */
3730static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003731 const Py_UNICODE *unicode, Py_ssize_t size,
3732 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 if (*exceptionObject == NULL) {
3736 *exceptionObject = PyUnicodeTranslateError_Create(
3737 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
3739 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3741 goto onError;
3742 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3743 goto onError;
3744 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3745 goto onError;
3746 return;
3747 onError:
3748 Py_DECREF(*exceptionObject);
3749 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 }
3751}
3752
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753/* raises a UnicodeTranslateError */
3754static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003755 const Py_UNICODE *unicode, Py_ssize_t size,
3756 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 const char *reason)
3758{
3759 make_translate_exception(exceptionObject,
3760 unicode, size, startpos, endpos, reason);
3761 if (*exceptionObject != NULL)
3762 PyCodec_StrictErrors(*exceptionObject);
3763}
3764
3765/* error handling callback helper:
3766 build arguments, call the callback and check the arguments,
3767 put the result into newpos and return the replacement string, which
3768 has to be freed by the caller */
3769static PyObject *unicode_translate_call_errorhandler(const char *errors,
3770 PyObject **errorHandler,
3771 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003772 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3773 Py_ssize_t startpos, Py_ssize_t endpos,
3774 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003776 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003778 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 PyObject *restuple;
3780 PyObject *resunicode;
3781
3782 if (*errorHandler == NULL) {
3783 *errorHandler = PyCodec_LookupError(errors);
3784 if (*errorHandler == NULL)
3785 return NULL;
3786 }
3787
3788 make_translate_exception(exceptionObject,
3789 unicode, size, startpos, endpos, reason);
3790 if (*exceptionObject == NULL)
3791 return NULL;
3792
3793 restuple = PyObject_CallFunctionObjArgs(
3794 *errorHandler, *exceptionObject, NULL);
3795 if (restuple == NULL)
3796 return NULL;
3797 if (!PyTuple_Check(restuple)) {
3798 PyErr_Format(PyExc_TypeError, &argparse[4]);
3799 Py_DECREF(restuple);
3800 return NULL;
3801 }
3802 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003803 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 Py_DECREF(restuple);
3805 return NULL;
3806 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003807 if (i_newpos<0)
3808 *newpos = size+i_newpos;
3809 else
3810 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003811 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003812 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003813 Py_DECREF(restuple);
3814 return NULL;
3815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 Py_INCREF(resunicode);
3817 Py_DECREF(restuple);
3818 return resunicode;
3819}
3820
3821/* Lookup the character ch in the mapping and put the result in result,
3822 which must be decrefed by the caller.
3823 Return 0 on success, -1 on error */
3824static
3825int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3826{
3827 PyObject *w = PyInt_FromLong((long)c);
3828 PyObject *x;
3829
3830 if (w == NULL)
3831 return -1;
3832 x = PyObject_GetItem(mapping, w);
3833 Py_DECREF(w);
3834 if (x == NULL) {
3835 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3836 /* No mapping found means: use 1:1 mapping. */
3837 PyErr_Clear();
3838 *result = NULL;
3839 return 0;
3840 } else
3841 return -1;
3842 }
3843 else if (x == Py_None) {
3844 *result = x;
3845 return 0;
3846 }
3847 else if (PyInt_Check(x)) {
3848 long value = PyInt_AS_LONG(x);
3849 long max = PyUnicode_GetMax();
3850 if (value < 0 || value > max) {
3851 PyErr_Format(PyExc_TypeError,
3852 "character mapping must be in range(0x%lx)", max+1);
3853 Py_DECREF(x);
3854 return -1;
3855 }
3856 *result = x;
3857 return 0;
3858 }
3859 else if (PyUnicode_Check(x)) {
3860 *result = x;
3861 return 0;
3862 }
3863 else {
3864 /* wrong return value */
3865 PyErr_SetString(PyExc_TypeError,
3866 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003867 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 return -1;
3869 }
3870}
3871/* ensure that *outobj is at least requiredsize characters long,
3872if not reallocate and adjust various state variables.
3873Return 0 on success, -1 on error */
3874static
Walter Dörwald4894c302003-10-24 14:25:28 +00003875int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003876 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003879 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003881 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003883 if (requiredsize < 2 * oldsize)
3884 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003885 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 return -1;
3887 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 }
3889 return 0;
3890}
3891/* lookup the character, put the result in the output string and adjust
3892 various state variables. Return a new reference to the object that
3893 was put in the output buffer in *result, or Py_None, if the mapping was
3894 undefined (in which case no character was written).
3895 The called must decref result.
3896 Return 0 on success, -1 on error. */
3897static
Walter Dörwald4894c302003-10-24 14:25:28 +00003898int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003899 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003900 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901{
Walter Dörwald4894c302003-10-24 14:25:28 +00003902 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 return -1;
3904 if (*res==NULL) {
3905 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003906 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 }
3908 else if (*res==Py_None)
3909 ;
3910 else if (PyInt_Check(*res)) {
3911 /* no overflow check, because we know that the space is enough */
3912 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3913 }
3914 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003915 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 if (repsize==1) {
3917 /* no overflow check, because we know that the space is enough */
3918 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3919 }
3920 else if (repsize!=0) {
3921 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003923 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003924 repsize - 1;
3925 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 return -1;
3927 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3928 *outp += repsize;
3929 }
3930 }
3931 else
3932 return -1;
3933 return 0;
3934}
3935
3936PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003937 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 PyObject *mapping,
3939 const char *errors)
3940{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941 /* output object */
3942 PyObject *res = NULL;
3943 /* pointers to the beginning and end+1 of input */
3944 const Py_UNICODE *startp = p;
3945 const Py_UNICODE *endp = p + size;
3946 /* pointer into the output */
3947 Py_UNICODE *str;
3948 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003949 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 char *reason = "character maps to <undefined>";
3951 PyObject *errorHandler = NULL;
3952 PyObject *exc = NULL;
3953 /* the following variable is used for caching string comparisons
3954 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3955 * 3=ignore, 4=xmlcharrefreplace */
3956 int known_errorHandler = -1;
3957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 if (mapping == NULL) {
3959 PyErr_BadArgument();
3960 return NULL;
3961 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962
3963 /* allocate enough for a simple 1:1 translation without
3964 replacements, if we need more, we'll resize */
3965 res = PyUnicode_FromUnicode(NULL, size);
3966 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003967 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 return res;
3970 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 while (p<endp) {
3973 /* try to encode it */
3974 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003975 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 goto onError;
3978 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003979 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 if (x!=Py_None) /* it worked => adjust input pointer */
3981 ++p;
3982 else { /* untranslatable character */
3983 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t repsize;
3985 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 Py_UNICODE *uni2;
3987 /* startpos for collecting untranslatable chars */
3988 const Py_UNICODE *collstart = p;
3989 const Py_UNICODE *collend = p+1;
3990 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 /* find all untranslatable characters */
3993 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003994 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 goto onError;
3996 Py_XDECREF(x);
3997 if (x!=Py_None)
3998 break;
3999 ++collend;
4000 }
4001 /* cache callback name lookup
4002 * (if not done yet, i.e. it's the first error) */
4003 if (known_errorHandler==-1) {
4004 if ((errors==NULL) || (!strcmp(errors, "strict")))
4005 known_errorHandler = 1;
4006 else if (!strcmp(errors, "replace"))
4007 known_errorHandler = 2;
4008 else if (!strcmp(errors, "ignore"))
4009 known_errorHandler = 3;
4010 else if (!strcmp(errors, "xmlcharrefreplace"))
4011 known_errorHandler = 4;
4012 else
4013 known_errorHandler = 0;
4014 }
4015 switch (known_errorHandler) {
4016 case 1: /* strict */
4017 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4018 goto onError;
4019 case 2: /* replace */
4020 /* No need to check for space, this is a 1:1 replacement */
4021 for (coll = collstart; coll<collend; ++coll)
4022 *str++ = '?';
4023 /* fall through */
4024 case 3: /* ignore */
4025 p = collend;
4026 break;
4027 case 4: /* xmlcharrefreplace */
4028 /* generate replacement (temporarily (mis)uses p) */
4029 for (p = collstart; p < collend; ++p) {
4030 char buffer[2+29+1+1];
4031 char *cp;
4032 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004033 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4035 goto onError;
4036 for (cp = buffer; *cp; ++cp)
4037 *str++ = *cp;
4038 }
4039 p = collend;
4040 break;
4041 default:
4042 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4043 reason, startp, size, &exc,
4044 collstart-startp, collend-startp, &newpos);
4045 if (repunicode == NULL)
4046 goto onError;
4047 /* generate replacement */
4048 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004049 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4051 Py_DECREF(repunicode);
4052 goto onError;
4053 }
4054 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4055 *str++ = *uni2;
4056 p = startp + newpos;
4057 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 }
4059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 /* Resize if we allocated to much */
4062 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004063 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004064 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004065 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 }
4067 Py_XDECREF(exc);
4068 Py_XDECREF(errorHandler);
4069 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 onError:
4072 Py_XDECREF(res);
4073 Py_XDECREF(exc);
4074 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 return NULL;
4076}
4077
4078PyObject *PyUnicode_Translate(PyObject *str,
4079 PyObject *mapping,
4080 const char *errors)
4081{
4082 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004083
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 str = PyUnicode_FromObject(str);
4085 if (str == NULL)
4086 goto onError;
4087 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4088 PyUnicode_GET_SIZE(str),
4089 mapping,
4090 errors);
4091 Py_DECREF(str);
4092 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004093
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 onError:
4095 Py_XDECREF(str);
4096 return NULL;
4097}
Tim Petersced69f82003-09-16 20:30:58 +00004098
Guido van Rossum9e896b32000-04-05 20:11:21 +00004099/* --- Decimal Encoder ---------------------------------------------------- */
4100
4101int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004102 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004103 char *output,
4104 const char *errors)
4105{
4106 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 PyObject *errorHandler = NULL;
4108 PyObject *exc = NULL;
4109 const char *encoding = "decimal";
4110 const char *reason = "invalid decimal Unicode string";
4111 /* the following variable is used for caching string comparisons
4112 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4113 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004114
4115 if (output == NULL) {
4116 PyErr_BadArgument();
4117 return -1;
4118 }
4119
4120 p = s;
4121 end = s + length;
4122 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004124 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004126 Py_ssize_t repsize;
4127 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 Py_UNICODE *uni2;
4129 Py_UNICODE *collstart;
4130 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004131
Guido van Rossum9e896b32000-04-05 20:11:21 +00004132 if (Py_UNICODE_ISSPACE(ch)) {
4133 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004135 continue;
4136 }
4137 decimal = Py_UNICODE_TODECIMAL(ch);
4138 if (decimal >= 0) {
4139 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004141 continue;
4142 }
Guido van Rossumba477042000-04-06 18:18:10 +00004143 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004144 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004146 continue;
4147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 /* All other characters are considered unencodable */
4149 collstart = p;
4150 collend = p+1;
4151 while (collend < end) {
4152 if ((0 < *collend && *collend < 256) ||
4153 !Py_UNICODE_ISSPACE(*collend) ||
4154 Py_UNICODE_TODECIMAL(*collend))
4155 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004156 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 /* cache callback name lookup
4158 * (if not done yet, i.e. it's the first error) */
4159 if (known_errorHandler==-1) {
4160 if ((errors==NULL) || (!strcmp(errors, "strict")))
4161 known_errorHandler = 1;
4162 else if (!strcmp(errors, "replace"))
4163 known_errorHandler = 2;
4164 else if (!strcmp(errors, "ignore"))
4165 known_errorHandler = 3;
4166 else if (!strcmp(errors, "xmlcharrefreplace"))
4167 known_errorHandler = 4;
4168 else
4169 known_errorHandler = 0;
4170 }
4171 switch (known_errorHandler) {
4172 case 1: /* strict */
4173 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4174 goto onError;
4175 case 2: /* replace */
4176 for (p = collstart; p < collend; ++p)
4177 *output++ = '?';
4178 /* fall through */
4179 case 3: /* ignore */
4180 p = collend;
4181 break;
4182 case 4: /* xmlcharrefreplace */
4183 /* generate replacement (temporarily (mis)uses p) */
4184 for (p = collstart; p < collend; ++p)
4185 output += sprintf(output, "&#%d;", (int)*p);
4186 p = collend;
4187 break;
4188 default:
4189 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4190 encoding, reason, s, length, &exc,
4191 collstart-s, collend-s, &newpos);
4192 if (repunicode == NULL)
4193 goto onError;
4194 /* generate replacement */
4195 repsize = PyUnicode_GET_SIZE(repunicode);
4196 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4197 Py_UNICODE ch = *uni2;
4198 if (Py_UNICODE_ISSPACE(ch))
4199 *output++ = ' ';
4200 else {
4201 decimal = Py_UNICODE_TODECIMAL(ch);
4202 if (decimal >= 0)
4203 *output++ = '0' + decimal;
4204 else if (0 < ch && ch < 256)
4205 *output++ = (char)ch;
4206 else {
4207 Py_DECREF(repunicode);
4208 raise_encode_exception(&exc, encoding,
4209 s, length, collstart-s, collend-s, reason);
4210 goto onError;
4211 }
4212 }
4213 }
4214 p = s + newpos;
4215 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004216 }
4217 }
4218 /* 0-terminate the output string */
4219 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 Py_XDECREF(exc);
4221 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004222 return 0;
4223
4224 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 Py_XDECREF(exc);
4226 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004227 return -1;
4228}
4229
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230/* --- Helpers ------------------------------------------------------------ */
4231
Thomas Wouters477c8d52006-05-27 19:21:47 +00004232#define STRINGLIB_CHAR Py_UNICODE
4233
4234#define STRINGLIB_LEN PyUnicode_GET_SIZE
4235#define STRINGLIB_NEW PyUnicode_FromUnicode
4236#define STRINGLIB_STR PyUnicode_AS_UNICODE
4237
4238Py_LOCAL_INLINE(int)
4239STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004241 if (str[0] != other[0])
4242 return 1;
4243 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244}
4245
Thomas Wouters477c8d52006-05-27 19:21:47 +00004246#define STRINGLIB_EMPTY unicode_empty
4247
4248#include "stringlib/fastsearch.h"
4249
4250#include "stringlib/count.h"
4251#include "stringlib/find.h"
4252#include "stringlib/partition.h"
4253
/* Helper macro that normalises the local variables `start` and `end`
   against (obj)->length: a negative value is taken relative to the
   length and then floored at 0, and `end` is first capped at the
   length.  Expands to bare statements and modifies `start` and `end`
   in the enclosing scope. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4266
Martin v. Löwis18e16552006-02-15 17:27:45 +00004267Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004268 PyObject *substr,
4269 Py_ssize_t start,
4270 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004272 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004273 PyUnicodeObject* str_obj;
4274 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004275
Thomas Wouters477c8d52006-05-27 19:21:47 +00004276 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4277 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004279 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4280 if (!sub_obj) {
4281 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282 return -1;
4283 }
Tim Petersced69f82003-09-16 20:30:58 +00004284
Thomas Wouters477c8d52006-05-27 19:21:47 +00004285 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004286
Thomas Wouters477c8d52006-05-27 19:21:47 +00004287 result = stringlib_count(
4288 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4289 );
4290
4291 Py_DECREF(sub_obj);
4292 Py_DECREF(str_obj);
4293
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 return result;
4295}
4296
Martin v. Löwis18e16552006-02-15 17:27:45 +00004297Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004298 PyObject *sub,
4299 Py_ssize_t start,
4300 Py_ssize_t end,
4301 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004304
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004306 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004307 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004308 sub = PyUnicode_FromObject(sub);
4309 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004310 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004311 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 }
Tim Petersced69f82003-09-16 20:30:58 +00004313
Thomas Wouters477c8d52006-05-27 19:21:47 +00004314 if (direction > 0)
4315 result = stringlib_find_slice(
4316 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4317 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4318 start, end
4319 );
4320 else
4321 result = stringlib_rfind_slice(
4322 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4323 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4324 start, end
4325 );
4326
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004328 Py_DECREF(sub);
4329
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 return result;
4331}
4332
Tim Petersced69f82003-09-16 20:30:58 +00004333static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334int tailmatch(PyUnicodeObject *self,
4335 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004336 Py_ssize_t start,
4337 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 int direction)
4339{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 if (substring->length == 0)
4341 return 1;
4342
Thomas Wouters477c8d52006-05-27 19:21:47 +00004343 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344
4345 end -= substring->length;
4346 if (end < start)
4347 return 0;
4348
4349 if (direction > 0) {
4350 if (Py_UNICODE_MATCH(self, end, substring))
4351 return 1;
4352 } else {
4353 if (Py_UNICODE_MATCH(self, start, substring))
4354 return 1;
4355 }
4356
4357 return 0;
4358}
4359
Martin v. Löwis18e16552006-02-15 17:27:45 +00004360Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004362 Py_ssize_t start,
4363 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 int direction)
4365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004366 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004367
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 str = PyUnicode_FromObject(str);
4369 if (str == NULL)
4370 return -1;
4371 substr = PyUnicode_FromObject(substr);
4372 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004373 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 return -1;
4375 }
Tim Petersced69f82003-09-16 20:30:58 +00004376
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 result = tailmatch((PyUnicodeObject *)str,
4378 (PyUnicodeObject *)substr,
4379 start, end, direction);
4380 Py_DECREF(str);
4381 Py_DECREF(substr);
4382 return result;
4383}
4384
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385/* Apply fixfct filter to the Unicode object self and return a
4386 reference to the modified object */
4387
Tim Petersced69f82003-09-16 20:30:58 +00004388static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389PyObject *fixup(PyUnicodeObject *self,
4390 int (*fixfct)(PyUnicodeObject *s))
4391{
4392
4393 PyUnicodeObject *u;
4394
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004395 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 if (u == NULL)
4397 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004398
4399 Py_UNICODE_COPY(u->str, self->str, self->length);
4400
Tim Peters7a29bd52001-09-12 03:03:31 +00004401 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 /* fixfct should return TRUE if it modified the buffer. If
4403 FALSE, return a reference to the original buffer instead
4404 (to save space, not time) */
4405 Py_INCREF(self);
4406 Py_DECREF(u);
4407 return (PyObject*) self;
4408 }
4409 return (PyObject*) u;
4410}
4411
Tim Petersced69f82003-09-16 20:30:58 +00004412static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413int fixupper(PyUnicodeObject *self)
4414{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 Py_UNICODE *s = self->str;
4417 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004418
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 while (len-- > 0) {
4420 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004421
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 ch = Py_UNICODE_TOUPPER(*s);
4423 if (ch != *s) {
4424 status = 1;
4425 *s = ch;
4426 }
4427 s++;
4428 }
4429
4430 return status;
4431}
4432
Tim Petersced69f82003-09-16 20:30:58 +00004433static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434int fixlower(PyUnicodeObject *self)
4435{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 Py_UNICODE *s = self->str;
4438 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 while (len-- > 0) {
4441 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004442
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 ch = Py_UNICODE_TOLOWER(*s);
4444 if (ch != *s) {
4445 status = 1;
4446 *s = ch;
4447 }
4448 s++;
4449 }
4450
4451 return status;
4452}
4453
Tim Petersced69f82003-09-16 20:30:58 +00004454static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455int fixswapcase(PyUnicodeObject *self)
4456{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004457 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 Py_UNICODE *s = self->str;
4459 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 while (len-- > 0) {
4462 if (Py_UNICODE_ISUPPER(*s)) {
4463 *s = Py_UNICODE_TOLOWER(*s);
4464 status = 1;
4465 } else if (Py_UNICODE_ISLOWER(*s)) {
4466 *s = Py_UNICODE_TOUPPER(*s);
4467 status = 1;
4468 }
4469 s++;
4470 }
4471
4472 return status;
4473}
4474
Tim Petersced69f82003-09-16 20:30:58 +00004475static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476int fixcapitalize(PyUnicodeObject *self)
4477{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004478 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004479 Py_UNICODE *s = self->str;
4480 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004481
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004482 if (len == 0)
4483 return 0;
4484 if (Py_UNICODE_ISLOWER(*s)) {
4485 *s = Py_UNICODE_TOUPPER(*s);
4486 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004488 s++;
4489 while (--len > 0) {
4490 if (Py_UNICODE_ISUPPER(*s)) {
4491 *s = Py_UNICODE_TOLOWER(*s);
4492 status = 1;
4493 }
4494 s++;
4495 }
4496 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497}
4498
4499static
4500int fixtitle(PyUnicodeObject *self)
4501{
4502 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4503 register Py_UNICODE *e;
4504 int previous_is_cased;
4505
4506 /* Shortcut for single character strings */
4507 if (PyUnicode_GET_SIZE(self) == 1) {
4508 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4509 if (*p != ch) {
4510 *p = ch;
4511 return 1;
4512 }
4513 else
4514 return 0;
4515 }
Tim Petersced69f82003-09-16 20:30:58 +00004516
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517 e = p + PyUnicode_GET_SIZE(self);
4518 previous_is_cased = 0;
4519 for (; p < e; p++) {
4520 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004521
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 if (previous_is_cased)
4523 *p = Py_UNICODE_TOLOWER(ch);
4524 else
4525 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004526
4527 if (Py_UNICODE_ISLOWER(ch) ||
4528 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 Py_UNICODE_ISTITLE(ch))
4530 previous_is_cased = 1;
4531 else
4532 previous_is_cased = 0;
4533 }
4534 return 1;
4535}
4536
Tim Peters8ce9f162004-08-27 01:49:32 +00004537PyObject *
4538PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539{
Tim Peters8ce9f162004-08-27 01:49:32 +00004540 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004541 const Py_UNICODE blank = ' ';
4542 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004543 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004544 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004545 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4546 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004547 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4548 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004549 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004550 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004551 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552
Tim Peters05eba1f2004-08-27 21:32:02 +00004553 fseq = PySequence_Fast(seq, "");
4554 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004555 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004556 }
4557
Tim Peters91879ab2004-08-27 22:35:44 +00004558 /* Grrrr. A codec may be invoked to convert str objects to
4559 * Unicode, and so it's possible to call back into Python code
4560 * during PyUnicode_FromObject(), and so it's possible for a sick
4561 * codec to change the size of fseq (if seq is a list). Therefore
4562 * we have to keep refetching the size -- can't assume seqlen
4563 * is invariant.
4564 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004565 seqlen = PySequence_Fast_GET_SIZE(fseq);
4566 /* If empty sequence, return u"". */
4567 if (seqlen == 0) {
4568 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4569 goto Done;
4570 }
4571 /* If singleton sequence with an exact Unicode, return that. */
4572 if (seqlen == 1) {
4573 item = PySequence_Fast_GET_ITEM(fseq, 0);
4574 if (PyUnicode_CheckExact(item)) {
4575 Py_INCREF(item);
4576 res = (PyUnicodeObject *)item;
4577 goto Done;
4578 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004579 }
4580
Tim Peters05eba1f2004-08-27 21:32:02 +00004581 /* At least two items to join, or one that isn't exact Unicode. */
4582 if (seqlen > 1) {
4583 /* Set up sep and seplen -- they're needed. */
4584 if (separator == NULL) {
4585 sep = &blank;
4586 seplen = 1;
4587 }
4588 else {
4589 internal_separator = PyUnicode_FromObject(separator);
4590 if (internal_separator == NULL)
4591 goto onError;
4592 sep = PyUnicode_AS_UNICODE(internal_separator);
4593 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004594 /* In case PyUnicode_FromObject() mutated seq. */
4595 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004596 }
4597 }
4598
4599 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004600 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004601 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004602 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004603 res_p = PyUnicode_AS_UNICODE(res);
4604 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004605
Tim Peters05eba1f2004-08-27 21:32:02 +00004606 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004607 Py_ssize_t itemlen;
4608 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004609
4610 item = PySequence_Fast_GET_ITEM(fseq, i);
4611 /* Convert item to Unicode. */
4612 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4613 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004614 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004615 " %.80s found",
4616 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004617 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004618 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004619 item = PyUnicode_FromObject(item);
4620 if (item == NULL)
4621 goto onError;
4622 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004623
Tim Peters91879ab2004-08-27 22:35:44 +00004624 /* In case PyUnicode_FromObject() mutated seq. */
4625 seqlen = PySequence_Fast_GET_SIZE(fseq);
4626
Tim Peters8ce9f162004-08-27 01:49:32 +00004627 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004629 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004630 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004631 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 if (i < seqlen - 1) {
4633 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004634 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004635 goto Overflow;
4636 }
4637 if (new_res_used > res_alloc) {
4638 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004639 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004640 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004641 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004642 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004643 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004644 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004645 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004647 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004648 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004650
4651 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004652 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004653 res_p += itemlen;
4654 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004655 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004656 res_p += seplen;
4657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004659 res_used = new_res_used;
4660 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004661
Tim Peters05eba1f2004-08-27 21:32:02 +00004662 /* Shrink res to match the used area; this probably can't fail,
4663 * but it's cheap to check.
4664 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004665 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004666 goto onError;
4667
4668 Done:
4669 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004670 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 return (PyObject *)res;
4672
Tim Peters8ce9f162004-08-27 01:49:32 +00004673 Overflow:
4674 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004675 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004676 Py_DECREF(item);
4677 /* fall through */
4678
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004680 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004682 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 return NULL;
4684}
4685
Tim Petersced69f82003-09-16 20:30:58 +00004686static
4687PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 Py_ssize_t left,
4689 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 Py_UNICODE fill)
4691{
4692 PyUnicodeObject *u;
4693
4694 if (left < 0)
4695 left = 0;
4696 if (right < 0)
4697 right = 0;
4698
Tim Peters7a29bd52001-09-12 03:03:31 +00004699 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 Py_INCREF(self);
4701 return self;
4702 }
4703
4704 u = _PyUnicode_New(left + self->length + right);
4705 if (u) {
4706 if (left)
4707 Py_UNICODE_FILL(u->str, fill, left);
4708 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4709 if (right)
4710 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4711 }
4712
4713 return u;
4714}
4715
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro deliberately captures three names from the calling function:
   the scratch variable `str`, the target `list`, and the label `onError`,
   which is jumped to if creating or appending the slice fails.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and is safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4726
4727static
4728PyObject *split_whitespace(PyUnicodeObject *self,
4729 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004730 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004732 register Py_ssize_t i;
4733 register Py_ssize_t j;
4734 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 PyObject *str;
4736
4737 for (i = j = 0; i < len; ) {
4738 /* find a token */
4739 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4740 i++;
4741 j = i;
4742 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4743 i++;
4744 if (j < i) {
4745 if (maxcount-- <= 0)
4746 break;
4747 SPLIT_APPEND(self->str, j, i);
4748 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4749 i++;
4750 j = i;
4751 }
4752 }
4753 if (j < len) {
4754 SPLIT_APPEND(self->str, j, len);
4755 }
4756 return list;
4757
4758 onError:
4759 Py_DECREF(list);
4760 return NULL;
4761}
4762
4763PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004764 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 register Py_ssize_t i;
4767 register Py_ssize_t j;
4768 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 PyObject *list;
4770 PyObject *str;
4771 Py_UNICODE *data;
4772
4773 string = PyUnicode_FromObject(string);
4774 if (string == NULL)
4775 return NULL;
4776 data = PyUnicode_AS_UNICODE(string);
4777 len = PyUnicode_GET_SIZE(string);
4778
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 list = PyList_New(0);
4780 if (!list)
4781 goto onError;
4782
4783 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004785
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004787 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789
4790 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004791 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 if (i < len) {
4793 if (data[i] == '\r' && i + 1 < len &&
4794 data[i+1] == '\n')
4795 i += 2;
4796 else
4797 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004798 if (keepends)
4799 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 }
Guido van Rossum86662912000-04-11 15:38:46 +00004801 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 j = i;
4803 }
4804 if (j < len) {
4805 SPLIT_APPEND(data, j, len);
4806 }
4807
4808 Py_DECREF(string);
4809 return list;
4810
4811 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004812 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 Py_DECREF(string);
4814 return NULL;
4815}
4816
Tim Petersced69f82003-09-16 20:30:58 +00004817static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818PyObject *split_char(PyUnicodeObject *self,
4819 PyObject *list,
4820 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004821 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 register Py_ssize_t i;
4824 register Py_ssize_t j;
4825 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 PyObject *str;
4827
4828 for (i = j = 0; i < len; ) {
4829 if (self->str[i] == ch) {
4830 if (maxcount-- <= 0)
4831 break;
4832 SPLIT_APPEND(self->str, j, i);
4833 i = j = i + 1;
4834 } else
4835 i++;
4836 }
4837 if (j <= len) {
4838 SPLIT_APPEND(self->str, j, len);
4839 }
4840 return list;
4841
4842 onError:
4843 Py_DECREF(list);
4844 return NULL;
4845}
4846
Tim Petersced69f82003-09-16 20:30:58 +00004847static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848PyObject *split_substring(PyUnicodeObject *self,
4849 PyObject *list,
4850 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004853 register Py_ssize_t i;
4854 register Py_ssize_t j;
4855 Py_ssize_t len = self->length;
4856 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 PyObject *str;
4858
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004859 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 if (Py_UNICODE_MATCH(self, i, substring)) {
4861 if (maxcount-- <= 0)
4862 break;
4863 SPLIT_APPEND(self->str, j, i);
4864 i = j = i + sublen;
4865 } else
4866 i++;
4867 }
4868 if (j <= len) {
4869 SPLIT_APPEND(self->str, j, len);
4870 }
4871 return list;
4872
4873 onError:
4874 Py_DECREF(list);
4875 return NULL;
4876}
4877
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004878static
4879PyObject *rsplit_whitespace(PyUnicodeObject *self,
4880 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004882{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004883 register Py_ssize_t i;
4884 register Py_ssize_t j;
4885 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004886 PyObject *str;
4887
4888 for (i = j = len - 1; i >= 0; ) {
4889 /* find a token */
4890 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4891 i--;
4892 j = i;
4893 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4894 i--;
4895 if (j > i) {
4896 if (maxcount-- <= 0)
4897 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004898 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004899 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4900 i--;
4901 j = i;
4902 }
4903 }
4904 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004905 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004906 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004907 if (PyList_Reverse(list) < 0)
4908 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004909 return list;
4910
4911 onError:
4912 Py_DECREF(list);
4913 return NULL;
4914}
4915
4916static
4917PyObject *rsplit_char(PyUnicodeObject *self,
4918 PyObject *list,
4919 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004920 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004921{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004922 register Py_ssize_t i;
4923 register Py_ssize_t j;
4924 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004925 PyObject *str;
4926
4927 for (i = j = len - 1; i >= 0; ) {
4928 if (self->str[i] == ch) {
4929 if (maxcount-- <= 0)
4930 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004931 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004932 j = i = i - 1;
4933 } else
4934 i--;
4935 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004936 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004937 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004938 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004939 if (PyList_Reverse(list) < 0)
4940 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004941 return list;
4942
4943 onError:
4944 Py_DECREF(list);
4945 return NULL;
4946}
4947
4948static
4949PyObject *rsplit_substring(PyUnicodeObject *self,
4950 PyObject *list,
4951 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004952 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004953{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004954 register Py_ssize_t i;
4955 register Py_ssize_t j;
4956 Py_ssize_t len = self->length;
4957 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004958 PyObject *str;
4959
4960 for (i = len - sublen, j = len; i >= 0; ) {
4961 if (Py_UNICODE_MATCH(self, i, substring)) {
4962 if (maxcount-- <= 0)
4963 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004964 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004965 j = i;
4966 i -= sublen;
4967 } else
4968 i--;
4969 }
4970 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004971 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004973 if (PyList_Reverse(list) < 0)
4974 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004975 return list;
4976
4977 onError:
4978 Py_DECREF(list);
4979 return NULL;
4980}
4981
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982#undef SPLIT_APPEND
4983
4984static
4985PyObject *split(PyUnicodeObject *self,
4986 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988{
4989 PyObject *list;
4990
4991 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004992 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993
4994 list = PyList_New(0);
4995 if (!list)
4996 return NULL;
4997
4998 if (substring == NULL)
4999 return split_whitespace(self,list,maxcount);
5000
5001 else if (substring->length == 1)
5002 return split_char(self,list,substring->str[0],maxcount);
5003
5004 else if (substring->length == 0) {
5005 Py_DECREF(list);
5006 PyErr_SetString(PyExc_ValueError, "empty separator");
5007 return NULL;
5008 }
5009 else
5010 return split_substring(self,list,substring,maxcount);
5011}
5012
Tim Petersced69f82003-09-16 20:30:58 +00005013static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005014PyObject *rsplit(PyUnicodeObject *self,
5015 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005016 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005017{
5018 PyObject *list;
5019
5020 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005021 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005022
5023 list = PyList_New(0);
5024 if (!list)
5025 return NULL;
5026
5027 if (substring == NULL)
5028 return rsplit_whitespace(self,list,maxcount);
5029
5030 else if (substring->length == 1)
5031 return rsplit_char(self,list,substring->str[0],maxcount);
5032
5033 else if (substring->length == 0) {
5034 Py_DECREF(list);
5035 PyErr_SetString(PyExc_ValueError, "empty separator");
5036 return NULL;
5037 }
5038 else
5039 return rsplit_substring(self,list,substring,maxcount);
5040}
5041
5042static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043PyObject *replace(PyUnicodeObject *self,
5044 PyUnicodeObject *str1,
5045 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005046 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047{
5048 PyUnicodeObject *u;
5049
5050 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005051 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052
Thomas Wouters477c8d52006-05-27 19:21:47 +00005053 if (str1->length == str2->length) {
5054 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005055 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 if (str1->length == 1) {
5057 /* replace characters */
5058 Py_UNICODE u1, u2;
5059 if (!findchar(self->str, self->length, str1->str[0]))
5060 goto nothing;
5061 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5062 if (!u)
5063 return NULL;
5064 Py_UNICODE_COPY(u->str, self->str, self->length);
5065 u1 = str1->str[0];
5066 u2 = str2->str[0];
5067 for (i = 0; i < u->length; i++)
5068 if (u->str[i] == u1) {
5069 if (--maxcount < 0)
5070 break;
5071 u->str[i] = u2;
5072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005074 i = fastsearch(
5075 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005077 if (i < 0)
5078 goto nothing;
5079 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5080 if (!u)
5081 return NULL;
5082 Py_UNICODE_COPY(u->str, self->str, self->length);
5083 while (i <= self->length - str1->length)
5084 if (Py_UNICODE_MATCH(self, i, str1)) {
5085 if (--maxcount < 0)
5086 break;
5087 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5088 i += str1->length;
5089 } else
5090 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005093
5094 Py_ssize_t n, i, j, e;
5095 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 Py_UNICODE *p;
5097
5098 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005099 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 if (n > maxcount)
5101 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005102 if (n == 0)
5103 goto nothing;
5104 /* new_size = self->length + n * (str2->length - str1->length)); */
5105 delta = (str2->length - str1->length);
5106 if (delta == 0) {
5107 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005109 product = n * (str2->length - str1->length);
5110 if ((product / (str2->length - str1->length)) != n) {
5111 PyErr_SetString(PyExc_OverflowError,
5112 "replace string is too long");
5113 return NULL;
5114 }
5115 new_size = self->length + product;
5116 if (new_size < 0) {
5117 PyErr_SetString(PyExc_OverflowError,
5118 "replace string is too long");
5119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 }
5121 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005122 u = _PyUnicode_New(new_size);
5123 if (!u)
5124 return NULL;
5125 i = 0;
5126 p = u->str;
5127 e = self->length - str1->length;
5128 if (str1->length > 0) {
5129 while (n-- > 0) {
5130 /* look for next match */
5131 j = i;
5132 while (j <= e) {
5133 if (Py_UNICODE_MATCH(self, j, str1))
5134 break;
5135 j++;
5136 }
5137 if (j > i) {
5138 if (j > e)
5139 break;
5140 /* copy unchanged part [i:j] */
5141 Py_UNICODE_COPY(p, self->str+i, j-i);
5142 p += j - i;
5143 }
5144 /* copy substitution string */
5145 if (str2->length > 0) {
5146 Py_UNICODE_COPY(p, str2->str, str2->length);
5147 p += str2->length;
5148 }
5149 i = j + str1->length;
5150 }
5151 if (i < self->length)
5152 /* copy tail [i:] */
5153 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5154 } else {
5155 /* interleave */
5156 while (n > 0) {
5157 Py_UNICODE_COPY(p, str2->str, str2->length);
5158 p += str2->length;
5159 if (--n <= 0)
5160 break;
5161 *p++ = self->str[i++];
5162 }
5163 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005167
5168nothing:
5169 /* nothing to replace; return original string (when possible) */
5170 if (PyUnicode_CheckExact(self)) {
5171 Py_INCREF(self);
5172 return (PyObject *) self;
5173 }
5174 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175}
5176
5177/* --- Unicode Object Methods --------------------------------------------- */
5178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005179PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180"S.title() -> unicode\n\
5181\n\
5182Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005183characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
5185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005186unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 return fixup(self, fixtitle);
5189}
5190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005191PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192"S.capitalize() -> unicode\n\
5193\n\
5194Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005195have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005198unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 return fixup(self, fixcapitalize);
5201}
5202
5203#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005204PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205"S.capwords() -> unicode\n\
5206\n\
5207Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005208normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209
5210static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005211unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212{
5213 PyObject *list;
5214 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005215 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 /* Split into words */
5218 list = split(self, NULL, -1);
5219 if (!list)
5220 return NULL;
5221
5222 /* Capitalize each word */
5223 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5224 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5225 fixcapitalize);
5226 if (item == NULL)
5227 goto onError;
5228 Py_DECREF(PyList_GET_ITEM(list, i));
5229 PyList_SET_ITEM(list, i, item);
5230 }
5231
5232 /* Join the words to form a new string */
5233 item = PyUnicode_Join(NULL, list);
5234
5235onError:
5236 Py_DECREF(list);
5237 return (PyObject *)item;
5238}
5239#endif
5240
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005241/* Argument converter. Coerces to a single unicode character */
5242
5243static int
5244convert_uc(PyObject *obj, void *addr)
5245{
5246 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5247 PyObject *uniobj;
5248 Py_UNICODE *unistr;
5249
5250 uniobj = PyUnicode_FromObject(obj);
5251 if (uniobj == NULL) {
5252 PyErr_SetString(PyExc_TypeError,
5253 "The fill character cannot be converted to Unicode");
5254 return 0;
5255 }
5256 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5257 PyErr_SetString(PyExc_TypeError,
5258 "The fill character must be exactly one character long");
5259 Py_DECREF(uniobj);
5260 return 0;
5261 }
5262 unistr = PyUnicode_AS_UNICODE(uniobj);
5263 *fillcharloc = unistr[0];
5264 Py_DECREF(uniobj);
5265 return 1;
5266}
5267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005268PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005269"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005271Return S centered in a Unicode string of length width. Padding is\n\
5272done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
5274static PyObject *
5275unicode_center(PyUnicodeObject *self, PyObject *args)
5276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 Py_ssize_t marg, left;
5278 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005279 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
Thomas Woutersde017742006-02-16 19:34:37 +00005281 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 return NULL;
5283
Tim Peters7a29bd52001-09-12 03:03:31 +00005284 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 Py_INCREF(self);
5286 return (PyObject*) self;
5287 }
5288
5289 marg = width - self->length;
5290 left = marg / 2 + (marg & width & 1);
5291
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005292 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293}
5294
Marc-André Lemburge5034372000-08-08 08:04:29 +00005295#if 0
5296
5297/* This code should go into some future Unicode collation support
5298 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005299 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005300
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005301/* speedy UTF-16 code point order comparison */
5302/* gleaned from: */
5303/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5304
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005305static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005306{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005307 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005308 0, 0, 0, 0, 0, 0, 0, 0,
5309 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005310 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005311};
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313static int
5314unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5315{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005316 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 Py_UNICODE *s1 = str1->str;
5319 Py_UNICODE *s2 = str2->str;
5320
5321 len1 = str1->length;
5322 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005325 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005326
5327 c1 = *s1++;
5328 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005329
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005330 if (c1 > (1<<11) * 26)
5331 c1 += utf16Fixup[c1>>11];
5332 if (c2 > (1<<11) * 26)
5333 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005334 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005335
5336 if (c1 != c2)
5337 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005338
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005339 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 }
5341
5342 return (len1 < len2) ? -1 : (len1 != len2);
5343}
5344
Marc-André Lemburge5034372000-08-08 08:04:29 +00005345#else
5346
5347static int
5348unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005350 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005351
5352 Py_UNICODE *s1 = str1->str;
5353 Py_UNICODE *s2 = str2->str;
5354
5355 len1 = str1->length;
5356 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Marc-André Lemburge5034372000-08-08 08:04:29 +00005358 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005359 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005360
Fredrik Lundh45714e92001-06-26 16:39:36 +00005361 c1 = *s1++;
5362 c2 = *s2++;
5363
5364 if (c1 != c2)
5365 return (c1 < c2) ? -1 : 1;
5366
Marc-André Lemburge5034372000-08-08 08:04:29 +00005367 len1--; len2--;
5368 }
5369
5370 return (len1 < len2) ? -1 : (len1 != len2);
5371}
5372
5373#endif
5374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375int PyUnicode_Compare(PyObject *left,
5376 PyObject *right)
5377{
5378 PyUnicodeObject *u = NULL, *v = NULL;
5379 int result;
5380
5381 /* Coerce the two arguments */
5382 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5383 if (u == NULL)
5384 goto onError;
5385 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5386 if (v == NULL)
5387 goto onError;
5388
Thomas Wouters7e474022000-07-16 12:04:32 +00005389 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 if (v == u) {
5391 Py_DECREF(u);
5392 Py_DECREF(v);
5393 return 0;
5394 }
5395
5396 result = unicode_compare(u, v);
5397
5398 Py_DECREF(u);
5399 Py_DECREF(v);
5400 return result;
5401
5402onError:
5403 Py_XDECREF(u);
5404 Py_XDECREF(v);
5405 return -1;
5406}
5407
Guido van Rossum403d68b2000-03-13 15:55:09 +00005408int PyUnicode_Contains(PyObject *container,
5409 PyObject *element)
5410{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005411 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005412 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005413
5414 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005415 sub = PyUnicode_FromObject(element);
5416 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005417 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005418 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005420 }
5421
Thomas Wouters477c8d52006-05-27 19:21:47 +00005422 str = PyUnicode_FromObject(container);
5423 if (!str) {
5424 Py_DECREF(sub);
5425 return -1;
5426 }
5427
5428 result = stringlib_contains_obj(str, sub);
5429
5430 Py_DECREF(str);
5431 Py_DECREF(sub);
5432
Guido van Rossum403d68b2000-03-13 15:55:09 +00005433 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005434}
5435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436/* Concat to string or Unicode object giving a new Unicode object. */
5437
5438PyObject *PyUnicode_Concat(PyObject *left,
5439 PyObject *right)
5440{
5441 PyUnicodeObject *u = NULL, *v = NULL, *w;
5442
5443 /* Coerce the two arguments */
5444 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5445 if (u == NULL)
5446 goto onError;
5447 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5448 if (v == NULL)
5449 goto onError;
5450
5451 /* Shortcuts */
5452 if (v == unicode_empty) {
5453 Py_DECREF(v);
5454 return (PyObject *)u;
5455 }
5456 if (u == unicode_empty) {
5457 Py_DECREF(u);
5458 return (PyObject *)v;
5459 }
5460
5461 /* Concat the two Unicode strings */
5462 w = _PyUnicode_New(u->length + v->length);
5463 if (w == NULL)
5464 goto onError;
5465 Py_UNICODE_COPY(w->str, u->str, u->length);
5466 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5467
5468 Py_DECREF(u);
5469 Py_DECREF(v);
5470 return (PyObject *)w;
5471
5472onError:
5473 Py_XDECREF(u);
5474 Py_XDECREF(v);
5475 return NULL;
5476}
5477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005478PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479"S.count(sub[, start[, end]]) -> int\n\
5480\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005481Return the number of non-overlapping occurrences of substring sub in\n\
5482Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005483interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
5485static PyObject *
5486unicode_count(PyUnicodeObject *self, PyObject *args)
5487{
5488 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005490 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 PyObject *result;
5492
Guido van Rossumb8872e62000-05-09 14:14:27 +00005493 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5494 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 return NULL;
5496
5497 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005498 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 if (substring == NULL)
5500 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005501
Thomas Wouters477c8d52006-05-27 19:21:47 +00005502 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503
Thomas Wouters477c8d52006-05-27 19:21:47 +00005504 result = PyInt_FromSsize_t(
5505 stringlib_count(self->str + start, end - start,
5506 substring->str, substring->length)
5507 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
5509 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005510
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 return result;
5512}
5513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005514PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005515"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005517Encodes S using the codec registered for encoding. encoding defaults\n\
5518to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005519handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5521'xmlcharrefreplace' as well as any other name registered with\n\
5522codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523
5524static PyObject *
5525unicode_encode(PyUnicodeObject *self, PyObject *args)
5526{
5527 char *encoding = NULL;
5528 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005529 PyObject *v;
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5532 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005533 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005534 if (v == NULL)
5535 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005536 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5537 PyErr_Format(PyExc_TypeError,
5538 "encoder did not return a string/unicode object "
5539 "(type=%.400s)",
5540 v->ob_type->tp_name);
5541 Py_DECREF(v);
5542 return NULL;
5543 }
5544 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005545
5546 onError:
5547 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005548}
5549
5550PyDoc_STRVAR(decode__doc__,
5551"S.decode([encoding[,errors]]) -> string or unicode\n\
5552\n\
5553Decodes S using the codec registered for encoding. encoding defaults\n\
5554to the default encoding. errors may be given to set a different error\n\
5555handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5556a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5557as well as any other name registerd with codecs.register_error that is\n\
5558able to handle UnicodeDecodeErrors.");
5559
5560static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005561unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005562{
5563 char *encoding = NULL;
5564 char *errors = NULL;
5565 PyObject *v;
5566
5567 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5568 return NULL;
5569 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005570 if (v == NULL)
5571 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005572 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5573 PyErr_Format(PyExc_TypeError,
5574 "decoder did not return a string/unicode object "
5575 "(type=%.400s)",
5576 v->ob_type->tp_name);
5577 Py_DECREF(v);
5578 return NULL;
5579 }
5580 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005581
5582 onError:
5583 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584}
5585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005586PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587"S.expandtabs([tabsize]) -> unicode\n\
5588\n\
5589Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005590If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591
5592static PyObject*
5593unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5594{
5595 Py_UNICODE *e;
5596 Py_UNICODE *p;
5597 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005598 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 PyUnicodeObject *u;
5600 int tabsize = 8;
5601
5602 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5603 return NULL;
5604
Thomas Wouters7e474022000-07-16 12:04:32 +00005605 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 i = j = 0;
5607 e = self->str + self->length;
5608 for (p = self->str; p < e; p++)
5609 if (*p == '\t') {
5610 if (tabsize > 0)
5611 j += tabsize - (j % tabsize);
5612 }
5613 else {
5614 j++;
5615 if (*p == '\n' || *p == '\r') {
5616 i += j;
5617 j = 0;
5618 }
5619 }
5620
5621 /* Second pass: create output string and fill it */
5622 u = _PyUnicode_New(i + j);
5623 if (!u)
5624 return NULL;
5625
5626 j = 0;
5627 q = u->str;
5628
5629 for (p = self->str; p < e; p++)
5630 if (*p == '\t') {
5631 if (tabsize > 0) {
5632 i = tabsize - (j % tabsize);
5633 j += i;
5634 while (i--)
5635 *q++ = ' ';
5636 }
5637 }
5638 else {
5639 j++;
5640 *q++ = *p;
5641 if (*p == '\n' || *p == '\r')
5642 j = 0;
5643 }
5644
5645 return (PyObject*) u;
5646}
5647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005648PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649"S.find(sub [,start [,end]]) -> int\n\
5650\n\
5651Return the lowest index in S where substring sub is found,\n\
5652such that sub is contained within s[start,end]. Optional\n\
5653arguments start and end are interpreted as in slice notation.\n\
5654\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005655Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
5657static PyObject *
5658unicode_find(PyUnicodeObject *self, PyObject *args)
5659{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005660 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005661 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005662 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005663 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
Guido van Rossumb8872e62000-05-09 14:14:27 +00005665 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5666 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005668 substring = PyUnicode_FromObject(substring);
5669 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 return NULL;
5671
Thomas Wouters477c8d52006-05-27 19:21:47 +00005672 result = stringlib_find_slice(
5673 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5674 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5675 start, end
5676 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
5678 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005679
5680 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681}
5682
5683static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
5686 if (index < 0 || index >= self->length) {
5687 PyErr_SetString(PyExc_IndexError, "string index out of range");
5688 return NULL;
5689 }
5690
5691 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5692}
5693
5694static long
5695unicode_hash(PyUnicodeObject *self)
5696{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005697 /* Since Unicode objects compare equal to their ASCII string
5698 counterparts, they should use the individual character values
5699 as basis for their hash value. This is needed to assure that
5700 strings and Unicode objects behave in the same way as
5701 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005704 register Py_UNICODE *p;
5705 register long x;
5706
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 if (self->hash != -1)
5708 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005709 len = PyUnicode_GET_SIZE(self);
5710 p = PyUnicode_AS_UNICODE(self);
5711 x = *p << 7;
5712 while (--len >= 0)
5713 x = (1000003*x) ^ *p++;
5714 x ^= PyUnicode_GET_SIZE(self);
5715 if (x == -1)
5716 x = -2;
5717 self->hash = x;
5718 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719}
5720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005721PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722"S.index(sub [,start [,end]]) -> int\n\
5723\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005724Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
5726static PyObject *
5727unicode_index(PyUnicodeObject *self, PyObject *args)
5728{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005730 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005731 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005732 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Guido van Rossumb8872e62000-05-09 14:14:27 +00005734 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5735 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005737 substring = PyUnicode_FromObject(substring);
5738 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 return NULL;
5740
Thomas Wouters477c8d52006-05-27 19:21:47 +00005741 result = stringlib_find_slice(
5742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5744 start, end
5745 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
5747 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005748
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 if (result < 0) {
5750 PyErr_SetString(PyExc_ValueError, "substring not found");
5751 return NULL;
5752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005753
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755}
5756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005757PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005758"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005760Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005761at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
5763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005764unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765{
5766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5767 register const Py_UNICODE *e;
5768 int cased;
5769
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 /* Shortcut for single character strings */
5771 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005772 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005774 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005775 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005776 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005777
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 e = p + PyUnicode_GET_SIZE(self);
5779 cased = 0;
5780 for (; p < e; p++) {
5781 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005782
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005784 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 else if (!cased && Py_UNICODE_ISLOWER(ch))
5786 cased = 1;
5787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005788 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789}
5790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005791PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005792"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005794Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005795at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
5797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005798unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799{
5800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5801 register const Py_UNICODE *e;
5802 int cased;
5803
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 /* Shortcut for single character strings */
5805 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005806 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005808 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005809 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005810 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005811
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 e = p + PyUnicode_GET_SIZE(self);
5813 cased = 0;
5814 for (; p < e; p++) {
5815 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005816
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005818 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 else if (!cased && Py_UNICODE_ISUPPER(ch))
5820 cased = 1;
5821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823}
5824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005825PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005826"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005828Return True if S is a titlecased string and there is at least one\n\
5829character in S, i.e. upper- and titlecase characters may only\n\
5830follow uncased characters and lowercase characters only cased ones.\n\
5831Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
5833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005834unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835{
5836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5837 register const Py_UNICODE *e;
5838 int cased, previous_is_cased;
5839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 /* Shortcut for single character strings */
5841 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005842 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5843 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005846 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005848
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 e = p + PyUnicode_GET_SIZE(self);
5850 cased = 0;
5851 previous_is_cased = 0;
5852 for (; p < e; p++) {
5853 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005854
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5856 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005857 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 previous_is_cased = 1;
5859 cased = 1;
5860 }
5861 else if (Py_UNICODE_ISLOWER(ch)) {
5862 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005863 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 previous_is_cased = 1;
5865 cased = 1;
5866 }
5867 else
5868 previous_is_cased = 0;
5869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005870 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005873PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005874"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005876Return True if all characters in S are whitespace\n\
5877and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
5879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005880unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
5882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5883 register const Py_UNICODE *e;
5884
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 /* Shortcut for single character strings */
5886 if (PyUnicode_GET_SIZE(self) == 1 &&
5887 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005891 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 e = p + PyUnicode_GET_SIZE(self);
5895 for (; p < e; p++) {
5896 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005897 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005899 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900}
5901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005902PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005903"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005905Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005906and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005907
5908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005909unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005910{
5911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5912 register const Py_UNICODE *e;
5913
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005914 /* Shortcut for single character strings */
5915 if (PyUnicode_GET_SIZE(self) == 1 &&
5916 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005917 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005918
5919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005920 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005921 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005922
5923 e = p + PyUnicode_GET_SIZE(self);
5924 for (; p < e; p++) {
5925 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005926 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005928 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005929}
5930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005931PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005932"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005934Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005936
5937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005938unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005939{
5940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5941 register const Py_UNICODE *e;
5942
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005943 /* Shortcut for single character strings */
5944 if (PyUnicode_GET_SIZE(self) == 1 &&
5945 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005946 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005947
5948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005949 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005951
5952 e = p + PyUnicode_GET_SIZE(self);
5953 for (; p < e; p++) {
5954 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005955 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005957 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005958}
5959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005960PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005961"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005963Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005964False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965
5966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005967unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968{
5969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5970 register const Py_UNICODE *e;
5971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 /* Shortcut for single character strings */
5973 if (PyUnicode_GET_SIZE(self) == 1 &&
5974 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005977 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005978 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 e = p + PyUnicode_GET_SIZE(self);
5982 for (; p < e; p++) {
5983 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005984 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987}
5988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005989PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005990"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005992Return True if all characters in S are digits\n\
5993and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
5995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005996unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997{
5998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5999 register const Py_UNICODE *e;
6000
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 /* Shortcut for single character strings */
6002 if (PyUnicode_GET_SIZE(self) == 1 &&
6003 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006006 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006007 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006008 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006009
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 e = p + PyUnicode_GET_SIZE(self);
6011 for (; p < e; p++) {
6012 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006013 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006015 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006019"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006022False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
6024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006025unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
6027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6028 register const Py_UNICODE *e;
6029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Shortcut for single character strings */
6031 if (PyUnicode_GET_SIZE(self) == 1 &&
6032 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006033 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006035 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006036 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006037 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006038
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 e = p + PyUnicode_GET_SIZE(self);
6040 for (; p < e; p++) {
6041 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006042 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006044 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045}
6046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006047PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048"S.join(sequence) -> unicode\n\
6049\n\
6050Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006051sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052
6053static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006054unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006056 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060unicode_length(PyUnicodeObject *self)
6061{
6062 return self->length;
6063}
6064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006065PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006066"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067\n\
6068Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006069done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070
6071static PyObject *
6072unicode_ljust(PyUnicodeObject *self, PyObject *args)
6073{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006074 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006075 Py_UNICODE fillchar = ' ';
6076
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006077 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 return NULL;
6079
Tim Peters7a29bd52001-09-12 03:03:31 +00006080 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 Py_INCREF(self);
6082 return (PyObject*) self;
6083 }
6084
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006085 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086}
6087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006088PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089"S.lower() -> unicode\n\
6090\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092
6093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006094unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 return fixup(self, fixlower);
6097}
6098
/* Which end(s) of the string a strip operation works on.  The values
   double as indices into stripformat[] below. */
enum {
    LEFTSTRIP = 0,
    RIGHTSTRIP = 1,
    BOTHSTRIP = 2
};

/* PyArg_ParseTuple() format strings, indexed by the strip type above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6107
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006108/* externally visible for str.strip(unicode) */
6109PyObject *
6110_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6111{
6112 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006113 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006114 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006115 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6116 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006117
Thomas Wouters477c8d52006-05-27 19:21:47 +00006118 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6119
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006120 i = 0;
6121 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006122 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6123 i++;
6124 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006125 }
6126
6127 j = len;
6128 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006129 do {
6130 j--;
6131 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6132 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006133 }
6134
6135 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006136 Py_INCREF(self);
6137 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006138 }
6139 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006140 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006141}
6142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
6144static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006145do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006147 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006148 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006149
6150 i = 0;
6151 if (striptype != RIGHTSTRIP) {
6152 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6153 i++;
6154 }
6155 }
6156
6157 j = len;
6158 if (striptype != LEFTSTRIP) {
6159 do {
6160 j--;
6161 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6162 j++;
6163 }
6164
6165 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6166 Py_INCREF(self);
6167 return (PyObject*)self;
6168 }
6169 else
6170 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171}
6172
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006173
6174static PyObject *
6175do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6176{
6177 PyObject *sep = NULL;
6178
6179 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6180 return NULL;
6181
6182 if (sep != NULL && sep != Py_None) {
6183 if (PyUnicode_Check(sep))
6184 return _PyUnicode_XStrip(self, striptype, sep);
6185 else if (PyString_Check(sep)) {
6186 PyObject *res;
6187 sep = PyUnicode_FromObject(sep);
6188 if (sep==NULL)
6189 return NULL;
6190 res = _PyUnicode_XStrip(self, striptype, sep);
6191 Py_DECREF(sep);
6192 return res;
6193 }
6194 else {
6195 PyErr_Format(PyExc_TypeError,
6196 "%s arg must be None, unicode or str",
6197 STRIPNAME(striptype));
6198 return NULL;
6199 }
6200 }
6201
6202 return do_strip(self, striptype);
6203}
6204
6205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006206PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006207"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006208\n\
6209Return a copy of the string S with leading and trailing\n\
6210whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006211If chars is given and not None, remove characters in chars instead.\n\
6212If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006213
6214static PyObject *
6215unicode_strip(PyUnicodeObject *self, PyObject *args)
6216{
6217 if (PyTuple_GET_SIZE(args) == 0)
6218 return do_strip(self, BOTHSTRIP); /* Common case */
6219 else
6220 return do_argstrip(self, BOTHSTRIP, args);
6221}
6222
6223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006224PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006225"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006226\n\
6227Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006228If chars is given and not None, remove characters in chars instead.\n\
6229If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006230
6231static PyObject *
6232unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6233{
6234 if (PyTuple_GET_SIZE(args) == 0)
6235 return do_strip(self, LEFTSTRIP); /* Common case */
6236 else
6237 return do_argstrip(self, LEFTSTRIP, args);
6238}
6239
6240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006241PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006242"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243\n\
6244Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006245If chars is given and not None, remove characters in chars instead.\n\
6246If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006247
6248static PyObject *
6249unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6250{
6251 if (PyTuple_GET_SIZE(args) == 0)
6252 return do_strip(self, RIGHTSTRIP); /* Common case */
6253 else
6254 return do_argstrip(self, RIGHTSTRIP, args);
6255}
6256
6257
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006259unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
6261 PyUnicodeObject *u;
6262 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006263 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006264 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265
6266 if (len < 0)
6267 len = 0;
6268
Tim Peters7a29bd52001-09-12 03:03:31 +00006269 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 /* no repeat, return original string */
6271 Py_INCREF(str);
6272 return (PyObject*) str;
6273 }
Tim Peters8f422462000-09-09 06:13:41 +00006274
6275 /* ensure # of chars needed doesn't overflow int and # of bytes
6276 * needed doesn't overflow size_t
6277 */
6278 nchars = len * str->length;
6279 if (len && nchars / len != str->length) {
6280 PyErr_SetString(PyExc_OverflowError,
6281 "repeated string is too long");
6282 return NULL;
6283 }
6284 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6285 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6286 PyErr_SetString(PyExc_OverflowError,
6287 "repeated string is too long");
6288 return NULL;
6289 }
6290 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 if (!u)
6292 return NULL;
6293
6294 p = u->str;
6295
Thomas Wouters477c8d52006-05-27 19:21:47 +00006296 if (str->length == 1 && len > 0) {
6297 Py_UNICODE_FILL(p, str->str[0], len);
6298 } else {
6299 Py_ssize_t done = 0; /* number of characters copied this far */
6300 if (done < nchars) {
6301 Py_UNICODE_COPY(p, str->str, str->length);
6302 done = str->length;
6303 }
6304 while (done < nchars) {
6305 int n = (done <= nchars-done) ? done : nchars-done;
6306 Py_UNICODE_COPY(p+done, p, n);
6307 done += n;
6308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 }
6310
6311 return (PyObject*) u;
6312}
6313
6314PyObject *PyUnicode_Replace(PyObject *obj,
6315 PyObject *subobj,
6316 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006317 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318{
6319 PyObject *self;
6320 PyObject *str1;
6321 PyObject *str2;
6322 PyObject *result;
6323
6324 self = PyUnicode_FromObject(obj);
6325 if (self == NULL)
6326 return NULL;
6327 str1 = PyUnicode_FromObject(subobj);
6328 if (str1 == NULL) {
6329 Py_DECREF(self);
6330 return NULL;
6331 }
6332 str2 = PyUnicode_FromObject(replobj);
6333 if (str2 == NULL) {
6334 Py_DECREF(self);
6335 Py_DECREF(str1);
6336 return NULL;
6337 }
Tim Petersced69f82003-09-16 20:30:58 +00006338 result = replace((PyUnicodeObject *)self,
6339 (PyUnicodeObject *)str1,
6340 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 maxcount);
6342 Py_DECREF(self);
6343 Py_DECREF(str1);
6344 Py_DECREF(str2);
6345 return result;
6346}
6347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006348PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349"S.replace (old, new[, maxsplit]) -> unicode\n\
6350\n\
6351Return a copy of S with all occurrences of substring\n\
6352old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355static PyObject*
6356unicode_replace(PyUnicodeObject *self, PyObject *args)
6357{
6358 PyUnicodeObject *str1;
6359 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 PyObject *result;
6362
Martin v. Löwis18e16552006-02-15 17:27:45 +00006363 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 return NULL;
6365 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6366 if (str1 == NULL)
6367 return NULL;
6368 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006369 if (str2 == NULL) {
6370 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
6374 result = replace(self, str1, str2, maxcount);
6375
6376 Py_DECREF(str1);
6377 Py_DECREF(str2);
6378 return result;
6379}
6380
6381static
6382PyObject *unicode_repr(PyObject *unicode)
6383{
6384 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6385 PyUnicode_GET_SIZE(unicode),
6386 1);
6387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390"S.rfind(sub [,start [,end]]) -> int\n\
6391\n\
6392Return the highest index in S where substring sub is found,\n\
6393such that sub is contained within s[start,end]. Optional\n\
6394arguments start and end are interpreted as in slice notation.\n\
6395\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398static PyObject *
6399unicode_rfind(PyUnicodeObject *self, PyObject *args)
6400{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006401 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006403 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Guido van Rossumb8872e62000-05-09 14:14:27 +00006406 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6407 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006409 substring = PyUnicode_FromObject(substring);
6410 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 return NULL;
6412
Thomas Wouters477c8d52006-05-27 19:21:47 +00006413 result = stringlib_rfind_slice(
6414 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6415 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6416 start, end
6417 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
6419 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006420
6421 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422}
6423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006424PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425"S.rindex(sub [,start [,end]]) -> int\n\
6426\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
6429static PyObject *
6430unicode_rindex(PyUnicodeObject *self, PyObject *args)
6431{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006432 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006433 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006434 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006435 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
Guido van Rossumb8872e62000-05-09 14:14:27 +00006437 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6438 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006440 substring = PyUnicode_FromObject(substring);
6441 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 return NULL;
6443
Thomas Wouters477c8d52006-05-27 19:21:47 +00006444 result = stringlib_rfind_slice(
6445 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6446 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6447 start, end
6448 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449
6450 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006451
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 if (result < 0) {
6453 PyErr_SetString(PyExc_ValueError, "substring not found");
6454 return NULL;
6455 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457}
6458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006459PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006460"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461\n\
6462Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006463done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465static PyObject *
6466unicode_rjust(PyUnicodeObject *self, PyObject *args)
6467{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006468 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006469 Py_UNICODE fillchar = ' ';
6470
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006471 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 return NULL;
6473
Tim Peters7a29bd52001-09-12 03:03:31 +00006474 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 Py_INCREF(self);
6476 return (PyObject*) self;
6477 }
6478
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006479 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006483unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
6485 /* standard clamping */
6486 if (start < 0)
6487 start = 0;
6488 if (end < 0)
6489 end = 0;
6490 if (end > self->length)
6491 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006492 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 /* full slice, return original string */
6494 Py_INCREF(self);
6495 return (PyObject*) self;
6496 }
6497 if (start > end)
6498 start = end;
6499 /* copy slice */
6500 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6501 end - start);
6502}
6503
6504PyObject *PyUnicode_Split(PyObject *s,
6505 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006506 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507{
6508 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006509
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 s = PyUnicode_FromObject(s);
6511 if (s == NULL)
6512 return NULL;
6513 if (sep != NULL) {
6514 sep = PyUnicode_FromObject(sep);
6515 if (sep == NULL) {
6516 Py_DECREF(s);
6517 return NULL;
6518 }
6519 }
6520
6521 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6522
6523 Py_DECREF(s);
6524 Py_XDECREF(sep);
6525 return result;
6526}
6527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006528PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529"S.split([sep [,maxsplit]]) -> list of strings\n\
6530\n\
6531Return a list of the words in S, using sep as the\n\
6532delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006533splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006534any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535
6536static PyObject*
6537unicode_split(PyUnicodeObject *self, PyObject *args)
6538{
6539 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541
Martin v. Löwis18e16552006-02-15 17:27:45 +00006542 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 return NULL;
6544
6545 if (substring == Py_None)
6546 return split(self, NULL, maxcount);
6547 else if (PyUnicode_Check(substring))
6548 return split(self, (PyUnicodeObject *)substring, maxcount);
6549 else
6550 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6551}
6552
Thomas Wouters477c8d52006-05-27 19:21:47 +00006553PyObject *
6554PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6555{
6556 PyObject* str_obj;
6557 PyObject* sep_obj;
6558 PyObject* out;
6559
6560 str_obj = PyUnicode_FromObject(str_in);
6561 if (!str_obj)
6562 return NULL;
6563 sep_obj = PyUnicode_FromObject(sep_in);
6564 if (!sep_obj) {
6565 Py_DECREF(str_obj);
6566 return NULL;
6567 }
6568
6569 out = stringlib_partition(
6570 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6571 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6572 );
6573
6574 Py_DECREF(sep_obj);
6575 Py_DECREF(str_obj);
6576
6577 return out;
6578}
6579
6580
6581PyObject *
6582PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6583{
6584 PyObject* str_obj;
6585 PyObject* sep_obj;
6586 PyObject* out;
6587
6588 str_obj = PyUnicode_FromObject(str_in);
6589 if (!str_obj)
6590 return NULL;
6591 sep_obj = PyUnicode_FromObject(sep_in);
6592 if (!sep_obj) {
6593 Py_DECREF(str_obj);
6594 return NULL;
6595 }
6596
6597 out = stringlib_rpartition(
6598 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6599 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6600 );
6601
6602 Py_DECREF(sep_obj);
6603 Py_DECREF(str_obj);
6604
6605 return out;
6606}
6607
6608PyDoc_STRVAR(partition__doc__,
6609"S.partition(sep) -> (head, sep, tail)\n\
6610\n\
6611Searches for the separator sep in S, and returns the part before it,\n\
6612the separator itself, and the part after it. If the separator is not\n\
6613found, returns S and two empty strings.");
6614
6615static PyObject*
6616unicode_partition(PyUnicodeObject *self, PyObject *separator)
6617{
6618 return PyUnicode_Partition((PyObject *)self, separator);
6619}
6620
6621PyDoc_STRVAR(rpartition__doc__,
6622"S.rpartition(sep) -> (head, sep, tail)\n\
6623\n\
6624Searches for the separator sep in S, starting at the end of S, and returns\n\
6625the part before it, the separator itself, and the part after it. If the\n\
6626separator is not found, returns S and two empty strings.");
6627
6628static PyObject*
6629unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6630{
6631 return PyUnicode_RPartition((PyObject *)self, separator);
6632}
6633
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006634PyObject *PyUnicode_RSplit(PyObject *s,
6635 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006636 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006637{
6638 PyObject *result;
6639
6640 s = PyUnicode_FromObject(s);
6641 if (s == NULL)
6642 return NULL;
6643 if (sep != NULL) {
6644 sep = PyUnicode_FromObject(sep);
6645 if (sep == NULL) {
6646 Py_DECREF(s);
6647 return NULL;
6648 }
6649 }
6650
6651 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6652
6653 Py_DECREF(s);
6654 Py_XDECREF(sep);
6655 return result;
6656}
6657
6658PyDoc_STRVAR(rsplit__doc__,
6659"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6660\n\
6661Return a list of the words in S, using sep as the\n\
6662delimiter string, starting at the end of the string and\n\
6663working to the front. If maxsplit is given, at most maxsplit\n\
6664splits are done. If sep is not specified, any whitespace string\n\
6665is a separator.");
6666
6667static PyObject*
6668unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6669{
6670 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006671 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006672
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006674 return NULL;
6675
6676 if (substring == Py_None)
6677 return rsplit(self, NULL, maxcount);
6678 else if (PyUnicode_Check(substring))
6679 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6680 else
6681 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6682}
6683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006684PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006685"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686\n\
6687Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006688Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006689is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
6691static PyObject*
6692unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6693{
Guido van Rossum86662912000-04-11 15:38:46 +00006694 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695
Guido van Rossum86662912000-04-11 15:38:46 +00006696 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 return NULL;
6698
Guido van Rossum86662912000-04-11 15:38:46 +00006699 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700}
6701
6702static
6703PyObject *unicode_str(PyUnicodeObject *self)
6704{
Fred Drakee4315f52000-05-09 19:53:39 +00006705 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706}
6707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006708PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709"S.swapcase() -> unicode\n\
6710\n\
6711Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006715unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 return fixup(self, fixswapcase);
6718}
6719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721"S.translate(table) -> unicode\n\
6722\n\
6723Return a copy of the string S, where all characters have been mapped\n\
6724through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006725Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6726Unmapped characters are left untouched. Characters mapped to None\n\
6727are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
6729static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006730unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731{
Tim Petersced69f82003-09-16 20:30:58 +00006732 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006734 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 "ignore");
6736}
6737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006738PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739"S.upper() -> unicode\n\
6740\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
6743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006744unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 return fixup(self, fixupper);
6747}
6748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006749PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750"S.zfill(width) -> unicode\n\
6751\n\
6752Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006753of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754
6755static PyObject *
6756unicode_zfill(PyUnicodeObject *self, PyObject *args)
6757{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006758 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 PyUnicodeObject *u;
6760
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 Py_ssize_t width;
6762 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 return NULL;
6764
6765 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006766 if (PyUnicode_CheckExact(self)) {
6767 Py_INCREF(self);
6768 return (PyObject*) self;
6769 }
6770 else
6771 return PyUnicode_FromUnicode(
6772 PyUnicode_AS_UNICODE(self),
6773 PyUnicode_GET_SIZE(self)
6774 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 }
6776
6777 fill = width - self->length;
6778
6779 u = pad(self, fill, 0, '0');
6780
Walter Dörwald068325e2002-04-15 13:36:47 +00006781 if (u == NULL)
6782 return NULL;
6783
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 if (u->str[fill] == '+' || u->str[fill] == '-') {
6785 /* move sign to beginning of string */
6786 u->str[0] = u->str[fill];
6787 u->str[fill] = '0';
6788 }
6789
6790 return (PyObject*) u;
6791}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792
#if 0
/* Compiled out: would return the value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006801PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006804Return True if S starts with the specified prefix, False otherwise.\n\
6805With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806With optional end, stop comparing S at that position.\n\
6807prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808
6809static PyObject *
6810unicode_startswith(PyUnicodeObject *self,
6811 PyObject *args)
6812{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006816 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006820 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822 if (PyTuple_Check(subobj)) {
6823 Py_ssize_t i;
6824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6825 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6826 PyTuple_GET_ITEM(subobj, i));
6827 if (substring == NULL)
6828 return NULL;
6829 result = tailmatch(self, substring, start, end, -1);
6830 Py_DECREF(substring);
6831 if (result) {
6832 Py_RETURN_TRUE;
6833 }
6834 }
6835 /* nothing matched */
6836 Py_RETURN_FALSE;
6837 }
6838 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006840 return NULL;
6841 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844}
6845
6846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006847PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006850Return True if S ends with the specified suffix, False otherwise.\n\
6851With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006852With optional end, stop comparing S at that position.\n\
6853suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
6855static PyObject *
6856unicode_endswith(PyUnicodeObject *self,
6857 PyObject *args)
6858{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006862 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006865 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6866 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 if (PyTuple_Check(subobj)) {
6869 Py_ssize_t i;
6870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6871 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6872 PyTuple_GET_ITEM(subobj, i));
6873 if (substring == NULL)
6874 return NULL;
6875 result = tailmatch(self, substring, start, end, +1);
6876 Py_DECREF(substring);
6877 if (result) {
6878 Py_RETURN_TRUE;
6879 }
6880 }
6881 Py_RETURN_FALSE;
6882 }
6883 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006889 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890}
6891
6892
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006893
6894static PyObject *
6895unicode_getnewargs(PyUnicodeObject *v)
6896{
6897 return Py_BuildValue("(u#)", v->str, v->length);
6898}
6899
6900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901static PyMethodDef unicode_methods[] = {
6902
6903 /* Order is according to common usage: often used methods should
6904 appear first, since lookup is done sequentially. */
6905
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006906 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6907 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6908 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006909 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006910 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6911 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6912 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6913 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6914 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6915 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6916 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00006917 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006918 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6919 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6920 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006921 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006922 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006923/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6924 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6925 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6926 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006927 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00006928 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006929 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006930 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006931 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6932 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6933 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6934 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6935 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6936 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6937 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6938 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6939 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6940 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6941 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6942 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6943 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6944 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006946#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006947 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948#endif
6949
6950#if 0
6951 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006952 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953#endif
6954
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006955 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 {NULL, NULL}
6957};
6958
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006959static PyObject *
6960unicode_mod(PyObject *v, PyObject *w)
6961{
6962 if (!PyUnicode_Check(v)) {
6963 Py_INCREF(Py_NotImplemented);
6964 return Py_NotImplemented;
6965 }
6966 return PyUnicode_Format(v, w);
6967}
6968
6969static PyNumberMethods unicode_as_number = {
6970 0, /*nb_add*/
6971 0, /*nb_subtract*/
6972 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006973 unicode_mod, /*nb_remainder*/
6974};
6975
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006977 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006978 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006979 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6980 (ssizeargfunc) unicode_getitem, /* sq_item */
6981 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 0, /* sq_ass_item */
6983 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006984 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985};
6986
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006987static PyObject*
6988unicode_subscript(PyUnicodeObject* self, PyObject* item)
6989{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006990 PyNumberMethods *nb = item->ob_type->tp_as_number;
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00006991 if (nb != NULL && nb->nb_index != NULL) {
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006992 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006993 if (i == -1 && PyErr_Occurred())
6994 return NULL;
6995 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006996 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006997 return unicode_getitem(self, i);
6998 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006999 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007000 Py_UNICODE* source_buf;
7001 Py_UNICODE* result_buf;
7002 PyObject* result;
7003
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007004 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007005 &start, &stop, &step, &slicelength) < 0) {
7006 return NULL;
7007 }
7008
7009 if (slicelength <= 0) {
7010 return PyUnicode_FromUnicode(NULL, 0);
7011 } else {
7012 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007013 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7014 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007015
7016 if (result_buf == NULL)
7017 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007018
7019 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7020 result_buf[i] = source_buf[cur];
7021 }
Tim Petersced69f82003-09-16 20:30:58 +00007022
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007023 result = PyUnicode_FromUnicode(result_buf, slicelength);
7024 PyMem_FREE(result_buf);
7025 return result;
7026 }
7027 } else {
7028 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7029 return NULL;
7030 }
7031}
7032
7033static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007034 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007035 (binaryfunc)unicode_subscript, /* mp_subscript */
7036 (objobjargproc)0, /* mp_ass_subscript */
7037};
7038
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007041 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 const void **ptr)
7043{
7044 if (index != 0) {
7045 PyErr_SetString(PyExc_SystemError,
7046 "accessing non-existent unicode segment");
7047 return -1;
7048 }
7049 *ptr = (void *) self->str;
7050 return PyUnicode_GET_DATA_SIZE(self);
7051}
7052
Martin v. Löwis18e16552006-02-15 17:27:45 +00007053static Py_ssize_t
7054unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 const void **ptr)
7056{
7057 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007058 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 return -1;
7060}
7061
7062static int
7063unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007064 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065{
7066 if (lenp)
7067 *lenp = PyUnicode_GET_DATA_SIZE(self);
7068 return 1;
7069}
7070
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007071static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007073 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 const void **ptr)
7075{
7076 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007077
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 if (index != 0) {
7079 PyErr_SetString(PyExc_SystemError,
7080 "accessing non-existent unicode segment");
7081 return -1;
7082 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007083 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 if (str == NULL)
7085 return -1;
7086 *ptr = (void *) PyString_AS_STRING(str);
7087 return PyString_GET_SIZE(str);
7088}
7089
7090/* Helpers for PyUnicode_Format() */
7091
7092static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007093getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007095 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 if (argidx < arglen) {
7097 (*p_argidx)++;
7098 if (arglen < 0)
7099 return args;
7100 else
7101 return PyTuple_GetItem(args, argidx);
7102 }
7103 PyErr_SetString(PyExc_TypeError,
7104 "not enough arguments for format string");
7105 return NULL;
7106}
7107
/* Bit flags collected while parsing a format specifier; one bit each. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
7113
Martin v. Löwis18e16552006-02-15 17:27:45 +00007114static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007115strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007117 register Py_ssize_t i;
7118 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 for (i = len - 1; i >= 0; i--)
7120 buffer[i] = (Py_UNICODE) charbuffer[i];
7121
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 return len;
7123}
7124
Neal Norwitzfc76d632006-01-10 06:03:13 +00007125static int
7126doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7127{
Tim Peters15231542006-02-16 01:08:01 +00007128 Py_ssize_t result;
7129
Neal Norwitzfc76d632006-01-10 06:03:13 +00007130 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007131 result = strtounicode(buffer, (char *)buffer);
7132 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007133}
7134
7135static int
7136longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7137{
Tim Peters15231542006-02-16 01:08:01 +00007138 Py_ssize_t result;
7139
Neal Norwitzfc76d632006-01-10 06:03:13 +00007140 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007141 result = strtounicode(buffer, (char *)buffer);
7142 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007143}
7144
Guido van Rossum078151d2002-08-11 04:24:12 +00007145/* XXX To save some code duplication, formatfloat/long/int could have been
7146 shared with stringobject.c, converting from 8-bit to Unicode after the
7147 formatting is done. */
7148
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149static int
7150formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007151 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 int flags,
7153 int prec,
7154 int type,
7155 PyObject *v)
7156{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007157 /* fmt = '%#.' + `prec` + `type`
7158 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 char fmt[20];
7160 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007161
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 x = PyFloat_AsDouble(v);
7163 if (x == -1.0 && PyErr_Occurred())
7164 return -1;
7165 if (prec < 0)
7166 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7168 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007169 /* Worst case length calc to ensure no buffer overrun:
7170
7171 'g' formats:
7172 fmt = %#.<prec>g
7173 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7174 for any double rep.)
7175 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7176
7177 'f' formats:
7178 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7179 len = 1 + 50 + 1 + prec = 52 + prec
7180
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007181 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007182 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007183
7184 */
7185 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7186 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007187 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007188 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007189 return -1;
7190 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007191 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7192 (flags&F_ALT) ? "#" : "",
7193 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007194 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195}
7196
Tim Peters38fd5b62000-09-21 05:43:11 +00007197static PyObject*
7198formatlong(PyObject *val, int flags, int prec, int type)
7199{
7200 char *buf;
7201 int i, len;
7202 PyObject *str; /* temporary string object. */
7203 PyUnicodeObject *result;
7204
7205 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7206 if (!str)
7207 return NULL;
7208 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007209 if (!result) {
7210 Py_DECREF(str);
7211 return NULL;
7212 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007213 for (i = 0; i < len; i++)
7214 result->str[i] = buf[i];
7215 result->str[len] = 0;
7216 Py_DECREF(str);
7217 return (PyObject*)result;
7218}
7219
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220static int
7221formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007222 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 int flags,
7224 int prec,
7225 int type,
7226 PyObject *v)
7227{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007228 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007229 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7230 * + 1 + 1
7231 * = 24
7232 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007233 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007234 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 long x;
7236
7237 x = PyInt_AsLong(v);
7238 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007239 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007240 if (x < 0 && type == 'u') {
7241 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007242 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007243 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7244 sign = "-";
7245 else
7246 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007248 prec = 1;
7249
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007250 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7251 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007252 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007253 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007254 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007255 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007256 return -1;
7257 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007258
7259 if ((flags & F_ALT) &&
7260 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007261 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007262 * of issues that cause pain:
7263 * - when 0 is being converted, the C standard leaves off
7264 * the '0x' or '0X', which is inconsistent with other
7265 * %#x/%#X conversions and inconsistent with Python's
7266 * hex() function
7267 * - there are platforms that violate the standard and
7268 * convert 0 with the '0x' or '0X'
7269 * (Metrowerks, Compaq Tru64)
7270 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007271 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007272 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007273 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007274 * We can achieve the desired consistency by inserting our
7275 * own '0x' or '0X' prefix, and substituting %x/%X in place
7276 * of %#x/%#X.
7277 *
7278 * Note that this is the same approach as used in
7279 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007280 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007281 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7282 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007283 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007284 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007285 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7286 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007287 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007288 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007289 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007290 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007291 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007292 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293}
7294
7295static int
7296formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007297 size_t buflen,
7298 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007300 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007301 if (PyUnicode_Check(v)) {
7302 if (PyUnicode_GET_SIZE(v) != 1)
7303 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007307 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007308 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007309 goto onError;
7310 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
7313 else {
7314 /* Integer input truncated to a character */
7315 long x;
7316 x = PyInt_AsLong(v);
7317 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007318 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007319#ifdef Py_UNICODE_WIDE
7320 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007321 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007322 "%c arg not in range(0x110000) "
7323 "(wide Python build)");
7324 return -1;
7325 }
7326#else
7327 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007328 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007329 "%c arg not in range(0x10000) "
7330 "(narrow Python build)");
7331 return -1;
7332 }
7333#endif
7334 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 }
7336 buf[1] = '\0';
7337 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007338
7339 onError:
7340 PyErr_SetString(PyExc_TypeError,
7341 "%c requires int or char");
7342 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343}
7344
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
7354
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355PyObject *PyUnicode_Format(PyObject *format,
7356 PyObject *args)
7357{
7358 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007359 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 int args_owned = 0;
7361 PyUnicodeObject *result = NULL;
7362 PyObject *dict = NULL;
7363 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 if (format == NULL || args == NULL) {
7366 PyErr_BadInternalCall();
7367 return NULL;
7368 }
7369 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007370 if (uformat == NULL)
7371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 fmt = PyUnicode_AS_UNICODE(uformat);
7373 fmtcnt = PyUnicode_GET_SIZE(uformat);
7374
7375 reslen = rescnt = fmtcnt + 100;
7376 result = _PyUnicode_New(reslen);
7377 if (result == NULL)
7378 goto onError;
7379 res = PyUnicode_AS_UNICODE(result);
7380
7381 if (PyTuple_Check(args)) {
7382 arglen = PyTuple_Size(args);
7383 argidx = 0;
7384 }
7385 else {
7386 arglen = -1;
7387 argidx = -2;
7388 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007389 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7390 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 dict = args;
7392
7393 while (--fmtcnt >= 0) {
7394 if (*fmt != '%') {
7395 if (--rescnt < 0) {
7396 rescnt = fmtcnt + 100;
7397 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007398 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7401 --rescnt;
7402 }
7403 *res++ = *fmt++;
7404 }
7405 else {
7406 /* Got a format specifier */
7407 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007408 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 Py_UNICODE c = '\0';
7411 Py_UNICODE fill;
7412 PyObject *v = NULL;
7413 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007414 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007416 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007417 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
7419 fmt++;
7420 if (*fmt == '(') {
7421 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007422 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 PyObject *key;
7424 int pcount = 1;
7425
7426 if (dict == NULL) {
7427 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007428 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 goto onError;
7430 }
7431 ++fmt;
7432 --fmtcnt;
7433 keystart = fmt;
7434 /* Skip over balanced parentheses */
7435 while (pcount > 0 && --fmtcnt >= 0) {
7436 if (*fmt == ')')
7437 --pcount;
7438 else if (*fmt == '(')
7439 ++pcount;
7440 fmt++;
7441 }
7442 keylen = fmt - keystart - 1;
7443 if (fmtcnt < 0 || pcount > 0) {
7444 PyErr_SetString(PyExc_ValueError,
7445 "incomplete format key");
7446 goto onError;
7447 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007448#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007449 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 then looked up since Python uses strings to hold
7451 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007452 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 key = PyUnicode_EncodeUTF8(keystart,
7454 keylen,
7455 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007456#else
7457 key = PyUnicode_FromUnicode(keystart, keylen);
7458#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 if (key == NULL)
7460 goto onError;
7461 if (args_owned) {
7462 Py_DECREF(args);
7463 args_owned = 0;
7464 }
7465 args = PyObject_GetItem(dict, key);
7466 Py_DECREF(key);
7467 if (args == NULL) {
7468 goto onError;
7469 }
7470 args_owned = 1;
7471 arglen = -1;
7472 argidx = -2;
7473 }
7474 while (--fmtcnt >= 0) {
7475 switch (c = *fmt++) {
7476 case '-': flags |= F_LJUST; continue;
7477 case '+': flags |= F_SIGN; continue;
7478 case ' ': flags |= F_BLANK; continue;
7479 case '#': flags |= F_ALT; continue;
7480 case '0': flags |= F_ZERO; continue;
7481 }
7482 break;
7483 }
7484 if (c == '*') {
7485 v = getnextarg(args, arglen, &argidx);
7486 if (v == NULL)
7487 goto onError;
7488 if (!PyInt_Check(v)) {
7489 PyErr_SetString(PyExc_TypeError,
7490 "* wants int");
7491 goto onError;
7492 }
7493 width = PyInt_AsLong(v);
7494 if (width < 0) {
7495 flags |= F_LJUST;
7496 width = -width;
7497 }
7498 if (--fmtcnt >= 0)
7499 c = *fmt++;
7500 }
7501 else if (c >= '0' && c <= '9') {
7502 width = c - '0';
7503 while (--fmtcnt >= 0) {
7504 c = *fmt++;
7505 if (c < '0' || c > '9')
7506 break;
7507 if ((width*10) / 10 != width) {
7508 PyErr_SetString(PyExc_ValueError,
7509 "width too big");
7510 goto onError;
7511 }
7512 width = width*10 + (c - '0');
7513 }
7514 }
7515 if (c == '.') {
7516 prec = 0;
7517 if (--fmtcnt >= 0)
7518 c = *fmt++;
7519 if (c == '*') {
7520 v = getnextarg(args, arglen, &argidx);
7521 if (v == NULL)
7522 goto onError;
7523 if (!PyInt_Check(v)) {
7524 PyErr_SetString(PyExc_TypeError,
7525 "* wants int");
7526 goto onError;
7527 }
7528 prec = PyInt_AsLong(v);
7529 if (prec < 0)
7530 prec = 0;
7531 if (--fmtcnt >= 0)
7532 c = *fmt++;
7533 }
7534 else if (c >= '0' && c <= '9') {
7535 prec = c - '0';
7536 while (--fmtcnt >= 0) {
7537 c = Py_CHARMASK(*fmt++);
7538 if (c < '0' || c > '9')
7539 break;
7540 if ((prec*10) / 10 != prec) {
7541 PyErr_SetString(PyExc_ValueError,
7542 "prec too big");
7543 goto onError;
7544 }
7545 prec = prec*10 + (c - '0');
7546 }
7547 }
7548 } /* prec */
7549 if (fmtcnt >= 0) {
7550 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 if (--fmtcnt >= 0)
7552 c = *fmt++;
7553 }
7554 }
7555 if (fmtcnt < 0) {
7556 PyErr_SetString(PyExc_ValueError,
7557 "incomplete format");
7558 goto onError;
7559 }
7560 if (c != '%') {
7561 v = getnextarg(args, arglen, &argidx);
7562 if (v == NULL)
7563 goto onError;
7564 }
7565 sign = 0;
7566 fill = ' ';
7567 switch (c) {
7568
7569 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007570 pbuf = formatbuf;
7571 /* presume that buffer length is at least 1 */
7572 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 len = 1;
7574 break;
7575
7576 case 's':
7577 case 'r':
7578 if (PyUnicode_Check(v) && c == 's') {
7579 temp = v;
7580 Py_INCREF(temp);
7581 }
7582 else {
7583 PyObject *unicode;
7584 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007585 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 else
7587 temp = PyObject_Repr(v);
7588 if (temp == NULL)
7589 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007590 if (PyUnicode_Check(temp))
7591 /* nothing to do */;
7592 else if (PyString_Check(temp)) {
7593 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007594 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007596 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007598 Py_DECREF(temp);
7599 temp = unicode;
7600 if (temp == NULL)
7601 goto onError;
7602 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007603 else {
7604 Py_DECREF(temp);
7605 PyErr_SetString(PyExc_TypeError,
7606 "%s argument has non-string str()");
7607 goto onError;
7608 }
7609 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007610 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 len = PyUnicode_GET_SIZE(temp);
7612 if (prec >= 0 && len > prec)
7613 len = prec;
7614 break;
7615
7616 case 'i':
7617 case 'd':
7618 case 'u':
7619 case 'o':
7620 case 'x':
7621 case 'X':
7622 if (c == 'i')
7623 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007624 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007625 temp = formatlong(v, flags, prec, c);
7626 if (!temp)
7627 goto onError;
7628 pbuf = PyUnicode_AS_UNICODE(temp);
7629 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007630 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007632 else {
7633 pbuf = formatbuf;
7634 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7635 flags, prec, c, v);
7636 if (len < 0)
7637 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007638 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007639 }
7640 if (flags & F_ZERO)
7641 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 break;
7643
7644 case 'e':
7645 case 'E':
7646 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007647 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 case 'g':
7649 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007650 if (c == 'F')
7651 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007652 pbuf = formatbuf;
7653 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7654 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 if (len < 0)
7656 goto onError;
7657 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007658 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659 fill = '0';
7660 break;
7661
7662 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007663 pbuf = formatbuf;
7664 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 if (len < 0)
7666 goto onError;
7667 break;
7668
7669 default:
7670 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007671 "unsupported format character '%c' (0x%x) "
7672 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007673 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007674 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007675 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 goto onError;
7677 }
7678 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007679 if (*pbuf == '-' || *pbuf == '+') {
7680 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 len--;
7682 }
7683 else if (flags & F_SIGN)
7684 sign = '+';
7685 else if (flags & F_BLANK)
7686 sign = ' ';
7687 else
7688 sign = 0;
7689 }
7690 if (width < len)
7691 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007692 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 reslen -= rescnt;
7694 rescnt = width + fmtcnt + 100;
7695 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007696 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007697 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007698 PyErr_NoMemory();
7699 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007700 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007701 if (_PyUnicode_Resize(&result, reslen) < 0) {
7702 Py_XDECREF(temp);
7703 goto onError;
7704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 res = PyUnicode_AS_UNICODE(result)
7706 + reslen - rescnt;
7707 }
7708 if (sign) {
7709 if (fill != ' ')
7710 *res++ = sign;
7711 rescnt--;
7712 if (width > len)
7713 width--;
7714 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007715 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7716 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007717 assert(pbuf[1] == c);
7718 if (fill != ' ') {
7719 *res++ = *pbuf++;
7720 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007721 }
Tim Petersfff53252001-04-12 18:38:48 +00007722 rescnt -= 2;
7723 width -= 2;
7724 if (width < 0)
7725 width = 0;
7726 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 if (width > len && !(flags & F_LJUST)) {
7729 do {
7730 --rescnt;
7731 *res++ = fill;
7732 } while (--width > len);
7733 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007734 if (fill == ' ') {
7735 if (sign)
7736 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007737 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007738 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007739 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007740 *res++ = *pbuf++;
7741 *res++ = *pbuf++;
7742 }
7743 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007744 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 res += len;
7746 rescnt -= len;
7747 while (--width >= len) {
7748 --rescnt;
7749 *res++ = ' ';
7750 }
7751 if (dict && (argidx < arglen) && c != '%') {
7752 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007753 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007754 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 goto onError;
7756 }
7757 Py_XDECREF(temp);
7758 } /* '%' */
7759 } /* until end */
7760 if (argidx < arglen && !dict) {
7761 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007762 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 goto onError;
7764 }
7765
Thomas Woutersa96affe2006-03-12 00:29:36 +00007766 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7767 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 if (args_owned) {
7769 Py_DECREF(args);
7770 }
7771 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 return (PyObject *)result;
7773
7774 onError:
7775 Py_XDECREF(result);
7776 Py_DECREF(uformat);
7777 if (args_owned) {
7778 Py_DECREF(args);
7779 }
7780 return NULL;
7781}
7782
7783static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007784 (readbufferproc) unicode_buffer_getreadbuf,
7785 (writebufferproc) unicode_buffer_getwritebuf,
7786 (segcountproc) unicode_buffer_getsegcount,
7787 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788};
7789
Jeremy Hylton938ace62002-07-17 16:30:39 +00007790static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007791unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7792
Tim Peters6d6c1a32001-08-02 04:15:00 +00007793static PyObject *
7794unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7795{
7796 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007797 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007798 char *encoding = NULL;
7799 char *errors = NULL;
7800
Guido van Rossume023fe02001-08-30 03:12:59 +00007801 if (type != &PyUnicode_Type)
7802 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007803 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7804 kwlist, &x, &encoding, &errors))
7805 return NULL;
7806 if (x == NULL)
7807 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007808 if (encoding == NULL && errors == NULL)
7809 return PyObject_Unicode(x);
7810 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007811 return PyUnicode_FromEncodedObject(x, encoding, errors);
7812}
7813
Guido van Rossume023fe02001-08-30 03:12:59 +00007814static PyObject *
7815unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7816{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007817 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007818 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007819
7820 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7821 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7822 if (tmp == NULL)
7823 return NULL;
7824 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007825 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007826 if (pnew == NULL) {
7827 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007828 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007829 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007830 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7831 if (pnew->str == NULL) {
7832 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007833 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007834 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007835 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007836 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007837 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7838 pnew->length = n;
7839 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007840 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007841 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007842}
7843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007844PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007845"unicode(string [, encoding[, errors]]) -> object\n\
7846\n\
7847Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007848encoding defaults to the current default string encoding.\n\
7849errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007850
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851PyTypeObject PyUnicode_Type = {
7852 PyObject_HEAD_INIT(&PyType_Type)
7853 0, /* ob_size */
7854 "unicode", /* tp_name */
7855 sizeof(PyUnicodeObject), /* tp_size */
7856 0, /* tp_itemsize */
7857 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007858 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007860 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 0, /* tp_setattr */
7862 (cmpfunc) unicode_compare, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007863 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007864 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007866 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 (hashfunc) unicode_hash, /* tp_hash*/
7868 0, /* tp_call*/
7869 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007870 PyObject_GenericGetAttr, /* tp_getattro */
7871 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00007873 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007874 unicode_doc, /* tp_doc */
7875 0, /* tp_traverse */
7876 0, /* tp_clear */
7877 0, /* tp_richcompare */
7878 0, /* tp_weaklistoffset */
7879 0, /* tp_iter */
7880 0, /* tp_iternext */
7881 unicode_methods, /* tp_methods */
7882 0, /* tp_members */
7883 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007884 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007885 0, /* tp_dict */
7886 0, /* tp_descr_get */
7887 0, /* tp_descr_set */
7888 0, /* tp_dictoffset */
7889 0, /* tp_init */
7890 0, /* tp_alloc */
7891 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007892 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893};
7894
7895/* Initialize the Unicode implementation */
7896
Thomas Wouters78890102000-07-22 19:25:51 +00007897void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007899 int i;
7900
Thomas Wouters477c8d52006-05-27 19:21:47 +00007901 /* XXX - move this array to unicodectype.c ? */
7902 Py_UNICODE linebreak[] = {
7903 0x000A, /* LINE FEED */
7904 0x000D, /* CARRIAGE RETURN */
7905 0x001C, /* FILE SEPARATOR */
7906 0x001D, /* GROUP SEPARATOR */
7907 0x001E, /* RECORD SEPARATOR */
7908 0x0085, /* NEXT LINE */
7909 0x2028, /* LINE SEPARATOR */
7910 0x2029, /* PARAGRAPH SEPARATOR */
7911 };
7912
Fred Drakee4315f52000-05-09 19:53:39 +00007913 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007914 unicode_freelist = NULL;
7915 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007917 if (!unicode_empty)
7918 return;
7919
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007920 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007921 for (i = 0; i < 256; i++)
7922 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007923 if (PyType_Ready(&PyUnicode_Type) < 0)
7924 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007925
7926 /* initialize the linebreak bloom filter */
7927 bloom_linebreak = make_bloom_mask(
7928 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7929 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007930
7931 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932}
7933
7934/* Finalize the Unicode implementation */
7935
7936void
Thomas Wouters78890102000-07-22 19:25:51 +00007937_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007939 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007940 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007942 Py_XDECREF(unicode_empty);
7943 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007944
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007945 for (i = 0; i < 256; i++) {
7946 if (unicode_latin1[i]) {
7947 Py_DECREF(unicode_latin1[i]);
7948 unicode_latin1[i] = NULL;
7949 }
7950 }
7951
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007952 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 PyUnicodeObject *v = u;
7954 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007955 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007956 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007957 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007958 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007960 unicode_freelist = NULL;
7961 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007963
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007964#ifdef __cplusplus
7965}
7966#endif
7967
7968
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007969/*
7970Local variables:
7971c-basic-offset: 4
7972indent-tabs-mode: nil
7973End:
7974*/