blob: b46093e5d4b485b04ba41d747333b39411879caa [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
131#define BLOOM_MASK unsigned long
132
133static BLOOM_MASK bloom_linebreak;
134
135#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
136
137#define BLOOM_LINEBREAK(ch)\
138 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Membership test with a bloom pre-filter: the cheap BLOOM() bit test
   runs first and the exact unicode_member() scan only when it passes.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a Unicode object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldd2034312007-05-18 16:29:38 +0000396PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000397{
398 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000399 /* If the Unicode data is known at construction time, we can apply
400 some optimizations which share commonly used objects. */
401 if (u != NULL) {
402
403 /* Optimization for empty strings */
404 if (size == 0 && unicode_empty != NULL) {
405 Py_INCREF(unicode_empty);
406 return (PyObject *)unicode_empty;
407 }
408
Walter Dörwald071b9da2007-05-05 14:21:20 +0000409 /* Single characters are shared when using this constructor */
410 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000411 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000412 if (!unicode) {
413 unicode = _PyUnicode_New(1);
414 if (!unicode)
415 return NULL;
416 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 }
419 Py_INCREF(unicode);
420 return (PyObject *)unicode;
421 }
422 }
423
Walter Dörwald55507312007-05-18 13:12:10 +0000424 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000425 if (!unicode)
426 return NULL;
427
428 /* Copy the Unicode data into the new object */
429 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000430 Py_UNICODE *p = unicode->str;
431 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 ;
433 }
434
435 return (PyObject *)unicode;
436}
437
Walter Dörwaldd2034312007-05-18 16:29:38 +0000438PyObject *PyUnicode_FromString(const char *u)
439{
440 size_t size = strlen(u);
441 if (size > PY_SSIZE_T_MAX) {
442 PyErr_SetString(PyExc_OverflowError, "input too long");
443 return NULL;
444 }
445
446 return PyUnicode_FromStringAndSize(u, size);
447}
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_WCHAR_H
450
451PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
456 if (w == NULL) {
457 PyErr_BadInternalCall();
458 return NULL;
459 }
460
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the wchar_t data into the new object */
466#ifdef HAVE_USABLE_WCHAR_T
467 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000468#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 {
470 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000471 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000473 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 *u++ = *w++;
475 }
476#endif
477
478 return (PyObject *)unicode;
479}
480
Walter Dörwaldd2034312007-05-18 16:29:38 +0000481#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
482
483PyObject *
484PyUnicode_FromFormatV(const char *format, va_list vargs)
485{
486 va_list count;
487 Py_ssize_t n = 0;
488 const char* f;
489 Py_UNICODE *s;
490 PyObject *string;
491 /* used by sprintf */
492 char buffer[21];
493 const char *copy;
494
495#ifdef VA_LIST_IS_ARRAY
496 Py_MEMCPY(count, vargs, sizeof(va_list));
497#else
498#ifdef __va_copy
499 __va_copy(count, vargs);
500#else
501 count = vargs;
502#endif
503#endif
504 /* step 1: figure out how large a buffer we need */
505 for (f = format; *f; f++) {
506 if (*f == '%') {
507 const char* p = f;
508 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
509 ;
510
511 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
512 * they don't affect the amount of space we reserve.
513 */
514 if ((*f == 'l' || *f == 'z') &&
515 (f[1] == 'd' || f[1] == 'u'))
516 ++f;
517
518 switch (*f) {
519 case 'c':
520 (void)va_arg(count, int);
521 /* fall through... */
522 case '%':
523 n++;
524 break;
525 case 'd': case 'u': case 'i': case 'x':
526 (void) va_arg(count, int);
527 /* 20 bytes is enough to hold a 64-bit
528 integer. Decimal takes the most space.
529 This isn't enough for octal. */
530 n += 20;
531 break;
532 case 's':
533 n += strlen(va_arg(count, char*));
534 break;
535 case 'U':
536 {
537 PyObject *obj = va_arg(count, PyObject *);
538 assert(obj && PyUnicode_Check(obj));
539 n += PyUnicode_GET_SIZE(obj);
540 break;
541 }
542 case 'p':
543 (void) va_arg(count, int);
544 /* maximum 64-bit pointer representation:
545 * 0xffffffffffffffff
546 * so 19 characters is enough.
547 * XXX I count 18 -- what's the extra for?
548 */
549 n += 19;
550 break;
551 default:
552 /* if we stumble upon an unknown
553 formatting code, copy the rest of
554 the format string to the output
555 string. (we cannot just skip the
556 code, since there's no way to know
557 what's in the argument list) */
558 n += strlen(p);
559 goto expand;
560 }
561 } else
562 n++;
563 }
564 expand:
565 /* step 2: fill the buffer */
566 /* Since we've analyzed how much space we need for the worst case,
567 we don't have to resize the string. */
568 string = PyUnicode_FromUnicode(NULL, n);
569 if (!string)
570 return NULL;
571
572 s = PyUnicode_AS_UNICODE(string);
573
574 for (f = format; *f; f++) {
575 if (*f == '%') {
576 const char* p = f++;
577 int longflag = 0;
578 int size_tflag = 0;
579 /* parse the width.precision part (we're only
580 interested in the precision value, if any) */
581 n = 0;
582 while (isdigit(Py_CHARMASK(*f)))
583 n = (n*10) + *f++ - '0';
584 if (*f == '.') {
585 f++;
586 n = 0;
587 while (isdigit(Py_CHARMASK(*f)))
588 n = (n*10) + *f++ - '0';
589 }
590 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
591 f++;
592 /* handle the long flag, but only for %ld and %lu.
593 others can be added when necessary. */
594 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
595 longflag = 1;
596 ++f;
597 }
598 /* handle the size_t flag. */
599 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
600 size_tflag = 1;
601 ++f;
602 }
603
604 switch (*f) {
605 case 'c':
606 *s++ = va_arg(vargs, int);
607 break;
608 case 'd':
609 if (longflag)
610 sprintf(buffer, "%ld", va_arg(vargs, long));
611 else if (size_tflag)
612 sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
613 va_arg(vargs, Py_ssize_t));
614 else
615 sprintf(buffer, "%d", va_arg(vargs, int));
616 appendstring(buffer);
617 break;
618 case 'u':
619 if (longflag)
620 sprintf(buffer, "%lu",
621 va_arg(vargs, unsigned long));
622 else if (size_tflag)
623 sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
624 va_arg(vargs, size_t));
625 else
626 sprintf(buffer, "%u",
627 va_arg(vargs, unsigned int));
628 appendstring(buffer);
629 break;
630 case 'i':
631 sprintf(buffer, "%i", va_arg(vargs, int));
632 appendstring(buffer);
633 break;
634 case 'x':
635 sprintf(buffer, "%x", va_arg(vargs, int));
636 appendstring(buffer);
637 break;
638 case 's':
639 p = va_arg(vargs, char*);
640 appendstring(p);
641 break;
642 case 'U':
643 {
644 PyObject *obj = va_arg(vargs, PyObject *);
645 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
646 Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
647 Py_ssize_t upos;
648 for (upos = 0; upos<usize;)
649 *s++ = ucopy[upos++];
650 break;
651 }
652 case 'p':
653 sprintf(buffer, "%p", va_arg(vargs, void*));
654 /* %p is ill-defined: ensure leading 0x. */
655 if (buffer[1] == 'X')
656 buffer[1] = 'x';
657 else if (buffer[1] != 'x') {
658 memmove(buffer+2, buffer, strlen(buffer)+1);
659 buffer[0] = '0';
660 buffer[1] = 'x';
661 }
662 appendstring(buffer);
663 break;
664 case '%':
665 *s++ = '%';
666 break;
667 default:
668 appendstring(p);
669 goto end;
670 }
671 } else
672 *s++ = *f;
673 }
674
675 end:
676 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
677 return string;
678}
679
680#undef appendstring
681
682PyObject *
683PyUnicode_FromFormat(const char *format, ...)
684{
685 PyObject* ret;
686 va_list vargs;
687
688#ifdef HAVE_STDARG_PROTOTYPES
689 va_start(vargs, format);
690#else
691 va_start(vargs);
692#endif
693 ret = PyUnicode_FromFormatV(format, vargs);
694 va_end(vargs);
695 return ret;
696}
697
Martin v. Löwis18e16552006-02-15 17:27:45 +0000698Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
699 wchar_t *w,
700 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701{
702 if (unicode == NULL) {
703 PyErr_BadInternalCall();
704 return -1;
705 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000706
707 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000709 size = PyUnicode_GET_SIZE(unicode) + 1;
710
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711#ifdef HAVE_USABLE_WCHAR_T
712 memcpy(w, unicode->str, size * sizeof(wchar_t));
713#else
714 {
715 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000716 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000718 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 *w++ = *u++;
720 }
721#endif
722
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000723 if (size > PyUnicode_GET_SIZE(unicode))
724 return PyUnicode_GET_SIZE(unicode);
725 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 return size;
727}
728
729#endif
730
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000731PyObject *PyUnicode_FromOrdinal(int ordinal)
732{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000733 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000734
735#ifdef Py_UNICODE_WIDE
736 if (ordinal < 0 || ordinal > 0x10ffff) {
737 PyErr_SetString(PyExc_ValueError,
738 "unichr() arg not in range(0x110000) "
739 "(wide Python build)");
740 return NULL;
741 }
742#else
743 if (ordinal < 0 || ordinal > 0xffff) {
744 PyErr_SetString(PyExc_ValueError,
745 "unichr() arg not in range(0x10000) "
746 "(narrow Python build)");
747 return NULL;
748 }
749#endif
750
Hye-Shik Chang40574832004-04-06 07:24:51 +0000751 s[0] = (Py_UNICODE)ordinal;
752 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000753}
754
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755PyObject *PyUnicode_FromObject(register PyObject *obj)
756{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000757 /* XXX Perhaps we should make this API an alias of
758 PyObject_Unicode() instead ?! */
759 if (PyUnicode_CheckExact(obj)) {
760 Py_INCREF(obj);
761 return obj;
762 }
763 if (PyUnicode_Check(obj)) {
764 /* For a Unicode subtype that's not a Unicode object,
765 return a true Unicode object with the same data. */
766 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
767 PyUnicode_GET_SIZE(obj));
768 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000769 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
770}
771
772PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
773 const char *encoding,
774 const char *errors)
775{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000776 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000777 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000778 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000779
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 if (obj == NULL) {
781 PyErr_BadInternalCall();
782 return NULL;
783 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000784
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000785#if 0
786 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000787 that no encodings is given and then redirect to
788 PyObject_Unicode() which then applies the additional logic for
789 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000790
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000791 NOTE: This API should really only be used for object which
792 represent *encoded* Unicode !
793
794 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000795 if (PyUnicode_Check(obj)) {
796 if (encoding) {
797 PyErr_SetString(PyExc_TypeError,
798 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000799 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000800 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000801 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000802 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000803#else
804 if (PyUnicode_Check(obj)) {
805 PyErr_SetString(PyExc_TypeError,
806 "decoding Unicode is not supported");
807 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000808 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000809#endif
810
811 /* Coerce object */
812 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000813 s = PyString_AS_STRING(obj);
814 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000815 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000816 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
817 /* Overwrite the error message with something more useful in
818 case of a TypeError. */
819 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000820 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000821 "coercing to Unicode: need string or buffer, "
822 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000823 obj->ob_type->tp_name);
824 goto onError;
825 }
Tim Petersced69f82003-09-16 20:30:58 +0000826
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000827 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828 if (len == 0) {
829 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000830 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 }
Tim Petersced69f82003-09-16 20:30:58 +0000832 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000833 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000834
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000835 return v;
836
837 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839}
840
841PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000842 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843 const char *encoding,
844 const char *errors)
845{
846 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847
848 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000849 encoding = PyUnicode_GetDefaultEncoding();
850
851 /* Shortcuts for common default encodings */
852 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000853 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000854 else if (strcmp(encoding, "latin-1") == 0)
855 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000856#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
857 else if (strcmp(encoding, "mbcs") == 0)
858 return PyUnicode_DecodeMBCS(s, size, errors);
859#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000860 else if (strcmp(encoding, "ascii") == 0)
861 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
863 /* Decode via the codec registry */
864 buffer = PyBuffer_FromMemory((void *)s, size);
865 if (buffer == NULL)
866 goto onError;
867 unicode = PyCodec_Decode(buffer, encoding, errors);
868 if (unicode == NULL)
869 goto onError;
870 if (!PyUnicode_Check(unicode)) {
871 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000872 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 unicode->ob_type->tp_name);
874 Py_DECREF(unicode);
875 goto onError;
876 }
877 Py_DECREF(buffer);
878 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000879
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 onError:
881 Py_XDECREF(buffer);
882 return NULL;
883}
884
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000885PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
886 const char *encoding,
887 const char *errors)
888{
889 PyObject *v;
890
891 if (!PyUnicode_Check(unicode)) {
892 PyErr_BadArgument();
893 goto onError;
894 }
895
896 if (encoding == NULL)
897 encoding = PyUnicode_GetDefaultEncoding();
898
899 /* Decode via the codec registry */
900 v = PyCodec_Decode(unicode, encoding, errors);
901 if (v == NULL)
902 goto onError;
903 return v;
904
905 onError:
906 return NULL;
907}
908
Guido van Rossumd57fd912000-03-10 22:53:23 +0000909PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000910 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 const char *encoding,
912 const char *errors)
913{
914 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916 unicode = PyUnicode_FromUnicode(s, size);
917 if (unicode == NULL)
918 return NULL;
919 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
920 Py_DECREF(unicode);
921 return v;
922}
923
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000924PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
925 const char *encoding,
926 const char *errors)
927{
928 PyObject *v;
929
930 if (!PyUnicode_Check(unicode)) {
931 PyErr_BadArgument();
932 goto onError;
933 }
934
935 if (encoding == NULL)
936 encoding = PyUnicode_GetDefaultEncoding();
937
938 /* Encode via the codec registry */
939 v = PyCodec_Encode(unicode, encoding, errors);
940 if (v == NULL)
941 goto onError;
942 return v;
943
944 onError:
945 return NULL;
946}
947
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
949 const char *encoding,
950 const char *errors)
951{
952 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000953
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954 if (!PyUnicode_Check(unicode)) {
955 PyErr_BadArgument();
956 goto onError;
957 }
Fred Drakee4315f52000-05-09 19:53:39 +0000958
Tim Petersced69f82003-09-16 20:30:58 +0000959 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000960 encoding = PyUnicode_GetDefaultEncoding();
961
962 /* Shortcuts for common default encodings */
963 if (errors == NULL) {
964 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000965 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000966 else if (strcmp(encoding, "latin-1") == 0)
967 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000968#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
969 else if (strcmp(encoding, "mbcs") == 0)
970 return PyUnicode_AsMBCSString(unicode);
971#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000972 else if (strcmp(encoding, "ascii") == 0)
973 return PyUnicode_AsASCIIString(unicode);
974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975
976 /* Encode via the codec registry */
977 v = PyCodec_Encode(unicode, encoding, errors);
978 if (v == NULL)
979 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000980 if (!PyBytes_Check(v)) {
981 if (PyString_Check(v)) {
982 /* Old codec, turn it into bytes */
983 PyObject *b = PyBytes_FromObject(v);
984 Py_DECREF(v);
985 return b;
986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000988 "encoder did not return a bytes object "
989 "(type=%.400s, encoding=%.20s, errors=%.20s)",
990 v->ob_type->tp_name,
991 encoding ? encoding : "NULL",
992 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993 Py_DECREF(v);
994 goto onError;
995 }
996 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000997
Guido van Rossumd57fd912000-03-10 22:53:23 +0000998 onError:
999 return NULL;
1000}
1001
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001002PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1003 const char *errors)
1004{
1005 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001006 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001007 if (v)
1008 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001009 if (errors != NULL)
1010 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1011 if (errors == NULL) {
1012 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1013 PyUnicode_GET_SIZE(unicode),
1014 NULL);
1015 }
1016 else {
1017 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1018 }
1019 if (!b)
1020 return NULL;
1021 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1022 PyBytes_Size(b));
1023 Py_DECREF(b);
1024 if (!errors) {
1025 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001026 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001027 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001028 return v;
1029}
1030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1032{
1033 if (!PyUnicode_Check(unicode)) {
1034 PyErr_BadArgument();
1035 goto onError;
1036 }
1037 return PyUnicode_AS_UNICODE(unicode);
1038
1039 onError:
1040 return NULL;
1041}
1042
Martin v. Löwis18e16552006-02-15 17:27:45 +00001043Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044{
1045 if (!PyUnicode_Check(unicode)) {
1046 PyErr_BadArgument();
1047 goto onError;
1048 }
1049 return PyUnicode_GET_SIZE(unicode);
1050
1051 onError:
1052 return -1;
1053}
1054
Thomas Wouters78890102000-07-22 19:25:51 +00001055const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001056{
1057 return unicode_default_encoding;
1058}
1059
1060int PyUnicode_SetDefaultEncoding(const char *encoding)
1061{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001062 if (strcmp(encoding, unicode_default_encoding) != 0) {
1063 PyErr_Format(PyExc_ValueError,
1064 "Can only set default encoding to %s",
1065 unicode_default_encoding);
1066 return -1;
1067 }
Fred Drakee4315f52000-05-09 19:53:39 +00001068 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001069}
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071/* error handling callback helper:
1072 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001073 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001074 and adjust various state variables.
1075 return 0 on success, -1 on error
1076*/
1077
1078static
1079int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1080 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1082 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001083{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001085
1086 PyObject *restuple = NULL;
1087 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1089 Py_ssize_t requiredsize;
1090 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001091 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 int res = -1;
1094
1095 if (*errorHandler == NULL) {
1096 *errorHandler = PyCodec_LookupError(errors);
1097 if (*errorHandler == NULL)
1098 goto onError;
1099 }
1100
1101 if (*exceptionObject == NULL) {
1102 *exceptionObject = PyUnicodeDecodeError_Create(
1103 encoding, input, insize, *startinpos, *endinpos, reason);
1104 if (*exceptionObject == NULL)
1105 goto onError;
1106 }
1107 else {
1108 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1109 goto onError;
1110 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1111 goto onError;
1112 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1113 goto onError;
1114 }
1115
1116 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1117 if (restuple == NULL)
1118 goto onError;
1119 if (!PyTuple_Check(restuple)) {
1120 PyErr_Format(PyExc_TypeError, &argparse[4]);
1121 goto onError;
1122 }
1123 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1124 goto onError;
1125 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001126 newpos = insize+newpos;
1127 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001128 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001129 goto onError;
1130 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001131
1132 /* need more space? (at least enough for what we
1133 have+the replacement+the rest of the string (starting
1134 at the new input position), so we won't have to check space
1135 when there are no errors in the rest of the string) */
1136 repptr = PyUnicode_AS_UNICODE(repunicode);
1137 repsize = PyUnicode_GET_SIZE(repunicode);
1138 requiredsize = *outpos + repsize + insize-newpos;
1139 if (requiredsize > outsize) {
1140 if (requiredsize<2*outsize)
1141 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001142 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001143 goto onError;
1144 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1145 }
1146 *endinpos = newpos;
1147 *inptr = input + newpos;
1148 Py_UNICODE_COPY(*outptr, repptr, repsize);
1149 *outptr += repsize;
1150 *outpos += repsize;
1151 /* we made it! */
1152 res = 0;
1153
1154 onError:
1155 Py_XDECREF(restuple);
1156 return res;
1157}
1158
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159/* --- UTF-7 Codec -------------------------------------------------------- */
1160
1161/* see RFC2152 for details */
1162
/* Classification table for code points 0..127, one entry per code point,
   laid out 16 entries per row.  It is consulted by the SPECIAL() macro,
   which treats class 1 as always special and classes 2 and 3 as special
   only when the corresponding encodeWS / encodeO flag is set. */
Tim Petersced69f82003-09-16 20:30:58 +00001163static
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001164char utf7_special[128] = {
1165 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1166 encoded:
1167 0 - not special
1168 1 - special
1169 2 - whitespace (optional)
1170 3 - RFC2152 Set O (optional) */
1171 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, /* 0x00 - 0x0F */
1172 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
1173 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, /* 0x20 - 0x2F */
1174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, /* 0x30 - 0x3F */
1175 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */
1176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, /* 0x50 - 0x5F */
1177 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */
1178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, /* 0x70 - 0x7F */
1179
1180};
1181
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001182/* Note: The comparison (c) <= 0 is a trick to work-around gcc
1183 warnings about the comparison always being false; since
1184 utf7_special[0] is 1, we can safely make that one comparison
1185 true */
1186
/* SPECIAL(c, encodeO, encodeWS): non-zero when c must not be written
   directly: c is outside 1..127, or its utf7_special[] class is 1, or
   class 2 with encodeWS set, or class 3 with encodeO set.
   Note: c is evaluated several times. */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001187#define SPECIAL(c, encodeO, encodeWS) \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001188 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001189 (encodeWS && (utf7_special[(c)] == 2)) || \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001190 (encodeO && (utf7_special[(c)] == 3)))
1191
/* B64(n): the base64 alphabet character for the low 6 bits of n. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001192#define B64(n) \
1193 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): non-zero when c is alphanumeric according to isalnum(),
   or is '+' or '/'.  NOTE(review): c is passed to isalnum() unchanged,
   so callers must make sure it is in the range isalnum() accepts. */
1194#define B64CHAR(c) \
1195 (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() -- '+' -> 62, '/' -> 63, c >= 'a' -> c - 71,
   c >= 'A' -> c - 65, anything else -> c + 4 (which maps '0'..'9' to
   52..61).  No validation is done; the result is only meaningful for
   characters accepted by B64CHAR(). */
1196#define UB64(c) \
1197 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1198 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001199
/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the topmost pending 6 bits as one base64 character through out and
   reduce bits by 6.  Modifies out and bits; ch itself is not changed.
   Expands to a bare while statement (not wrapped in do/while(0)). */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001200#define ENCODE(out, ch, bits) \
1201 while (bits >= 6) { \
1202 *out++ = B64(ch >> (bits-6)); \
1203 bits -= 6; \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001204 }
1205
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost pending 16-bit unit and reduce bits by 16.
   If surrogate is set the unit is dropped and the flag cleared; if the
   unit lies in 0xDC00..0xDFFF the flag is set, errmsg is assigned and
   control jumps to utf7Error; otherwise the unit is stored through out.
   The expansion requires a variable named errmsg and a label named
   utf7Error in the enclosing function. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001206#define DECODE(out, ch, bits, surrogate) \
1207 while (bits >= 16) { \
1208 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1209 bits -= 16; \
1210 if (surrogate) { \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001211 /* We have already generated an error for the high surrogate \
1212 so let's not bother seeing if the low surrogate is correct or not */ \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001213 surrogate = 0; \
1214 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001215 /* This is a surrogate pair. Unfortunately we can't represent \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001216 it in a 16-bit character */ \
1217 surrogate = 1; \
1218 errmsg = "code pairs are not supported"; \
1219 goto utf7Error; \
1220 } else { \
1221 *out++ = outCh; \
1222 } \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001223 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001224
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001225PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001226 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001227 const char *errors)
1228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001230 Py_ssize_t startinpos;
1231 Py_ssize_t endinpos;
1232 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001233 const char *e;
1234 PyUnicodeObject *unicode;
1235 Py_UNICODE *p;
1236 const char *errmsg = "";
1237 int inShift = 0;
1238 unsigned int bitsleft = 0;
1239 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001240 int surrogate = 0;
1241 PyObject *errorHandler = NULL;
1242 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001243
1244 unicode = _PyUnicode_New(size);
1245 if (!unicode)
1246 return NULL;
1247 if (size == 0)
1248 return (PyObject *)unicode;
1249
1250 p = unicode->str;
1251 e = s + size;
1252
1253 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254 Py_UNICODE ch;
1255 restart:
1256 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001257
1258 if (inShift) {
1259 if ((ch == '-') || !B64CHAR(ch)) {
1260 inShift = 0;
1261 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001262
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001263 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1264 if (bitsleft >= 6) {
1265 /* The shift sequence has a partial character in it. If
1266 bitsleft < 6 then we could just classify it as padding
1267 but that is not the case here */
1268
1269 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001270 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001271 }
1272 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001273 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001274 here so indicate the potential of a misencoded character. */
1275
1276 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1277 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1278 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001279 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001280 }
1281
1282 if (ch == '-') {
1283 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001284 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001285 inShift = 1;
1286 }
1287 } else if (SPECIAL(ch,0,0)) {
1288 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001289 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001290 } else {
1291 *p++ = ch;
1292 }
1293 } else {
1294 charsleft = (charsleft << 6) | UB64(ch);
1295 bitsleft += 6;
1296 s++;
1297 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1298 }
1299 }
1300 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001302 s++;
1303 if (s < e && *s == '-') {
1304 s++;
1305 *p++ = '+';
1306 } else
1307 {
1308 inShift = 1;
1309 bitsleft = 0;
1310 }
1311 }
1312 else if (SPECIAL(ch,0,0)) {
1313 errmsg = "unexpected special character";
1314 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001315 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001316 }
1317 else {
1318 *p++ = ch;
1319 s++;
1320 }
1321 continue;
1322 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001323 outpos = p-PyUnicode_AS_UNICODE(unicode);
1324 endinpos = s-starts;
1325 if (unicode_decode_call_errorhandler(
1326 errors, &errorHandler,
1327 "utf7", errmsg,
1328 starts, size, &startinpos, &endinpos, &exc, &s,
1329 (PyObject **)&unicode, &outpos, &p))
1330 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001331 }
1332
1333 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 outpos = p-PyUnicode_AS_UNICODE(unicode);
1335 endinpos = size;
1336 if (unicode_decode_call_errorhandler(
1337 errors, &errorHandler,
1338 "utf7", "unterminated shift sequence",
1339 starts, size, &startinpos, &endinpos, &exc, &s,
1340 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001341 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001342 if (s < e)
1343 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001344 }
1345
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001346 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001347 goto onError;
1348
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 Py_XDECREF(errorHandler);
1350 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001351 return (PyObject *)unicode;
1352
1353onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 Py_XDECREF(errorHandler);
1355 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001356 Py_DECREF(unicode);
1357 return NULL;
1358}
1359
1360
1361PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001363 int encodeSetO,
1364 int encodeWhiteSpace,
1365 const char *errors)
1366{
1367 PyObject *v;
1368 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001369 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001370 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001371 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001372 unsigned int bitsleft = 0;
1373 unsigned long charsleft = 0;
1374 char * out;
1375 char * start;
1376
1377 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001378 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001379
Walter Dörwald51ab4142007-05-05 14:43:36 +00001380 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001381 if (v == NULL)
1382 return NULL;
1383
Walter Dörwald51ab4142007-05-05 14:43:36 +00001384 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001385 for (;i < size; ++i) {
1386 Py_UNICODE ch = s[i];
1387
1388 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001389 if (ch == '+') {
1390 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001391 *out++ = '-';
1392 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1393 charsleft = ch;
1394 bitsleft = 16;
1395 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001396 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001397 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001398 } else {
1399 *out++ = (char) ch;
1400 }
1401 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001402 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1403 *out++ = B64(charsleft << (6-bitsleft));
1404 charsleft = 0;
1405 bitsleft = 0;
1406 /* Characters not in the BASE64 set implicitly unshift the sequence
1407 so no '-' is required, except if the character is itself a '-' */
1408 if (B64CHAR(ch) || ch == '-') {
1409 *out++ = '-';
1410 }
1411 inShift = 0;
1412 *out++ = (char) ch;
1413 } else {
1414 bitsleft += 16;
1415 charsleft = (charsleft << 16) | ch;
1416 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1417
1418 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001419 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 or '-' then the shift sequence will be terminated implicitly and we
1421 don't have to insert a '-'. */
1422
1423 if (bitsleft == 0) {
1424 if (i + 1 < size) {
1425 Py_UNICODE ch2 = s[i+1];
1426
1427 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001428
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001429 } else if (B64CHAR(ch2) || ch2 == '-') {
1430 *out++ = '-';
1431 inShift = 0;
1432 } else {
1433 inShift = 0;
1434 }
1435
1436 }
1437 else {
1438 *out++ = '-';
1439 inShift = 0;
1440 }
1441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001443 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001444 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001445 if (bitsleft) {
1446 *out++= B64(charsleft << (6-bitsleft) );
1447 *out++ = '-';
1448 }
1449
Walter Dörwald51ab4142007-05-05 14:43:36 +00001450 if (PyBytes_Resize(v, out - start)) {
1451 Py_DECREF(v);
1452 return NULL;
1453 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001454 return v;
1455}
1456
1457#undef SPECIAL
1458#undef B64
1459#undef B64CHAR
1460#undef UB64
1461#undef ENCODE
1462#undef DECODE
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464/* --- UTF-8 Codec -------------------------------------------------------- */
1465
/* Lookup table indexed by the first byte of a UTF-8 sequence, one entry
   per byte value 0..255, laid out 16 entries per row. */
Tim Petersced69f82003-09-16 20:30:58 +00001466static
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467char utf8_code_length[256] = {
1468 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1469 illegal prefix. see RFC 2279 for details */
1470 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0F */
1471 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
1472 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2F */
1473 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3F */
1474 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4F */
1475 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5F */
1476 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6F */
1477 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7F */
1478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
1479 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
1480 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
1481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */
1482 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xCF */
1483 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xDF */
1484 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0 - 0xEF */
1485 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 /* 0xF0 - 0xFF */
1486};
1487
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001489 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 const char *errors)
1491{
Walter Dörwald69652032004-09-07 20:24:22 +00001492 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1493}
1494
1495PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001496 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001497 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001498 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001502 Py_ssize_t startinpos;
1503 Py_ssize_t endinpos;
1504 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 const char *e;
1506 PyUnicodeObject *unicode;
1507 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001508 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 PyObject *errorHandler = NULL;
1510 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511
1512 /* Note: size will always be longer than the resulting Unicode
1513 character count */
1514 unicode = _PyUnicode_New(size);
1515 if (!unicode)
1516 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001517 if (size == 0) {
1518 if (consumed)
1519 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522
1523 /* Unpack UTF-8 encoded data */
1524 p = unicode->str;
1525 e = s + size;
1526
1527 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529
1530 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001531 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 s++;
1533 continue;
1534 }
1535
1536 n = utf8_code_length[ch];
1537
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001539 if (consumed)
1540 break;
1541 else {
1542 errmsg = "unexpected end of data";
1543 startinpos = s-starts;
1544 endinpos = size;
1545 goto utf8Error;
1546 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548
1549 switch (n) {
1550
1551 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001552 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 startinpos = s-starts;
1554 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001555 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556
1557 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001558 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 startinpos = s-starts;
1560 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001561 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562
1563 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001564 if ((s[1] & 0xc0) != 0x80) {
1565 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001566 startinpos = s-starts;
1567 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001568 goto utf8Error;
1569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001571 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 startinpos = s-starts;
1573 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001574 errmsg = "illegal encoding";
1575 goto utf8Error;
1576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001578 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 break;
1580
1581 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001582 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001583 (s[2] & 0xc0) != 0x80) {
1584 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001585 startinpos = s-starts;
1586 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001587 goto utf8Error;
1588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001590 if (ch < 0x0800) {
1591 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001592 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001593
1594 XXX For wide builds (UCS-4) we should probably try
1595 to recombine the surrogates into a single code
1596 unit.
1597 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001599 startinpos = s-starts;
1600 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001601 goto utf8Error;
1602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001604 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001605 break;
1606
1607 case 4:
1608 if ((s[1] & 0xc0) != 0x80 ||
1609 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001610 (s[3] & 0xc0) != 0x80) {
1611 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 startinpos = s-starts;
1613 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001614 goto utf8Error;
1615 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001616 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1617 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1618 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001619 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001620 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001621 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001622 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001623 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001624 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001625 startinpos = s-starts;
1626 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 goto utf8Error;
1628 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001629#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001630 *p++ = (Py_UNICODE)ch;
1631#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001632 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001633
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001634 /* translate from 10000..10FFFF to 0..FFFF */
1635 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001636
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001637 /* high surrogate = top 10 bits added to D800 */
1638 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001639
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001640 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001641 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 break;
1644
1645 default:
1646 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001647 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 startinpos = s-starts;
1649 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001650 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 }
1652 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001654
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001655 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 outpos = p-PyUnicode_AS_UNICODE(unicode);
1657 if (unicode_decode_call_errorhandler(
1658 errors, &errorHandler,
1659 "utf8", errmsg,
1660 starts, size, &startinpos, &endinpos, &exc, &s,
1661 (PyObject **)&unicode, &outpos, &p))
1662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663 }
Walter Dörwald69652032004-09-07 20:24:22 +00001664 if (consumed)
1665 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666
1667 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001668 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 goto onError;
1670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 Py_XDECREF(errorHandler);
1672 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 return (PyObject *)unicode;
1674
1675onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 Py_XDECREF(errorHandler);
1677 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 Py_DECREF(unicode);
1679 return NULL;
1680}
1681
Tim Peters602f7402002-04-27 18:03:26 +00001682/* Allocation strategy: if the string is short, convert into a stack buffer
1683 and allocate exactly as much space needed at the end. Else allocate the
1684 maximum possible needed (4 result bytes per Unicode character), and return
1685 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001686*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001687PyObject *
1688PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691{
Tim Peters602f7402002-04-27 18:03:26 +00001692#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001693
Martin v. Löwis18e16552006-02-15 17:27:45 +00001694 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001695 PyObject *v; /* result string object */
1696 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001697 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001698 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001699 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001700
Tim Peters602f7402002-04-27 18:03:26 +00001701 assert(s != NULL);
1702 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703
Tim Peters602f7402002-04-27 18:03:26 +00001704 if (size <= MAX_SHORT_UNICHARS) {
1705 /* Write into the stack buffer; nallocated can't overflow.
1706 * At the end, we'll allocate exactly as much heap space as it
1707 * turns out we need.
1708 */
1709 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1710 v = NULL; /* will allocate after we're done */
1711 p = stackbuf;
1712 }
1713 else {
1714 /* Overallocate on the heap, and give the excess back at the end. */
1715 nallocated = size * 4;
1716 if (nallocated / 4 != size) /* overflow! */
1717 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001718 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001719 if (v == NULL)
1720 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001721 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001722 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001723
Tim Peters602f7402002-04-27 18:03:26 +00001724 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001725 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001726
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001727 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001728 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001730
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001732 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001733 *p++ = (char)(0xc0 | (ch >> 6));
1734 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001735 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001736 else {
Tim Peters602f7402002-04-27 18:03:26 +00001737 /* Encode UCS2 Unicode ordinals */
1738 if (ch < 0x10000) {
1739 /* Special case: check for high surrogate */
1740 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1741 Py_UCS4 ch2 = s[i];
1742 /* Check for low surrogate and combine the two to
1743 form a UCS4 value */
1744 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001745 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001746 i++;
1747 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001748 }
Tim Peters602f7402002-04-27 18:03:26 +00001749 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001750 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001751 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001752 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1753 *p++ = (char)(0x80 | (ch & 0x3f));
1754 continue;
1755 }
1756encodeUCS4:
1757 /* Encode UCS4 Unicode ordinals */
1758 *p++ = (char)(0xf0 | (ch >> 18));
1759 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1760 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1761 *p++ = (char)(0x80 | (ch & 0x3f));
1762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001764
Tim Peters602f7402002-04-27 18:03:26 +00001765 if (v == NULL) {
1766 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001767 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001768 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001769 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001770 }
1771 else {
1772 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001773 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001774 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001775 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001778
Tim Peters602f7402002-04-27 18:03:26 +00001779#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780}
1781
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1783{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if (!PyUnicode_Check(unicode)) {
1785 PyErr_BadArgument();
1786 return NULL;
1787 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001788 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1789 PyUnicode_GET_SIZE(unicode),
1790 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791}
1792
1793/* --- UTF-16 Codec ------------------------------------------------------- */
1794
Tim Peters772747b2001-08-09 22:21:55 +00001795PyObject *
1796PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001797 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001798 const char *errors,
1799 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800{
Walter Dörwald69652032004-09-07 20:24:22 +00001801 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1802}
1803
1804PyObject *
1805PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001806 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001807 const char *errors,
1808 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001809 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001812 Py_ssize_t startinpos;
1813 Py_ssize_t endinpos;
1814 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 PyUnicodeObject *unicode;
1816 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001817 const unsigned char *q, *e;
1818 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001819 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001820 /* Offsets from q for retrieving byte pairs in the right order. */
1821#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1822 int ihi = 1, ilo = 0;
1823#else
1824 int ihi = 0, ilo = 1;
1825#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 PyObject *errorHandler = NULL;
1827 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828
1829 /* Note: size will always be longer than the resulting Unicode
1830 character count */
1831 unicode = _PyUnicode_New(size);
1832 if (!unicode)
1833 return NULL;
1834 if (size == 0)
1835 return (PyObject *)unicode;
1836
1837 /* Unpack UTF-16 encoded data */
1838 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001839 q = (unsigned char *)s;
1840 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841
1842 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001843 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001845 /* Check for BOM marks (U+FEFF) in the input and adjust current
1846 byte order setting accordingly. In native mode, the leading BOM
1847 mark is skipped, in all other modes, it is copied to the output
1848 stream as-is (giving a ZWNBSP character). */
1849 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001850 if (size >= 2) {
1851 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001852#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001853 if (bom == 0xFEFF) {
1854 q += 2;
1855 bo = -1;
1856 }
1857 else if (bom == 0xFFFE) {
1858 q += 2;
1859 bo = 1;
1860 }
Tim Petersced69f82003-09-16 20:30:58 +00001861#else
Walter Dörwald69652032004-09-07 20:24:22 +00001862 if (bom == 0xFEFF) {
1863 q += 2;
1864 bo = 1;
1865 }
1866 else if (bom == 0xFFFE) {
1867 q += 2;
1868 bo = -1;
1869 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001870#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001871 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
Tim Peters772747b2001-08-09 22:21:55 +00001874 if (bo == -1) {
1875 /* force LE */
1876 ihi = 1;
1877 ilo = 0;
1878 }
1879 else if (bo == 1) {
1880 /* force BE */
1881 ihi = 0;
1882 ilo = 1;
1883 }
1884
1885 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001887 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001889 if (consumed)
1890 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 errmsg = "truncated data";
1892 startinpos = ((const char *)q)-starts;
1893 endinpos = ((const char *)e)-starts;
1894 goto utf16Error;
1895 /* The remaining input chars are ignored if the callback
1896 chooses to skip the input */
1897 }
1898 ch = (q[ihi] << 8) | q[ilo];
1899
Tim Peters772747b2001-08-09 22:21:55 +00001900 q += 2;
1901
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 if (ch < 0xD800 || ch > 0xDFFF) {
1903 *p++ = ch;
1904 continue;
1905 }
1906
1907 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001908 if (q >= e) {
1909 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001910 startinpos = (((const char *)q)-2)-starts;
1911 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001912 goto utf16Error;
1913 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001914 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001915 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1916 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001917 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001918#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001919 *p++ = ch;
1920 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001921#else
1922 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001923#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001924 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925 }
1926 else {
1927 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001928 startinpos = (((const char *)q)-4)-starts;
1929 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001930 goto utf16Error;
1931 }
1932
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001934 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 startinpos = (((const char *)q)-2)-starts;
1936 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 /* Fall through to report the error */
1938
1939 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 outpos = p-PyUnicode_AS_UNICODE(unicode);
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "utf16", errmsg,
1944 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1945 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 }
1948
1949 if (byteorder)
1950 *byteorder = bo;
1951
Walter Dörwald69652032004-09-07 20:24:22 +00001952 if (consumed)
1953 *consumed = (const char *)q-starts;
1954
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001956 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 goto onError;
1958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 Py_XDECREF(errorHandler);
1960 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 return (PyObject *)unicode;
1962
1963onError:
1964 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 Py_XDECREF(errorHandler);
1966 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 return NULL;
1968}
1969
Tim Peters772747b2001-08-09 22:21:55 +00001970PyObject *
1971PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001972 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001973 const char *errors,
1974 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975{
1976 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001977 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001978#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001979 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001980#else
1981 const int pairs = 0;
1982#endif
Tim Peters772747b2001-08-09 22:21:55 +00001983 /* Offsets from p for storing byte pairs in the right order. */
1984#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1985 int ihi = 1, ilo = 0;
1986#else
1987 int ihi = 0, ilo = 1;
1988#endif
1989
1990#define STORECHAR(CH) \
1991 do { \
1992 p[ihi] = ((CH) >> 8) & 0xff; \
1993 p[ilo] = (CH) & 0xff; \
1994 p += 2; \
1995 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001997#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001998 for (i = pairs = 0; i < size; i++)
1999 if (s[i] >= 0x10000)
2000 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002001#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002002 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002003 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 if (v == NULL)
2005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006
Walter Dörwald3cc34522007-05-04 10:48:27 +00002007 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002009 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002010 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002011 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002012
2013 if (byteorder == -1) {
2014 /* force LE */
2015 ihi = 1;
2016 ilo = 0;
2017 }
2018 else if (byteorder == 1) {
2019 /* force BE */
2020 ihi = 0;
2021 ilo = 1;
2022 }
2023
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002024 while (size-- > 0) {
2025 Py_UNICODE ch = *s++;
2026 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002027#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002028 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002029 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2030 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002032#endif
Tim Peters772747b2001-08-09 22:21:55 +00002033 STORECHAR(ch);
2034 if (ch2)
2035 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002038#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039}
2040
2041PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2042{
2043 if (!PyUnicode_Check(unicode)) {
2044 PyErr_BadArgument();
2045 return NULL;
2046 }
2047 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2048 PyUnicode_GET_SIZE(unicode),
2049 NULL,
2050 0);
2051}
2052
2053/* --- Unicode Escape Codec ----------------------------------------------- */
2054
Fredrik Lundh06d12682001-01-24 07:59:11 +00002055static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002056
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002058 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 const char *errors)
2060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002062 Py_ssize_t startinpos;
2063 Py_ssize_t endinpos;
2064 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002065 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002069 char* message;
2070 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 PyObject *errorHandler = NULL;
2072 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002073
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 /* Escaped strings will always be longer than the resulting
2075 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 length after conversion to the true value.
2077 (but if the error callback returns a long replacement string
2078 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 v = _PyUnicode_New(size);
2080 if (v == NULL)
2081 goto onError;
2082 if (size == 0)
2083 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002087
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088 while (s < end) {
2089 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002090 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092
2093 /* Non-escape characters are interpreted as Unicode ordinals */
2094 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002095 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 continue;
2097 }
2098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 /* \ - Escapes */
2101 s++;
2102 switch (*s++) {
2103
2104 /* \x escapes */
2105 case '\n': break;
2106 case '\\': *p++ = '\\'; break;
2107 case '\'': *p++ = '\''; break;
2108 case '\"': *p++ = '\"'; break;
2109 case 'b': *p++ = '\b'; break;
2110 case 'f': *p++ = '\014'; break; /* FF */
2111 case 't': *p++ = '\t'; break;
2112 case 'n': *p++ = '\n'; break;
2113 case 'r': *p++ = '\r'; break;
2114 case 'v': *p++ = '\013'; break; /* VT */
2115 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2116
2117 /* \OOO (octal) escapes */
2118 case '0': case '1': case '2': case '3':
2119 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002120 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002122 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002124 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002126 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 break;
2128
Fredrik Lundhccc74732001-02-18 22:13:49 +00002129 /* hex escapes */
2130 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002132 digits = 2;
2133 message = "truncated \\xXX escape";
2134 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135
Fredrik Lundhccc74732001-02-18 22:13:49 +00002136 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002138 digits = 4;
2139 message = "truncated \\uXXXX escape";
2140 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141
Fredrik Lundhccc74732001-02-18 22:13:49 +00002142 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002143 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002144 digits = 8;
2145 message = "truncated \\UXXXXXXXX escape";
2146 hexescape:
2147 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002148 outpos = p-PyUnicode_AS_UNICODE(v);
2149 if (s+digits>end) {
2150 endinpos = size;
2151 if (unicode_decode_call_errorhandler(
2152 errors, &errorHandler,
2153 "unicodeescape", "end of string in escape sequence",
2154 starts, size, &startinpos, &endinpos, &exc, &s,
2155 (PyObject **)&v, &outpos, &p))
2156 goto onError;
2157 goto nextByte;
2158 }
2159 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002160 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002161 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 endinpos = (s+i+1)-starts;
2163 if (unicode_decode_call_errorhandler(
2164 errors, &errorHandler,
2165 "unicodeescape", message,
2166 starts, size, &startinpos, &endinpos, &exc, &s,
2167 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002168 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002170 }
2171 chr = (chr<<4) & ~0xF;
2172 if (c >= '0' && c <= '9')
2173 chr += c - '0';
2174 else if (c >= 'a' && c <= 'f')
2175 chr += 10 + c - 'a';
2176 else
2177 chr += 10 + c - 'A';
2178 }
2179 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002180 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 /* _decoding_error will have already written into the
2182 target buffer. */
2183 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002184 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002185 /* when we get here, chr is a 32-bit unicode character */
2186 if (chr <= 0xffff)
2187 /* UCS-2 character */
2188 *p++ = (Py_UNICODE) chr;
2189 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002190 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002191 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002193 *p++ = chr;
2194#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002195 chr -= 0x10000L;
2196 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002197 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002198#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002199 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 endinpos = s-starts;
2201 outpos = p-PyUnicode_AS_UNICODE(v);
2202 if (unicode_decode_call_errorhandler(
2203 errors, &errorHandler,
2204 "unicodeescape", "illegal Unicode character",
2205 starts, size, &startinpos, &endinpos, &exc, &s,
2206 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002207 goto onError;
2208 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002209 break;
2210
2211 /* \N{name} */
2212 case 'N':
2213 message = "malformed \\N character escape";
2214 if (ucnhash_CAPI == NULL) {
2215 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002216 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002217 m = PyImport_ImportModule("unicodedata");
2218 if (m == NULL)
2219 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002220 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002221 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002222 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002223 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002224 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002225 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002226 if (ucnhash_CAPI == NULL)
2227 goto ucnhashError;
2228 }
2229 if (*s == '{') {
2230 const char *start = s+1;
2231 /* look for the closing brace */
2232 while (*s != '}' && s < end)
2233 s++;
2234 if (s > start && s < end && *s == '}') {
2235 /* found a name. look it up in the unicode database */
2236 message = "unknown Unicode character name";
2237 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002238 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002239 goto store;
2240 }
2241 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 endinpos = s-starts;
2243 outpos = p-PyUnicode_AS_UNICODE(v);
2244 if (unicode_decode_call_errorhandler(
2245 errors, &errorHandler,
2246 "unicodeescape", message,
2247 starts, size, &startinpos, &endinpos, &exc, &s,
2248 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002249 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002250 break;
2251
2252 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002253 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 message = "\\ at end of string";
2255 s--;
2256 endinpos = s-starts;
2257 outpos = p-PyUnicode_AS_UNICODE(v);
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "unicodeescape", message,
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002263 goto onError;
2264 }
2265 else {
2266 *p++ = '\\';
2267 *p++ = (unsigned char)s[-1];
2268 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002269 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 nextByte:
2272 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002274 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002276 Py_XDECREF(errorHandler);
2277 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002279
Fredrik Lundhccc74732001-02-18 22:13:49 +00002280ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002281 PyErr_SetString(
2282 PyExc_UnicodeError,
2283 "\\N escapes not supported (can't load unicodedata module)"
2284 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002285 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 Py_XDECREF(errorHandler);
2287 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002288 return NULL;
2289
Fredrik Lundhccc74732001-02-18 22:13:49 +00002290onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 Py_XDECREF(errorHandler);
2293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 return NULL;
2295}
2296
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the encoder defined below takes only a buffer and a length; it
   has no "quotes" argument and never wraps its output in u"" or u''
   quotes.

*/
2303
Thomas Wouters477c8d52006-05-27 19:21:47 +00002304Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2305 Py_ssize_t size,
2306 Py_UNICODE ch)
2307{
2308 /* like wcschr, but doesn't stop at NULL characters */
2309
2310 while (size-- > 0) {
2311 if (*s == ch)
2312 return s;
2313 s++;
2314 }
2315
2316 return NULL;
2317}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002318
Walter Dörwald79e913e2007-05-12 11:08:06 +00002319static const char *hexdigits = "0123456789abcdef";
2320
2321PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2322 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323{
2324 PyObject *repr;
2325 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326
Thomas Wouters89f507f2006-12-13 04:49:30 +00002327 /* XXX(nnorwitz): rather than over-allocating, it would be
2328 better to choose a different scheme. Perhaps scan the
2329 first N-chars of the string and allocate based on that size.
2330 */
2331 /* Initial allocation is based on the longest-possible unichr
2332 escape.
2333
2334 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2335 unichr, so in this case it's the longest unichr escape. In
2336 narrow (UTF-16) builds this is five chars per source unichr
2337 since there are two unichrs in the surrogate pair, so in narrow
2338 (UTF-16) builds it's not the longest unichr escape.
2339
2340 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2341 so in the narrow (UTF-16) build case it's the longest unichr
2342 escape.
2343 */
2344
Walter Dörwald79e913e2007-05-12 11:08:06 +00002345 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002346#ifdef Py_UNICODE_WIDE
2347 + 10*size
2348#else
2349 + 6*size
2350#endif
2351 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 if (repr == NULL)
2353 return NULL;
2354
Walter Dörwald79e913e2007-05-12 11:08:06 +00002355 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 while (size-- > 0) {
2358 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002359
Walter Dörwald79e913e2007-05-12 11:08:06 +00002360 /* Escape backslashes */
2361 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 *p++ = '\\';
2363 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002364 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002365 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002366
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002367#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002368 /* Map 21-bit characters to '\U00xxxxxx' */
2369 else if (ch >= 0x10000) {
2370 *p++ = '\\';
2371 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002372 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2373 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2374 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2375 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2376 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2377 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2378 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2379 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002380 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002381 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002382#else
2383 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002384 else if (ch >= 0xD800 && ch < 0xDC00) {
2385 Py_UNICODE ch2;
2386 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002388 ch2 = *s++;
2389 size--;
2390 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2391 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2392 *p++ = '\\';
2393 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002394 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2395 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2396 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2397 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2398 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2399 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2400 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2401 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002402 continue;
2403 }
2404 /* Fall through: isolated surrogates are copied as-is */
2405 s--;
2406 size++;
2407 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002408#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002409
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002411 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 *p++ = '\\';
2413 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002414 *p++ = hexdigits[(ch >> 12) & 0x000F];
2415 *p++ = hexdigits[(ch >> 8) & 0x000F];
2416 *p++ = hexdigits[(ch >> 4) & 0x000F];
2417 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002419
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002420 /* Map special whitespace to '\t', \n', '\r' */
2421 else if (ch == '\t') {
2422 *p++ = '\\';
2423 *p++ = 't';
2424 }
2425 else if (ch == '\n') {
2426 *p++ = '\\';
2427 *p++ = 'n';
2428 }
2429 else if (ch == '\r') {
2430 *p++ = '\\';
2431 *p++ = 'r';
2432 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002433
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002434 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002435 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002437 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002438 *p++ = hexdigits[(ch >> 4) & 0x000F];
2439 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002440 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002441
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 /* Copy everything else as-is */
2443 else
2444 *p++ = (char) ch;
2445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446
2447 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002448 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2449 Py_DECREF(repr);
2450 return NULL;
2451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 return repr;
2453}
2454
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2456{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002457 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 if (!PyUnicode_Check(unicode)) {
2459 PyErr_BadArgument();
2460 return NULL;
2461 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002462 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2463 PyUnicode_GET_SIZE(unicode));
2464
2465 if (!s)
2466 return NULL;
2467 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2468 PyBytes_GET_SIZE(s));
2469 Py_DECREF(s);
2470 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471}
2472
2473/* --- Raw Unicode Escape Codec ------------------------------------------- */
2474
2475PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002476 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 const char *errors)
2478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002480 Py_ssize_t startinpos;
2481 Py_ssize_t endinpos;
2482 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 const char *end;
2486 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 PyObject *errorHandler = NULL;
2488 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002489
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 /* Escaped strings will always be longer than the resulting
2491 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 length after conversion to the true value. (But decoding error
2493 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 v = _PyUnicode_New(size);
2495 if (v == NULL)
2496 goto onError;
2497 if (size == 0)
2498 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 end = s + size;
2501 while (s < end) {
2502 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002503 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002505 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 /* Non-escape characters are interpreted as Unicode ordinals */
2508 if (*s != '\\') {
2509 *p++ = (unsigned char)*s++;
2510 continue;
2511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002512 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
2514 /* \u-escapes are only interpreted iff the number of leading
2515 backslashes if odd */
2516 bs = s;
2517 for (;s < end;) {
2518 if (*s != '\\')
2519 break;
2520 *p++ = (unsigned char)*s++;
2521 }
2522 if (((s - bs) & 1) == 0 ||
2523 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002524 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 continue;
2526 }
2527 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002528 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 s++;
2530
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002531 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002533 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 endinpos = s-starts;
2537 if (unicode_decode_call_errorhandler(
2538 errors, &errorHandler,
2539 "rawunicodeescape", "truncated \\uXXXX",
2540 starts, size, &startinpos, &endinpos, &exc, &s,
2541 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 }
2545 x = (x<<4) & ~0xF;
2546 if (c >= '0' && c <= '9')
2547 x += c - '0';
2548 else if (c >= 'a' && c <= 'f')
2549 x += 10 + c - 'a';
2550 else
2551 x += 10 + c - 'A';
2552 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002553#ifndef Py_UNICODE_WIDE
2554 if (x > 0x10000) {
2555 if (unicode_decode_call_errorhandler(
2556 errors, &errorHandler,
2557 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2558 starts, size, &startinpos, &endinpos, &exc, &s,
2559 (PyObject **)&v, &outpos, &p))
2560 goto onError;
2561 }
2562#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 *p++ = x;
2564 nextByte:
2565 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002567 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002568 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002569 Py_XDECREF(errorHandler);
2570 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002572
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 onError:
2574 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002575 Py_XDECREF(errorHandler);
2576 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577 return NULL;
2578}
2579
2580PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582{
2583 PyObject *repr;
2584 char *p;
2585 char *q;
2586
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002587#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002588 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002589#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002590 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002591#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 if (repr == NULL)
2593 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002594 if (size == 0)
2595 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596
Walter Dörwald711005d2007-05-12 12:03:26 +00002597 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 while (size-- > 0) {
2599 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002600#ifdef Py_UNICODE_WIDE
2601 /* Map 32-bit characters to '\Uxxxxxxxx' */
2602 if (ch >= 0x10000) {
2603 *p++ = '\\';
2604 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002605 *p++ = hexdigits[(ch >> 28) & 0xf];
2606 *p++ = hexdigits[(ch >> 24) & 0xf];
2607 *p++ = hexdigits[(ch >> 20) & 0xf];
2608 *p++ = hexdigits[(ch >> 16) & 0xf];
2609 *p++ = hexdigits[(ch >> 12) & 0xf];
2610 *p++ = hexdigits[(ch >> 8) & 0xf];
2611 *p++ = hexdigits[(ch >> 4) & 0xf];
2612 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002613 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002614 else
2615#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 /* Map 16-bit characters to '\uxxxx' */
2617 if (ch >= 256) {
2618 *p++ = '\\';
2619 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002620 *p++ = hexdigits[(ch >> 12) & 0xf];
2621 *p++ = hexdigits[(ch >> 8) & 0xf];
2622 *p++ = hexdigits[(ch >> 4) & 0xf];
2623 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 }
2625 /* Copy everything else as-is */
2626 else
2627 *p++ = (char) ch;
2628 }
2629 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002630 if (PyBytes_Resize(repr, p - q)) {
2631 Py_DECREF(repr);
2632 return NULL;
2633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 return repr;
2635}
2636
2637PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2638{
Walter Dörwald711005d2007-05-12 12:03:26 +00002639 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002641 PyErr_BadArgument();
2642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002644 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2645 PyUnicode_GET_SIZE(unicode));
2646
2647 if (!s)
2648 return NULL;
2649 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2650 PyBytes_GET_SIZE(s));
2651 Py_DECREF(s);
2652 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653}
2654
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002655/* --- Unicode Internal Codec ------------------------------------------- */
2656
2657PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002658 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002659 const char *errors)
2660{
2661 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002662 Py_ssize_t startinpos;
2663 Py_ssize_t endinpos;
2664 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002665 PyUnicodeObject *v;
2666 Py_UNICODE *p;
2667 const char *end;
2668 const char *reason;
2669 PyObject *errorHandler = NULL;
2670 PyObject *exc = NULL;
2671
Neal Norwitzd43069c2006-01-08 01:12:10 +00002672#ifdef Py_UNICODE_WIDE
2673 Py_UNICODE unimax = PyUnicode_GetMax();
2674#endif
2675
Thomas Wouters89f507f2006-12-13 04:49:30 +00002676 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002677 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2678 if (v == NULL)
2679 goto onError;
2680 if (PyUnicode_GetSize((PyObject *)v) == 0)
2681 return (PyObject *)v;
2682 p = PyUnicode_AS_UNICODE(v);
2683 end = s + size;
2684
2685 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002686 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002687 /* We have to sanity check the raw data, otherwise doom looms for
2688 some malformed UCS-4 data. */
2689 if (
2690 #ifdef Py_UNICODE_WIDE
2691 *p > unimax || *p < 0 ||
2692 #endif
2693 end-s < Py_UNICODE_SIZE
2694 )
2695 {
2696 startinpos = s - starts;
2697 if (end-s < Py_UNICODE_SIZE) {
2698 endinpos = end-starts;
2699 reason = "truncated input";
2700 }
2701 else {
2702 endinpos = s - starts + Py_UNICODE_SIZE;
2703 reason = "illegal code point (> 0x10FFFF)";
2704 }
2705 outpos = p - PyUnicode_AS_UNICODE(v);
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "unicode_internal", reason,
2709 starts, size, &startinpos, &endinpos, &exc, &s,
2710 (PyObject **)&v, &outpos, &p)) {
2711 goto onError;
2712 }
2713 }
2714 else {
2715 p++;
2716 s += Py_UNICODE_SIZE;
2717 }
2718 }
2719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002720 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002721 goto onError;
2722 Py_XDECREF(errorHandler);
2723 Py_XDECREF(exc);
2724 return (PyObject *)v;
2725
2726 onError:
2727 Py_XDECREF(v);
2728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
2730 return NULL;
2731}
2732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733/* --- Latin-1 Codec ------------------------------------------------------ */
2734
2735PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002736 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 const char *errors)
2738{
2739 PyUnicodeObject *v;
2740 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002741
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002743 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002744 Py_UNICODE r = *(unsigned char*)s;
2745 return PyUnicode_FromUnicode(&r, 1);
2746 }
2747
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 v = _PyUnicode_New(size);
2749 if (v == NULL)
2750 goto onError;
2751 if (size == 0)
2752 return (PyObject *)v;
2753 p = PyUnicode_AS_UNICODE(v);
2754 while (size-- > 0)
2755 *p++ = (unsigned char)*s++;
2756 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002757
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 onError:
2759 Py_XDECREF(v);
2760 return NULL;
2761}
2762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763/* create or adjust a UnicodeEncodeError */
2764static void make_encode_exception(PyObject **exceptionObject,
2765 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 const Py_UNICODE *unicode, Py_ssize_t size,
2767 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 if (*exceptionObject == NULL) {
2771 *exceptionObject = PyUnicodeEncodeError_Create(
2772 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2776 goto onError;
2777 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2778 goto onError;
2779 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2780 goto onError;
2781 return;
2782 onError:
2783 Py_DECREF(*exceptionObject);
2784 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 }
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* raises a UnicodeEncodeError */
2789static void raise_encode_exception(PyObject **exceptionObject,
2790 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002791 const Py_UNICODE *unicode, Py_ssize_t size,
2792 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 const char *reason)
2794{
2795 make_encode_exception(exceptionObject,
2796 encoding, unicode, size, startpos, endpos, reason);
2797 if (*exceptionObject != NULL)
2798 PyCodec_StrictErrors(*exceptionObject);
2799}
2800
2801/* error handling callback helper:
2802 build arguments, call the callback and check the arguments,
2803 put the result into newpos and return the replacement string, which
2804 has to be freed by the caller */
2805static PyObject *unicode_encode_call_errorhandler(const char *errors,
2806 PyObject **errorHandler,
2807 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002808 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2809 Py_ssize_t startpos, Py_ssize_t endpos,
2810 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002812 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813
2814 PyObject *restuple;
2815 PyObject *resunicode;
2816
2817 if (*errorHandler == NULL) {
2818 *errorHandler = PyCodec_LookupError(errors);
2819 if (*errorHandler == NULL)
2820 return NULL;
2821 }
2822
2823 make_encode_exception(exceptionObject,
2824 encoding, unicode, size, startpos, endpos, reason);
2825 if (*exceptionObject == NULL)
2826 return NULL;
2827
2828 restuple = PyObject_CallFunctionObjArgs(
2829 *errorHandler, *exceptionObject, NULL);
2830 if (restuple == NULL)
2831 return NULL;
2832 if (!PyTuple_Check(restuple)) {
2833 PyErr_Format(PyExc_TypeError, &argparse[4]);
2834 Py_DECREF(restuple);
2835 return NULL;
2836 }
2837 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2838 &resunicode, newpos)) {
2839 Py_DECREF(restuple);
2840 return NULL;
2841 }
2842 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002843 *newpos = size+*newpos;
2844 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002846 Py_DECREF(restuple);
2847 return NULL;
2848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 Py_INCREF(resunicode);
2850 Py_DECREF(restuple);
2851 return resunicode;
2852}
2853
2854static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 const char *errors,
2857 int limit)
2858{
2859 /* output object */
2860 PyObject *res;
2861 /* pointers to the beginning and end+1 of input */
2862 const Py_UNICODE *startp = p;
2863 const Py_UNICODE *endp = p + size;
2864 /* pointer to the beginning of the unencodable characters */
2865 /* const Py_UNICODE *badp = NULL; */
2866 /* pointer into the output */
2867 char *str;
2868 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002869 Py_ssize_t respos = 0;
2870 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002871 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2872 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 PyObject *errorHandler = NULL;
2874 PyObject *exc = NULL;
2875 /* the following variable is used for caching string comparisons
2876 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2877 int known_errorHandler = -1;
2878
2879 /* allocate enough for a simple encoding without
2880 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002881 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 if (res == NULL)
2883 goto onError;
2884 if (size == 0)
2885 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002886 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 ressize = size;
2888
2889 while (p<endp) {
2890 Py_UNICODE c = *p;
2891
2892 /* can we encode this? */
2893 if (c<limit) {
2894 /* no overflow check, because we know that the space is enough */
2895 *str++ = (char)c;
2896 ++p;
2897 }
2898 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002899 Py_ssize_t unicodepos = p-startp;
2900 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002901 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002902 Py_ssize_t repsize;
2903 Py_ssize_t newpos;
2904 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002905 Py_UNICODE *uni2;
2906 /* startpos for collecting unencodable chars */
2907 const Py_UNICODE *collstart = p;
2908 const Py_UNICODE *collend = p;
2909 /* find all unecodable characters */
2910 while ((collend < endp) && ((*collend)>=limit))
2911 ++collend;
2912 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2913 if (known_errorHandler==-1) {
2914 if ((errors==NULL) || (!strcmp(errors, "strict")))
2915 known_errorHandler = 1;
2916 else if (!strcmp(errors, "replace"))
2917 known_errorHandler = 2;
2918 else if (!strcmp(errors, "ignore"))
2919 known_errorHandler = 3;
2920 else if (!strcmp(errors, "xmlcharrefreplace"))
2921 known_errorHandler = 4;
2922 else
2923 known_errorHandler = 0;
2924 }
2925 switch (known_errorHandler) {
2926 case 1: /* strict */
2927 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2928 goto onError;
2929 case 2: /* replace */
2930 while (collstart++<collend)
2931 *str++ = '?'; /* fall through */
2932 case 3: /* ignore */
2933 p = collend;
2934 break;
2935 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002936 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 /* determine replacement size (temporarily (mis)uses p) */
2938 for (p = collstart, repsize = 0; p < collend; ++p) {
2939 if (*p<10)
2940 repsize += 2+1+1;
2941 else if (*p<100)
2942 repsize += 2+2+1;
2943 else if (*p<1000)
2944 repsize += 2+3+1;
2945 else if (*p<10000)
2946 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002947#ifndef Py_UNICODE_WIDE
2948 else
2949 repsize += 2+5+1;
2950#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 else if (*p<100000)
2952 repsize += 2+5+1;
2953 else if (*p<1000000)
2954 repsize += 2+6+1;
2955 else
2956 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002957#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 }
2959 requiredsize = respos+repsize+(endp-collend);
2960 if (requiredsize > ressize) {
2961 if (requiredsize<2*ressize)
2962 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002963 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002965 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 ressize = requiredsize;
2967 }
2968 /* generate replacement (temporarily (mis)uses p) */
2969 for (p = collstart; p < collend; ++p) {
2970 str += sprintf(str, "&#%d;", (int)*p);
2971 }
2972 p = collend;
2973 break;
2974 default:
2975 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2976 encoding, reason, startp, size, &exc,
2977 collstart-startp, collend-startp, &newpos);
2978 if (repunicode == NULL)
2979 goto onError;
2980 /* need more space? (at least enough for what we
2981 have+the replacement+the rest of the string, so
2982 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002983 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 repsize = PyUnicode_GET_SIZE(repunicode);
2985 requiredsize = respos+repsize+(endp-collend);
2986 if (requiredsize > ressize) {
2987 if (requiredsize<2*ressize)
2988 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002989 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 Py_DECREF(repunicode);
2991 goto onError;
2992 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002993 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 ressize = requiredsize;
2995 }
2996 /* check if there is anything unencodable in the replacement
2997 and copy it to the output */
2998 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2999 c = *uni2;
3000 if (c >= limit) {
3001 raise_encode_exception(&exc, encoding, startp, size,
3002 unicodepos, unicodepos+1, reason);
3003 Py_DECREF(repunicode);
3004 goto onError;
3005 }
3006 *str = (char)c;
3007 }
3008 p = startp + newpos;
3009 Py_DECREF(repunicode);
3010 }
3011 }
3012 }
3013 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003014 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015 if (respos<ressize)
3016 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003017 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 Py_XDECREF(errorHandler);
3019 Py_XDECREF(exc);
3020 return res;
3021
3022 onError:
3023 Py_XDECREF(res);
3024 Py_XDECREF(errorHandler);
3025 Py_XDECREF(exc);
3026 return NULL;
3027}
3028
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003030 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 const char *errors)
3032{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034}
3035
3036PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3037{
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 return NULL;
3041 }
3042 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3043 PyUnicode_GET_SIZE(unicode),
3044 NULL);
3045}
3046
3047/* --- 7-bit ASCII Codec -------------------------------------------------- */
3048
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 const char *errors)
3052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 PyUnicodeObject *v;
3055 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003056 Py_ssize_t startinpos;
3057 Py_ssize_t endinpos;
3058 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *e;
3060 PyObject *errorHandler = NULL;
3061 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003062
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003064 if (size == 1 && *(unsigned char*)s < 128) {
3065 Py_UNICODE r = *(unsigned char*)s;
3066 return PyUnicode_FromUnicode(&r, 1);
3067 }
Tim Petersced69f82003-09-16 20:30:58 +00003068
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 v = _PyUnicode_New(size);
3070 if (v == NULL)
3071 goto onError;
3072 if (size == 0)
3073 return (PyObject *)v;
3074 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 e = s + size;
3076 while (s < e) {
3077 register unsigned char c = (unsigned char)*s;
3078 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 ++s;
3081 }
3082 else {
3083 startinpos = s-starts;
3084 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003085 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 if (unicode_decode_call_errorhandler(
3087 errors, &errorHandler,
3088 "ascii", "ordinal not in range(128)",
3089 starts, size, &startinpos, &endinpos, &exc, &s,
3090 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003094 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003095 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003096 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 Py_XDECREF(errorHandler);
3098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003100
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 onError:
3102 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 Py_XDECREF(errorHandler);
3104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 return NULL;
3106}
3107
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003109 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 const char *errors)
3111{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113}
3114
3115PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3116{
3117 if (!PyUnicode_Check(unicode)) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 NULL);
3124}
3125
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003126#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003127
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003128/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003129
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003130#if SIZEOF_INT < SIZEOF_SSIZE_T
3131#define NEED_RETRY
3132#endif
3133
3134/* XXX This code is limited to "true" double-byte encodings, as
3135 a) it assumes an incomplete character consists of a single byte, and
3136 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3137 encodings, see IsDBCSLeadByteEx documentation. */
3138
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must be flagged by IsDBCSLeadByte().  It is then accepted
   when CharPrev() cannot step back (prev == curr), when the previous
   character does not start with a lead byte, or when the previous
   character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2) ? 1 : 0;
}
3149
3150/*
3151 * Decode MBCS string into unicode object. If 'final' is set, converts
3152 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3153 */
3154static int decode_mbcs(PyUnicodeObject **v,
3155 const char *s, /* MBCS string */
3156 int size, /* sizeof MBCS string */
3157 int final)
3158{
3159 Py_UNICODE *p;
3160 Py_ssize_t n = 0;
3161 int usize = 0;
3162
3163 assert(size >= 0);
3164
3165 /* Skip trailing lead-byte unless 'final' is set */
3166 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3167 --size;
3168
3169 /* First get the size of the result */
3170 if (size > 0) {
3171 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3172 if (usize == 0) {
3173 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3174 return -1;
3175 }
3176 }
3177
3178 if (*v == NULL) {
3179 /* Create unicode object */
3180 *v = _PyUnicode_New(usize);
3181 if (*v == NULL)
3182 return -1;
3183 }
3184 else {
3185 /* Extend unicode object */
3186 n = PyUnicode_GET_SIZE(*v);
3187 if (_PyUnicode_Resize(v, n + usize) < 0)
3188 return -1;
3189 }
3190
3191 /* Do the conversion */
3192 if (size > 0) {
3193 p = PyUnicode_AS_UNICODE(*v) + n;
3194 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3195 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3196 return -1;
3197 }
3198 }
3199
3200 return size;
3201}
3202
3203PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3204 Py_ssize_t size,
3205 const char *errors,
3206 Py_ssize_t *consumed)
3207{
3208 PyUnicodeObject *v = NULL;
3209 int done;
3210
3211 if (consumed)
3212 *consumed = 0;
3213
3214#ifdef NEED_RETRY
3215 retry:
3216 if (size > INT_MAX)
3217 done = decode_mbcs(&v, s, INT_MAX, 0);
3218 else
3219#endif
3220 done = decode_mbcs(&v, s, (int)size, !consumed);
3221
3222 if (done < 0) {
3223 Py_XDECREF(v);
3224 return NULL;
3225 }
3226
3227 if (consumed)
3228 *consumed += done;
3229
3230#ifdef NEED_RETRY
3231 if (size > INT_MAX) {
3232 s += done;
3233 size -= done;
3234 goto retry;
3235 }
3236#endif
3237
3238 return (PyObject *)v;
3239}
3240
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003241PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003242 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003243 const char *errors)
3244{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003245 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3246}
3247
3248/*
3249 * Convert unicode into string object (MBCS).
3250 * Returns 0 if succeed, -1 otherwise.
3251 */
3252static int encode_mbcs(PyObject **repr,
3253 const Py_UNICODE *p, /* unicode */
3254 int size) /* size of unicode */
3255{
3256 int mbcssize = 0;
3257 Py_ssize_t n = 0;
3258
3259 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003260
3261 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003262 if (size > 0) {
3263 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3264 if (mbcssize == 0) {
3265 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3266 return -1;
3267 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003268 }
3269
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003270 if (*repr == NULL) {
3271 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003272 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003273 if (*repr == NULL)
3274 return -1;
3275 }
3276 else {
3277 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003278 n = PyBytes_Size(*repr);
3279 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003280 return -1;
3281 }
3282
3283 /* Do the conversion */
3284 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003285 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003286 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3287 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3288 return -1;
3289 }
3290 }
3291
3292 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003293}
3294
3295PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003296 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003297 const char *errors)
3298{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003299 PyObject *repr = NULL;
3300 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003302#ifdef NEED_RETRY
3303 retry:
3304 if (size > INT_MAX)
3305 ret = encode_mbcs(&repr, p, INT_MAX);
3306 else
3307#endif
3308 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003309
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003310 if (ret < 0) {
3311 Py_XDECREF(repr);
3312 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003313 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003314
3315#ifdef NEED_RETRY
3316 if (size > INT_MAX) {
3317 p += INT_MAX;
3318 size -= INT_MAX;
3319 goto retry;
3320 }
3321#endif
3322
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003323 return repr;
3324}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003325
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003326PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3327{
3328 if (!PyUnicode_Check(unicode)) {
3329 PyErr_BadArgument();
3330 return NULL;
3331 }
3332 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3333 PyUnicode_GET_SIZE(unicode),
3334 NULL);
3335}
3336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003337#undef NEED_RETRY
3338
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003339#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341/* --- Character Mapping Codec -------------------------------------------- */
3342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 PyObject *mapping,
3346 const char *errors)
3347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003349 Py_ssize_t startinpos;
3350 Py_ssize_t endinpos;
3351 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 PyUnicodeObject *v;
3354 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003355 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 PyObject *errorHandler = NULL;
3357 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003358 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003359 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003360
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 /* Default to Latin-1 */
3362 if (mapping == NULL)
3363 return PyUnicode_DecodeLatin1(s, size, errors);
3364
3365 v = _PyUnicode_New(size);
3366 if (v == NULL)
3367 goto onError;
3368 if (size == 0)
3369 return (PyObject *)v;
3370 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003372 if (PyUnicode_CheckExact(mapping)) {
3373 mapstring = PyUnicode_AS_UNICODE(mapping);
3374 maplen = PyUnicode_GET_SIZE(mapping);
3375 while (s < e) {
3376 unsigned char ch = *s;
3377 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003379 if (ch < maplen)
3380 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003382 if (x == 0xfffe) {
3383 /* undefined mapping */
3384 outpos = p-PyUnicode_AS_UNICODE(v);
3385 startinpos = s-starts;
3386 endinpos = startinpos+1;
3387 if (unicode_decode_call_errorhandler(
3388 errors, &errorHandler,
3389 "charmap", "character maps to <undefined>",
3390 starts, size, &startinpos, &endinpos, &exc, &s,
3391 (PyObject **)&v, &outpos, &p)) {
3392 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003393 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003394 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003395 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003396 *p++ = x;
3397 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003399 }
3400 else {
3401 while (s < e) {
3402 unsigned char ch = *s;
3403 PyObject *w, *x;
3404
3405 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3406 w = PyInt_FromLong((long)ch);
3407 if (w == NULL)
3408 goto onError;
3409 x = PyObject_GetItem(mapping, w);
3410 Py_DECREF(w);
3411 if (x == NULL) {
3412 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3413 /* No mapping found means: mapping is undefined. */
3414 PyErr_Clear();
3415 x = Py_None;
3416 Py_INCREF(x);
3417 } else
3418 goto onError;
3419 }
3420
3421 /* Apply mapping */
3422 if (PyInt_Check(x)) {
3423 long value = PyInt_AS_LONG(x);
3424 if (value < 0 || value > 65535) {
3425 PyErr_SetString(PyExc_TypeError,
3426 "character mapping must be in range(65536)");
3427 Py_DECREF(x);
3428 goto onError;
3429 }
3430 *p++ = (Py_UNICODE)value;
3431 }
3432 else if (x == Py_None) {
3433 /* undefined mapping */
3434 outpos = p-PyUnicode_AS_UNICODE(v);
3435 startinpos = s-starts;
3436 endinpos = startinpos+1;
3437 if (unicode_decode_call_errorhandler(
3438 errors, &errorHandler,
3439 "charmap", "character maps to <undefined>",
3440 starts, size, &startinpos, &endinpos, &exc, &s,
3441 (PyObject **)&v, &outpos, &p)) {
3442 Py_DECREF(x);
3443 goto onError;
3444 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003445 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003446 continue;
3447 }
3448 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003449 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003450
3451 if (targetsize == 1)
3452 /* 1-1 mapping */
3453 *p++ = *PyUnicode_AS_UNICODE(x);
3454
3455 else if (targetsize > 1) {
3456 /* 1-n mapping */
3457 if (targetsize > extrachars) {
3458 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003459 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3460 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003461 (targetsize << 2);
3462 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003463 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003464 if (_PyUnicode_Resize(&v,
3465 PyUnicode_GET_SIZE(v) + needed) < 0) {
3466 Py_DECREF(x);
3467 goto onError;
3468 }
3469 p = PyUnicode_AS_UNICODE(v) + oldpos;
3470 }
3471 Py_UNICODE_COPY(p,
3472 PyUnicode_AS_UNICODE(x),
3473 targetsize);
3474 p += targetsize;
3475 extrachars -= targetsize;
3476 }
3477 /* 1-0 mapping: skip the character */
3478 }
3479 else {
3480 /* wrong return value */
3481 PyErr_SetString(PyExc_TypeError,
3482 "character mapping must return integer, None or unicode");
3483 Py_DECREF(x);
3484 goto onError;
3485 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003487 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
3490 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003491 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 Py_XDECREF(errorHandler);
3494 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003496
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 Py_XDECREF(errorHandler);
3499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 Py_XDECREF(v);
3501 return NULL;
3502}
3503
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003504/* Charmap encoding: the lookup table */
3505
3506struct encoding_map{
3507 PyObject_HEAD
3508 unsigned char level1[32];
3509 int count2, count3;
3510 unsigned char level23[1];
3511};
3512
3513static PyObject*
3514encoding_map_size(PyObject *obj, PyObject* args)
3515{
3516 struct encoding_map *map = (struct encoding_map*)obj;
3517 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3518 128*map->count3);
3519}
3520
3521static PyMethodDef encoding_map_methods[] = {
3522 {"size", encoding_map_size, METH_NOARGS,
3523 PyDoc_STR("Return the size (in bytes) of this object") },
3524 { 0 }
3525};
3526
3527static void
3528encoding_map_dealloc(PyObject* o)
3529{
3530 PyObject_FREE(o);
3531}
3532
3533static PyTypeObject EncodingMapType = {
3534 PyObject_HEAD_INIT(NULL)
3535 0, /*ob_size*/
3536 "EncodingMap", /*tp_name*/
3537 sizeof(struct encoding_map), /*tp_basicsize*/
3538 0, /*tp_itemsize*/
3539 /* methods */
3540 encoding_map_dealloc, /*tp_dealloc*/
3541 0, /*tp_print*/
3542 0, /*tp_getattr*/
3543 0, /*tp_setattr*/
3544 0, /*tp_compare*/
3545 0, /*tp_repr*/
3546 0, /*tp_as_number*/
3547 0, /*tp_as_sequence*/
3548 0, /*tp_as_mapping*/
3549 0, /*tp_hash*/
3550 0, /*tp_call*/
3551 0, /*tp_str*/
3552 0, /*tp_getattro*/
3553 0, /*tp_setattro*/
3554 0, /*tp_as_buffer*/
3555 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3556 0, /*tp_doc*/
3557 0, /*tp_traverse*/
3558 0, /*tp_clear*/
3559 0, /*tp_richcompare*/
3560 0, /*tp_weaklistoffset*/
3561 0, /*tp_iter*/
3562 0, /*tp_iternext*/
3563 encoding_map_methods, /*tp_methods*/
3564 0, /*tp_members*/
3565 0, /*tp_getset*/
3566 0, /*tp_base*/
3567 0, /*tp_dict*/
3568 0, /*tp_descr_get*/
3569 0, /*tp_descr_set*/
3570 0, /*tp_dictoffset*/
3571 0, /*tp_init*/
3572 0, /*tp_alloc*/
3573 0, /*tp_new*/
3574 0, /*tp_free*/
3575 0, /*tp_is_gc*/
3576};
3577
3578PyObject*
3579PyUnicode_BuildEncodingMap(PyObject* string)
3580{
3581 Py_UNICODE *decode;
3582 PyObject *result;
3583 struct encoding_map *mresult;
3584 int i;
3585 int need_dict = 0;
3586 unsigned char level1[32];
3587 unsigned char level2[512];
3588 unsigned char *mlevel1, *mlevel2, *mlevel3;
3589 int count2 = 0, count3 = 0;
3590
3591 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3592 PyErr_BadArgument();
3593 return NULL;
3594 }
3595 decode = PyUnicode_AS_UNICODE(string);
3596 memset(level1, 0xFF, sizeof level1);
3597 memset(level2, 0xFF, sizeof level2);
3598
3599 /* If there isn't a one-to-one mapping of NULL to \0,
3600 or if there are non-BMP characters, we need to use
3601 a mapping dictionary. */
3602 if (decode[0] != 0)
3603 need_dict = 1;
3604 for (i = 1; i < 256; i++) {
3605 int l1, l2;
3606 if (decode[i] == 0
3607 #ifdef Py_UNICODE_WIDE
3608 || decode[i] > 0xFFFF
3609 #endif
3610 ) {
3611 need_dict = 1;
3612 break;
3613 }
3614 if (decode[i] == 0xFFFE)
3615 /* unmapped character */
3616 continue;
3617 l1 = decode[i] >> 11;
3618 l2 = decode[i] >> 7;
3619 if (level1[l1] == 0xFF)
3620 level1[l1] = count2++;
3621 if (level2[l2] == 0xFF)
3622 level2[l2] = count3++;
3623 }
3624
3625 if (count2 >= 0xFF || count3 >= 0xFF)
3626 need_dict = 1;
3627
3628 if (need_dict) {
3629 PyObject *result = PyDict_New();
3630 PyObject *key, *value;
3631 if (!result)
3632 return NULL;
3633 for (i = 0; i < 256; i++) {
3634 key = value = NULL;
3635 key = PyInt_FromLong(decode[i]);
3636 value = PyInt_FromLong(i);
3637 if (!key || !value)
3638 goto failed1;
3639 if (PyDict_SetItem(result, key, value) == -1)
3640 goto failed1;
3641 Py_DECREF(key);
3642 Py_DECREF(value);
3643 }
3644 return result;
3645 failed1:
3646 Py_XDECREF(key);
3647 Py_XDECREF(value);
3648 Py_DECREF(result);
3649 return NULL;
3650 }
3651
3652 /* Create a three-level trie */
3653 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3654 16*count2 + 128*count3 - 1);
3655 if (!result)
3656 return PyErr_NoMemory();
3657 PyObject_Init(result, &EncodingMapType);
3658 mresult = (struct encoding_map*)result;
3659 mresult->count2 = count2;
3660 mresult->count3 = count3;
3661 mlevel1 = mresult->level1;
3662 mlevel2 = mresult->level23;
3663 mlevel3 = mresult->level23 + 16*count2;
3664 memcpy(mlevel1, level1, 32);
3665 memset(mlevel2, 0xFF, 16*count2);
3666 memset(mlevel3, 0, 128*count3);
3667 count3 = 0;
3668 for (i = 1; i < 256; i++) {
3669 int o1, o2, o3, i2, i3;
3670 if (decode[i] == 0xFFFE)
3671 /* unmapped character */
3672 continue;
3673 o1 = decode[i]>>11;
3674 o2 = (decode[i]>>7) & 0xF;
3675 i2 = 16*mlevel1[o1] + o2;
3676 if (mlevel2[i2] == 0xFF)
3677 mlevel2[i2] = count3++;
3678 o3 = decode[i] & 0x7F;
3679 i3 = 128*mlevel2[i2] + o3;
3680 mlevel3[i3] = i;
3681 }
3682 return result;
3683}
3684
3685static int
3686encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3687{
3688 struct encoding_map *map = (struct encoding_map*)mapping;
3689 int l1 = c>>11;
3690 int l2 = (c>>7) & 0xF;
3691 int l3 = c & 0x7F;
3692 int i;
3693
3694#ifdef Py_UNICODE_WIDE
3695 if (c > 0xFFFF) {
3696 return -1;
3697 }
3698#endif
3699 if (c == 0)
3700 return 0;
3701 /* level 1*/
3702 i = map->level1[l1];
3703 if (i == 0xFF) {
3704 return -1;
3705 }
3706 /* level 2*/
3707 i = map->level23[16*i+l2];
3708 if (i == 0xFF) {
3709 return -1;
3710 }
3711 /* level 3 */
3712 i = map->level23[16*map->count2 + 128*i + l3];
3713 if (i == 0) {
3714 return -1;
3715 }
3716 return i;
3717}
3718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719/* Lookup the character ch in the mapping. If the character
3720 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003721 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 PyObject *w = PyInt_FromLong((long)c);
3725 PyObject *x;
3726
3727 if (w == NULL)
3728 return NULL;
3729 x = PyObject_GetItem(mapping, w);
3730 Py_DECREF(w);
3731 if (x == NULL) {
3732 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3733 /* No mapping found means: mapping is undefined. */
3734 PyErr_Clear();
3735 x = Py_None;
3736 Py_INCREF(x);
3737 return x;
3738 } else
3739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003741 else if (x == Py_None)
3742 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 else if (PyInt_Check(x)) {
3744 long value = PyInt_AS_LONG(x);
3745 if (value < 0 || value > 255) {
3746 PyErr_SetString(PyExc_TypeError,
3747 "character mapping must be in range(256)");
3748 Py_DECREF(x);
3749 return NULL;
3750 }
3751 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 else if (PyString_Check(x))
3754 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003757 PyErr_Format(PyExc_TypeError,
3758 "character mapping must return integer, None or str8, not %.400s",
3759 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 Py_DECREF(x);
3761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 }
3763}
3764
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003765static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003766charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003767{
Walter Dörwald827b0552007-05-12 13:23:53 +00003768 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003769 /* exponentially overallocate to minimize reallocations */
3770 if (requiredsize < 2*outsize)
3771 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003772 if (PyBytes_Resize(outobj, requiredsize)) {
3773 Py_DECREF(outobj);
3774 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003775 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003776 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003777}
3778
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize step failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003783 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 space is available. Return a new reference to the object that
3785 was put in the output buffer, or Py_None, if the mapping was undefined
3786 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003787 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003789charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003790 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003792 PyObject *rep;
3793 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003794 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003796 if (mapping->ob_type == &EncodingMapType) {
3797 int res = encoding_map_lookup(c, mapping);
3798 Py_ssize_t requiredsize = *outpos+1;
3799 if (res == -1)
3800 return enc_FAILED;
3801 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003802 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003803 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003804 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003805 outstart[(*outpos)++] = (char)res;
3806 return enc_SUCCESS;
3807 }
3808
3809 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003811 return enc_EXCEPTION;
3812 else if (rep==Py_None) {
3813 Py_DECREF(rep);
3814 return enc_FAILED;
3815 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003818 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003819 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003821 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003823 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3825 }
3826 else {
3827 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3829 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003830 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003831 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003833 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003835 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 memcpy(outstart + *outpos, repchars, repsize);
3837 *outpos += repsize;
3838 }
3839 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003840 Py_DECREF(rep);
3841 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842}
3843
3844/* handle an error in PyUnicode_EncodeCharmap
3845 Return 0 on success, -1 on error */
3846static
3847int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003850 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003851 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852{
3853 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854 Py_ssize_t repsize;
3855 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 Py_UNICODE *uni2;
3857 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003858 Py_ssize_t collstartpos = *inpos;
3859 Py_ssize_t collendpos = *inpos+1;
3860 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 char *encoding = "charmap";
3862 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003863 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 /* find all unencodable characters */
3866 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003867 PyObject *rep;
3868 if (mapping->ob_type == &EncodingMapType) {
3869 int res = encoding_map_lookup(p[collendpos], mapping);
3870 if (res != -1)
3871 break;
3872 ++collendpos;
3873 continue;
3874 }
3875
3876 rep = charmapencode_lookup(p[collendpos], mapping);
3877 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003879 else if (rep!=Py_None) {
3880 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 break;
3882 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003883 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 ++collendpos;
3885 }
3886 /* cache callback name lookup
3887 * (if not done yet, i.e. it's the first error) */
3888 if (*known_errorHandler==-1) {
3889 if ((errors==NULL) || (!strcmp(errors, "strict")))
3890 *known_errorHandler = 1;
3891 else if (!strcmp(errors, "replace"))
3892 *known_errorHandler = 2;
3893 else if (!strcmp(errors, "ignore"))
3894 *known_errorHandler = 3;
3895 else if (!strcmp(errors, "xmlcharrefreplace"))
3896 *known_errorHandler = 4;
3897 else
3898 *known_errorHandler = 0;
3899 }
3900 switch (*known_errorHandler) {
3901 case 1: /* strict */
3902 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3903 return -1;
3904 case 2: /* replace */
3905 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3906 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003907 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 return -1;
3909 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003910 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3912 return -1;
3913 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 }
3915 /* fall through */
3916 case 3: /* ignore */
3917 *inpos = collendpos;
3918 break;
3919 case 4: /* xmlcharrefreplace */
3920 /* generate replacement (temporarily (mis)uses p) */
3921 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3922 char buffer[2+29+1+1];
3923 char *cp;
3924 sprintf(buffer, "&#%d;", (int)p[collpos]);
3925 for (cp = buffer; *cp; ++cp) {
3926 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003927 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003929 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3931 return -1;
3932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 }
3934 }
3935 *inpos = collendpos;
3936 break;
3937 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003938 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 encoding, reason, p, size, exceptionObject,
3940 collstartpos, collendpos, &newpos);
3941 if (repunicode == NULL)
3942 return -1;
3943 /* generate replacement */
3944 repsize = PyUnicode_GET_SIZE(repunicode);
3945 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3946 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003947 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 return -1;
3949 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003950 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3953 return -1;
3954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 }
3956 *inpos = newpos;
3957 Py_DECREF(repunicode);
3958 }
3959 return 0;
3960}
3961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 PyObject *mapping,
3965 const char *errors)
3966{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 /* output object */
3968 PyObject *res = NULL;
3969 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003970 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003972 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 PyObject *errorHandler = NULL;
3974 PyObject *exc = NULL;
3975 /* the following variable is used for caching string comparisons
3976 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3977 * 3=ignore, 4=xmlcharrefreplace */
3978 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979
3980 /* Default to Latin-1 */
3981 if (mapping == NULL)
3982 return PyUnicode_EncodeLatin1(p, size, errors);
3983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 /* allocate enough for a simple encoding without
3985 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00003986 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 if (res == NULL)
3988 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003989 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 while (inpos<size) {
3993 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00003994 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003995 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 if (charmap_encoding_error(p, size, &inpos, mapping,
3999 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004000 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004001 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004002 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 else
4006 /* done with this character => adjust input position */
4007 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004011 if (respos<PyBytes_GET_SIZE(res)) {
4012 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 goto onError;
4014 }
4015 Py_XDECREF(exc);
4016 Py_XDECREF(errorHandler);
4017 return res;
4018
4019 onError:
4020 Py_XDECREF(res);
4021 Py_XDECREF(exc);
4022 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 return NULL;
4024}
4025
4026PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4027 PyObject *mapping)
4028{
4029 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4030 PyErr_BadArgument();
4031 return NULL;
4032 }
4033 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4034 PyUnicode_GET_SIZE(unicode),
4035 mapping,
4036 NULL);
4037}
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039/* create or adjust a UnicodeTranslateError */
4040static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004041 const Py_UNICODE *unicode, Py_ssize_t size,
4042 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 if (*exceptionObject == NULL) {
4046 *exceptionObject = PyUnicodeTranslateError_Create(
4047 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 }
4049 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4051 goto onError;
4052 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4053 goto onError;
4054 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4055 goto onError;
4056 return;
4057 onError:
4058 Py_DECREF(*exceptionObject);
4059 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 }
4061}
4062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063/* raises a UnicodeTranslateError */
4064static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 const Py_UNICODE *unicode, Py_ssize_t size,
4066 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 const char *reason)
4068{
4069 make_translate_exception(exceptionObject,
4070 unicode, size, startpos, endpos, reason);
4071 if (*exceptionObject != NULL)
4072 PyCodec_StrictErrors(*exceptionObject);
4073}
4074
4075/* error handling callback helper:
4076 build arguments, call the callback and check the arguments,
4077 put the result into newpos and return the replacement string, which
4078 has to be freed by the caller */
4079static PyObject *unicode_translate_call_errorhandler(const char *errors,
4080 PyObject **errorHandler,
4081 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004082 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4083 Py_ssize_t startpos, Py_ssize_t endpos,
4084 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004086 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004088 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 PyObject *restuple;
4090 PyObject *resunicode;
4091
4092 if (*errorHandler == NULL) {
4093 *errorHandler = PyCodec_LookupError(errors);
4094 if (*errorHandler == NULL)
4095 return NULL;
4096 }
4097
4098 make_translate_exception(exceptionObject,
4099 unicode, size, startpos, endpos, reason);
4100 if (*exceptionObject == NULL)
4101 return NULL;
4102
4103 restuple = PyObject_CallFunctionObjArgs(
4104 *errorHandler, *exceptionObject, NULL);
4105 if (restuple == NULL)
4106 return NULL;
4107 if (!PyTuple_Check(restuple)) {
4108 PyErr_Format(PyExc_TypeError, &argparse[4]);
4109 Py_DECREF(restuple);
4110 return NULL;
4111 }
4112 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004113 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 Py_DECREF(restuple);
4115 return NULL;
4116 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004117 if (i_newpos<0)
4118 *newpos = size+i_newpos;
4119 else
4120 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004121 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004122 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004123 Py_DECREF(restuple);
4124 return NULL;
4125 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 Py_INCREF(resunicode);
4127 Py_DECREF(restuple);
4128 return resunicode;
4129}
4130
4131/* Lookup the character ch in the mapping and put the result in result,
4132 which must be decrefed by the caller.
4133 Return 0 on success, -1 on error */
4134static
4135int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4136{
4137 PyObject *w = PyInt_FromLong((long)c);
4138 PyObject *x;
4139
4140 if (w == NULL)
4141 return -1;
4142 x = PyObject_GetItem(mapping, w);
4143 Py_DECREF(w);
4144 if (x == NULL) {
4145 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4146 /* No mapping found means: use 1:1 mapping. */
4147 PyErr_Clear();
4148 *result = NULL;
4149 return 0;
4150 } else
4151 return -1;
4152 }
4153 else if (x == Py_None) {
4154 *result = x;
4155 return 0;
4156 }
4157 else if (PyInt_Check(x)) {
4158 long value = PyInt_AS_LONG(x);
4159 long max = PyUnicode_GetMax();
4160 if (value < 0 || value > max) {
4161 PyErr_Format(PyExc_TypeError,
4162 "character mapping must be in range(0x%lx)", max+1);
4163 Py_DECREF(x);
4164 return -1;
4165 }
4166 *result = x;
4167 return 0;
4168 }
4169 else if (PyUnicode_Check(x)) {
4170 *result = x;
4171 return 0;
4172 }
4173 else {
4174 /* wrong return value */
4175 PyErr_SetString(PyExc_TypeError,
4176 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004177 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 return -1;
4179 }
4180}
4181/* ensure that *outobj is at least requiredsize characters long,
4182if not reallocate and adjust various state variables.
4183Return 0 on success, -1 on error */
4184static
Walter Dörwald4894c302003-10-24 14:25:28 +00004185int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004189 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004191 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004193 if (requiredsize < 2 * oldsize)
4194 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004195 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 return -1;
4197 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 }
4199 return 0;
4200}
4201/* lookup the character, put the result in the output string and adjust
4202 various state variables. Return a new reference to the object that
4203 was put in the output buffer in *result, or Py_None, if the mapping was
4204 undefined (in which case no character was written).
4205 The called must decref result.
4206 Return 0 on success, -1 on error. */
4207static
Walter Dörwald4894c302003-10-24 14:25:28 +00004208int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004209 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004210 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211{
Walter Dörwald4894c302003-10-24 14:25:28 +00004212 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 return -1;
4214 if (*res==NULL) {
4215 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004216 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 }
4218 else if (*res==Py_None)
4219 ;
4220 else if (PyInt_Check(*res)) {
4221 /* no overflow check, because we know that the space is enough */
4222 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4223 }
4224 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004225 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 if (repsize==1) {
4227 /* no overflow check, because we know that the space is enough */
4228 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4229 }
4230 else if (repsize!=0) {
4231 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004233 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004234 repsize - 1;
4235 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 return -1;
4237 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4238 *outp += repsize;
4239 }
4240 }
4241 else
4242 return -1;
4243 return 0;
4244}
4245
4246PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004247 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248 PyObject *mapping,
4249 const char *errors)
4250{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 /* output object */
4252 PyObject *res = NULL;
4253 /* pointers to the beginning and end+1 of input */
4254 const Py_UNICODE *startp = p;
4255 const Py_UNICODE *endp = p + size;
4256 /* pointer into the output */
4257 Py_UNICODE *str;
4258 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004259 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 char *reason = "character maps to <undefined>";
4261 PyObject *errorHandler = NULL;
4262 PyObject *exc = NULL;
4263 /* the following variable is used for caching string comparisons
4264 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4265 * 3=ignore, 4=xmlcharrefreplace */
4266 int known_errorHandler = -1;
4267
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 if (mapping == NULL) {
4269 PyErr_BadArgument();
4270 return NULL;
4271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272
4273 /* allocate enough for a simple 1:1 translation without
4274 replacements, if we need more, we'll resize */
4275 res = PyUnicode_FromUnicode(NULL, size);
4276 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 return res;
4280 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 while (p<endp) {
4283 /* try to encode it */
4284 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004285 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 goto onError;
4288 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004289 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 if (x!=Py_None) /* it worked => adjust input pointer */
4291 ++p;
4292 else { /* untranslatable character */
4293 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004294 Py_ssize_t repsize;
4295 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_UNICODE *uni2;
4297 /* startpos for collecting untranslatable chars */
4298 const Py_UNICODE *collstart = p;
4299 const Py_UNICODE *collend = p+1;
4300 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 /* find all untranslatable characters */
4303 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004304 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 goto onError;
4306 Py_XDECREF(x);
4307 if (x!=Py_None)
4308 break;
4309 ++collend;
4310 }
4311 /* cache callback name lookup
4312 * (if not done yet, i.e. it's the first error) */
4313 if (known_errorHandler==-1) {
4314 if ((errors==NULL) || (!strcmp(errors, "strict")))
4315 known_errorHandler = 1;
4316 else if (!strcmp(errors, "replace"))
4317 known_errorHandler = 2;
4318 else if (!strcmp(errors, "ignore"))
4319 known_errorHandler = 3;
4320 else if (!strcmp(errors, "xmlcharrefreplace"))
4321 known_errorHandler = 4;
4322 else
4323 known_errorHandler = 0;
4324 }
4325 switch (known_errorHandler) {
4326 case 1: /* strict */
4327 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4328 goto onError;
4329 case 2: /* replace */
4330 /* No need to check for space, this is a 1:1 replacement */
4331 for (coll = collstart; coll<collend; ++coll)
4332 *str++ = '?';
4333 /* fall through */
4334 case 3: /* ignore */
4335 p = collend;
4336 break;
4337 case 4: /* xmlcharrefreplace */
4338 /* generate replacement (temporarily (mis)uses p) */
4339 for (p = collstart; p < collend; ++p) {
4340 char buffer[2+29+1+1];
4341 char *cp;
4342 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004343 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4345 goto onError;
4346 for (cp = buffer; *cp; ++cp)
4347 *str++ = *cp;
4348 }
4349 p = collend;
4350 break;
4351 default:
4352 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4353 reason, startp, size, &exc,
4354 collstart-startp, collend-startp, &newpos);
4355 if (repunicode == NULL)
4356 goto onError;
4357 /* generate replacement */
4358 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004359 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4361 Py_DECREF(repunicode);
4362 goto onError;
4363 }
4364 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4365 *str++ = *uni2;
4366 p = startp + newpos;
4367 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 }
4369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 /* Resize if we allocated to much */
4372 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004373 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004374 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 }
4377 Py_XDECREF(exc);
4378 Py_XDECREF(errorHandler);
4379 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 onError:
4382 Py_XDECREF(res);
4383 Py_XDECREF(exc);
4384 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 return NULL;
4386}
4387
4388PyObject *PyUnicode_Translate(PyObject *str,
4389 PyObject *mapping,
4390 const char *errors)
4391{
4392 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004393
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 str = PyUnicode_FromObject(str);
4395 if (str == NULL)
4396 goto onError;
4397 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4398 PyUnicode_GET_SIZE(str),
4399 mapping,
4400 errors);
4401 Py_DECREF(str);
4402 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004403
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 onError:
4405 Py_XDECREF(str);
4406 return NULL;
4407}
Tim Petersced69f82003-09-16 20:30:58 +00004408
Guido van Rossum9e896b32000-04-05 20:11:21 +00004409/* --- Decimal Encoder ---------------------------------------------------- */
4410
4411int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004413 char *output,
4414 const char *errors)
4415{
4416 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 PyObject *errorHandler = NULL;
4418 PyObject *exc = NULL;
4419 const char *encoding = "decimal";
4420 const char *reason = "invalid decimal Unicode string";
4421 /* the following variable is used for caching string comparisons
4422 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4423 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004424
4425 if (output == NULL) {
4426 PyErr_BadArgument();
4427 return -1;
4428 }
4429
4430 p = s;
4431 end = s + length;
4432 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004434 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t repsize;
4437 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 Py_UNICODE *uni2;
4439 Py_UNICODE *collstart;
4440 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004441
Guido van Rossum9e896b32000-04-05 20:11:21 +00004442 if (Py_UNICODE_ISSPACE(ch)) {
4443 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004445 continue;
4446 }
4447 decimal = Py_UNICODE_TODECIMAL(ch);
4448 if (decimal >= 0) {
4449 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004451 continue;
4452 }
Guido van Rossumba477042000-04-06 18:18:10 +00004453 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004454 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004456 continue;
4457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 /* All other characters are considered unencodable */
4459 collstart = p;
4460 collend = p+1;
4461 while (collend < end) {
4462 if ((0 < *collend && *collend < 256) ||
4463 !Py_UNICODE_ISSPACE(*collend) ||
4464 Py_UNICODE_TODECIMAL(*collend))
4465 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004466 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 /* cache callback name lookup
4468 * (if not done yet, i.e. it's the first error) */
4469 if (known_errorHandler==-1) {
4470 if ((errors==NULL) || (!strcmp(errors, "strict")))
4471 known_errorHandler = 1;
4472 else if (!strcmp(errors, "replace"))
4473 known_errorHandler = 2;
4474 else if (!strcmp(errors, "ignore"))
4475 known_errorHandler = 3;
4476 else if (!strcmp(errors, "xmlcharrefreplace"))
4477 known_errorHandler = 4;
4478 else
4479 known_errorHandler = 0;
4480 }
4481 switch (known_errorHandler) {
4482 case 1: /* strict */
4483 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4484 goto onError;
4485 case 2: /* replace */
4486 for (p = collstart; p < collend; ++p)
4487 *output++ = '?';
4488 /* fall through */
4489 case 3: /* ignore */
4490 p = collend;
4491 break;
4492 case 4: /* xmlcharrefreplace */
4493 /* generate replacement (temporarily (mis)uses p) */
4494 for (p = collstart; p < collend; ++p)
4495 output += sprintf(output, "&#%d;", (int)*p);
4496 p = collend;
4497 break;
4498 default:
4499 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4500 encoding, reason, s, length, &exc,
4501 collstart-s, collend-s, &newpos);
4502 if (repunicode == NULL)
4503 goto onError;
4504 /* generate replacement */
4505 repsize = PyUnicode_GET_SIZE(repunicode);
4506 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4507 Py_UNICODE ch = *uni2;
4508 if (Py_UNICODE_ISSPACE(ch))
4509 *output++ = ' ';
4510 else {
4511 decimal = Py_UNICODE_TODECIMAL(ch);
4512 if (decimal >= 0)
4513 *output++ = '0' + decimal;
4514 else if (0 < ch && ch < 256)
4515 *output++ = (char)ch;
4516 else {
4517 Py_DECREF(repunicode);
4518 raise_encode_exception(&exc, encoding,
4519 s, length, collstart-s, collend-s, reason);
4520 goto onError;
4521 }
4522 }
4523 }
4524 p = s + newpos;
4525 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004526 }
4527 }
4528 /* 0-terminate the output string */
4529 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_XDECREF(exc);
4531 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004532 return 0;
4533
4534 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 Py_XDECREF(exc);
4536 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004537 return -1;
4538}
4539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540/* --- Helpers ------------------------------------------------------------ */
4541
Thomas Wouters477c8d52006-05-27 19:21:47 +00004542#define STRINGLIB_CHAR Py_UNICODE
4543
4544#define STRINGLIB_LEN PyUnicode_GET_SIZE
4545#define STRINGLIB_NEW PyUnicode_FromUnicode
4546#define STRINGLIB_STR PyUnicode_AS_UNICODE
4547
4548Py_LOCAL_INLINE(int)
4549STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004551 if (str[0] != other[0])
4552 return 1;
4553 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554}
4555
Thomas Wouters477c8d52006-05-27 19:21:47 +00004556#define STRINGLIB_EMPTY unicode_empty
4557
4558#include "stringlib/fastsearch.h"
4559
4560#include "stringlib/count.h"
4561#include "stringlib/find.h"
4562#include "stringlib/partition.h"
4563
/* Normalize the slice bounds held in the caller's `start` and `end`
   variables against (obj)->length: a negative bound is offset by the
   length and, if still negative, set to 0; `end` is capped at the
   length before that.  Expands to plain statements, so use it only as
   a statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4576
Martin v. Löwis18e16552006-02-15 17:27:45 +00004577Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004578 PyObject *substr,
4579 Py_ssize_t start,
4580 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004582 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004583 PyUnicodeObject* str_obj;
4584 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004585
Thomas Wouters477c8d52006-05-27 19:21:47 +00004586 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4587 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004589 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4590 if (!sub_obj) {
4591 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 return -1;
4593 }
Tim Petersced69f82003-09-16 20:30:58 +00004594
Thomas Wouters477c8d52006-05-27 19:21:47 +00004595 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004596
Thomas Wouters477c8d52006-05-27 19:21:47 +00004597 result = stringlib_count(
4598 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4599 );
4600
4601 Py_DECREF(sub_obj);
4602 Py_DECREF(str_obj);
4603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 return result;
4605}
4606
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004608 PyObject *sub,
4609 Py_ssize_t start,
4610 Py_ssize_t end,
4611 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004613 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004614
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004616 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004617 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004618 sub = PyUnicode_FromObject(sub);
4619 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004620 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004621 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 }
Tim Petersced69f82003-09-16 20:30:58 +00004623
Thomas Wouters477c8d52006-05-27 19:21:47 +00004624 if (direction > 0)
4625 result = stringlib_find_slice(
4626 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4627 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4628 start, end
4629 );
4630 else
4631 result = stringlib_rfind_slice(
4632 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4633 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4634 start, end
4635 );
4636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004638 Py_DECREF(sub);
4639
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 return result;
4641}
4642
Tim Petersced69f82003-09-16 20:30:58 +00004643static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644int tailmatch(PyUnicodeObject *self,
4645 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 Py_ssize_t start,
4647 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 int direction)
4649{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 if (substring->length == 0)
4651 return 1;
4652
Thomas Wouters477c8d52006-05-27 19:21:47 +00004653 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654
4655 end -= substring->length;
4656 if (end < start)
4657 return 0;
4658
4659 if (direction > 0) {
4660 if (Py_UNICODE_MATCH(self, end, substring))
4661 return 1;
4662 } else {
4663 if (Py_UNICODE_MATCH(self, start, substring))
4664 return 1;
4665 }
4666
4667 return 0;
4668}
4669
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004672 Py_ssize_t start,
4673 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 int direction)
4675{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004677
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 str = PyUnicode_FromObject(str);
4679 if (str == NULL)
4680 return -1;
4681 substr = PyUnicode_FromObject(substr);
4682 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004683 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 return -1;
4685 }
Tim Petersced69f82003-09-16 20:30:58 +00004686
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 result = tailmatch((PyUnicodeObject *)str,
4688 (PyUnicodeObject *)substr,
4689 start, end, direction);
4690 Py_DECREF(str);
4691 Py_DECREF(substr);
4692 return result;
4693}
4694
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695/* Apply fixfct filter to the Unicode object self and return a
4696 reference to the modified object */
4697
Tim Petersced69f82003-09-16 20:30:58 +00004698static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699PyObject *fixup(PyUnicodeObject *self,
4700 int (*fixfct)(PyUnicodeObject *s))
4701{
4702
4703 PyUnicodeObject *u;
4704
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004705 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 if (u == NULL)
4707 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004708
4709 Py_UNICODE_COPY(u->str, self->str, self->length);
4710
Tim Peters7a29bd52001-09-12 03:03:31 +00004711 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 /* fixfct should return TRUE if it modified the buffer. If
4713 FALSE, return a reference to the original buffer instead
4714 (to save space, not time) */
4715 Py_INCREF(self);
4716 Py_DECREF(u);
4717 return (PyObject*) self;
4718 }
4719 return (PyObject*) u;
4720}
4721
Tim Petersced69f82003-09-16 20:30:58 +00004722static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723int fixupper(PyUnicodeObject *self)
4724{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004725 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 Py_UNICODE *s = self->str;
4727 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 while (len-- > 0) {
4730 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004731
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 ch = Py_UNICODE_TOUPPER(*s);
4733 if (ch != *s) {
4734 status = 1;
4735 *s = ch;
4736 }
4737 s++;
4738 }
4739
4740 return status;
4741}
4742
Tim Petersced69f82003-09-16 20:30:58 +00004743static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744int fixlower(PyUnicodeObject *self)
4745{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 Py_UNICODE *s = self->str;
4748 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 while (len-- > 0) {
4751 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004752
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 ch = Py_UNICODE_TOLOWER(*s);
4754 if (ch != *s) {
4755 status = 1;
4756 *s = ch;
4757 }
4758 s++;
4759 }
4760
4761 return status;
4762}
4763
Tim Petersced69f82003-09-16 20:30:58 +00004764static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765int fixswapcase(PyUnicodeObject *self)
4766{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 Py_UNICODE *s = self->str;
4769 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004770
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 while (len-- > 0) {
4772 if (Py_UNICODE_ISUPPER(*s)) {
4773 *s = Py_UNICODE_TOLOWER(*s);
4774 status = 1;
4775 } else if (Py_UNICODE_ISLOWER(*s)) {
4776 *s = Py_UNICODE_TOUPPER(*s);
4777 status = 1;
4778 }
4779 s++;
4780 }
4781
4782 return status;
4783}
4784
Tim Petersced69f82003-09-16 20:30:58 +00004785static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786int fixcapitalize(PyUnicodeObject *self)
4787{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004789 Py_UNICODE *s = self->str;
4790 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004791
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004792 if (len == 0)
4793 return 0;
4794 if (Py_UNICODE_ISLOWER(*s)) {
4795 *s = Py_UNICODE_TOUPPER(*s);
4796 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004798 s++;
4799 while (--len > 0) {
4800 if (Py_UNICODE_ISUPPER(*s)) {
4801 *s = Py_UNICODE_TOLOWER(*s);
4802 status = 1;
4803 }
4804 s++;
4805 }
4806 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807}
4808
4809static
4810int fixtitle(PyUnicodeObject *self)
4811{
4812 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4813 register Py_UNICODE *e;
4814 int previous_is_cased;
4815
4816 /* Shortcut for single character strings */
4817 if (PyUnicode_GET_SIZE(self) == 1) {
4818 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4819 if (*p != ch) {
4820 *p = ch;
4821 return 1;
4822 }
4823 else
4824 return 0;
4825 }
Tim Petersced69f82003-09-16 20:30:58 +00004826
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 e = p + PyUnicode_GET_SIZE(self);
4828 previous_is_cased = 0;
4829 for (; p < e; p++) {
4830 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 if (previous_is_cased)
4833 *p = Py_UNICODE_TOLOWER(ch);
4834 else
4835 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004836
4837 if (Py_UNICODE_ISLOWER(ch) ||
4838 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 Py_UNICODE_ISTITLE(ch))
4840 previous_is_cased = 1;
4841 else
4842 previous_is_cased = 0;
4843 }
4844 return 1;
4845}
4846
Tim Peters8ce9f162004-08-27 01:49:32 +00004847PyObject *
4848PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
Tim Peters8ce9f162004-08-27 01:49:32 +00004850 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004851 const Py_UNICODE blank = ' ';
4852 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004853 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004854 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004855 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4856 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004857 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4858 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004859 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004860 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004861 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
Tim Peters05eba1f2004-08-27 21:32:02 +00004863 fseq = PySequence_Fast(seq, "");
4864 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004865 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004866 }
4867
Tim Peters91879ab2004-08-27 22:35:44 +00004868 /* Grrrr. A codec may be invoked to convert str objects to
4869 * Unicode, and so it's possible to call back into Python code
4870 * during PyUnicode_FromObject(), and so it's possible for a sick
4871 * codec to change the size of fseq (if seq is a list). Therefore
4872 * we have to keep refetching the size -- can't assume seqlen
4873 * is invariant.
4874 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004875 seqlen = PySequence_Fast_GET_SIZE(fseq);
4876 /* If empty sequence, return u"". */
4877 if (seqlen == 0) {
4878 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4879 goto Done;
4880 }
4881 /* If singleton sequence with an exact Unicode, return that. */
4882 if (seqlen == 1) {
4883 item = PySequence_Fast_GET_ITEM(fseq, 0);
4884 if (PyUnicode_CheckExact(item)) {
4885 Py_INCREF(item);
4886 res = (PyUnicodeObject *)item;
4887 goto Done;
4888 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004889 }
4890
Tim Peters05eba1f2004-08-27 21:32:02 +00004891 /* At least two items to join, or one that isn't exact Unicode. */
4892 if (seqlen > 1) {
4893 /* Set up sep and seplen -- they're needed. */
4894 if (separator == NULL) {
4895 sep = &blank;
4896 seplen = 1;
4897 }
4898 else {
4899 internal_separator = PyUnicode_FromObject(separator);
4900 if (internal_separator == NULL)
4901 goto onError;
4902 sep = PyUnicode_AS_UNICODE(internal_separator);
4903 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004904 /* In case PyUnicode_FromObject() mutated seq. */
4905 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004906 }
4907 }
4908
4909 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004910 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004911 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004912 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004913 res_p = PyUnicode_AS_UNICODE(res);
4914 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004915
Tim Peters05eba1f2004-08-27 21:32:02 +00004916 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004917 Py_ssize_t itemlen;
4918 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004919
4920 item = PySequence_Fast_GET_ITEM(fseq, i);
4921 /* Convert item to Unicode. */
4922 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4923 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004924 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004925 " %.80s found",
4926 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004927 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004928 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004929 item = PyUnicode_FromObject(item);
4930 if (item == NULL)
4931 goto onError;
4932 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004933
Tim Peters91879ab2004-08-27 22:35:44 +00004934 /* In case PyUnicode_FromObject() mutated seq. */
4935 seqlen = PySequence_Fast_GET_SIZE(fseq);
4936
Tim Peters8ce9f162004-08-27 01:49:32 +00004937 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004939 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004940 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004941 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004942 if (i < seqlen - 1) {
4943 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004944 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004945 goto Overflow;
4946 }
4947 if (new_res_used > res_alloc) {
4948 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004949 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004950 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004951 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004952 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004953 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004954 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004955 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004957 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004958 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004960
4961 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004962 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004963 res_p += itemlen;
4964 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004965 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004966 res_p += seplen;
4967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004969 res_used = new_res_used;
4970 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004971
Tim Peters05eba1f2004-08-27 21:32:02 +00004972 /* Shrink res to match the used area; this probably can't fail,
4973 * but it's cheap to check.
4974 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004975 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004976 goto onError;
4977
4978 Done:
4979 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004980 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 return (PyObject *)res;
4982
Tim Peters8ce9f162004-08-27 01:49:32 +00004983 Overflow:
4984 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004986 Py_DECREF(item);
4987 /* fall through */
4988
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004990 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004991 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004992 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 return NULL;
4994}
4995
Tim Petersced69f82003-09-16 20:30:58 +00004996static
4997PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004998 Py_ssize_t left,
4999 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 Py_UNICODE fill)
5001{
5002 PyUnicodeObject *u;
5003
5004 if (left < 0)
5005 left = 0;
5006 if (right < 0)
5007 right = 0;
5008
Tim Peters7a29bd52001-09-12 03:03:31 +00005009 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 Py_INCREF(self);
5011 return self;
5012 }
5013
5014 u = _PyUnicode_New(left + self->length + right);
5015 if (u) {
5016 if (left)
5017 Py_UNICODE_FILL(u->str, fill, left);
5018 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5019 if (right)
5020 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5021 }
5022
5023 return u;
5024}
5025
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label, which is jumped
   to when either the slice creation or the append fails.  Wrapped in
   do { } while (0) so the expansion is a single statement and must be
   followed by a semicolon at the point of use. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5036
5037static
5038PyObject *split_whitespace(PyUnicodeObject *self,
5039 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005040 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 register Py_ssize_t i;
5043 register Py_ssize_t j;
5044 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 PyObject *str;
5046
5047 for (i = j = 0; i < len; ) {
5048 /* find a token */
5049 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5050 i++;
5051 j = i;
5052 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5053 i++;
5054 if (j < i) {
5055 if (maxcount-- <= 0)
5056 break;
5057 SPLIT_APPEND(self->str, j, i);
5058 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5059 i++;
5060 j = i;
5061 }
5062 }
5063 if (j < len) {
5064 SPLIT_APPEND(self->str, j, len);
5065 }
5066 return list;
5067
5068 onError:
5069 Py_DECREF(list);
5070 return NULL;
5071}
5072
5073PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005074 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005076 register Py_ssize_t i;
5077 register Py_ssize_t j;
5078 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 PyObject *list;
5080 PyObject *str;
5081 Py_UNICODE *data;
5082
5083 string = PyUnicode_FromObject(string);
5084 if (string == NULL)
5085 return NULL;
5086 data = PyUnicode_AS_UNICODE(string);
5087 len = PyUnicode_GET_SIZE(string);
5088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 list = PyList_New(0);
5090 if (!list)
5091 goto onError;
5092
5093 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005095
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005097 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099
5100 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005101 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (i < len) {
5103 if (data[i] == '\r' && i + 1 < len &&
5104 data[i+1] == '\n')
5105 i += 2;
5106 else
5107 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005108 if (keepends)
5109 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 }
Guido van Rossum86662912000-04-11 15:38:46 +00005111 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 j = i;
5113 }
5114 if (j < len) {
5115 SPLIT_APPEND(data, j, len);
5116 }
5117
5118 Py_DECREF(string);
5119 return list;
5120
5121 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005122 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 Py_DECREF(string);
5124 return NULL;
5125}
5126
Tim Petersced69f82003-09-16 20:30:58 +00005127static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128PyObject *split_char(PyUnicodeObject *self,
5129 PyObject *list,
5130 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005131 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005133 register Py_ssize_t i;
5134 register Py_ssize_t j;
5135 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 PyObject *str;
5137
5138 for (i = j = 0; i < len; ) {
5139 if (self->str[i] == ch) {
5140 if (maxcount-- <= 0)
5141 break;
5142 SPLIT_APPEND(self->str, j, i);
5143 i = j = i + 1;
5144 } else
5145 i++;
5146 }
5147 if (j <= len) {
5148 SPLIT_APPEND(self->str, j, len);
5149 }
5150 return list;
5151
5152 onError:
5153 Py_DECREF(list);
5154 return NULL;
5155}
5156
Tim Petersced69f82003-09-16 20:30:58 +00005157static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158PyObject *split_substring(PyUnicodeObject *self,
5159 PyObject *list,
5160 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005161 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005163 register Py_ssize_t i;
5164 register Py_ssize_t j;
5165 Py_ssize_t len = self->length;
5166 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 PyObject *str;
5168
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005169 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 if (Py_UNICODE_MATCH(self, i, substring)) {
5171 if (maxcount-- <= 0)
5172 break;
5173 SPLIT_APPEND(self->str, j, i);
5174 i = j = i + sublen;
5175 } else
5176 i++;
5177 }
5178 if (j <= len) {
5179 SPLIT_APPEND(self->str, j, len);
5180 }
5181 return list;
5182
5183 onError:
5184 Py_DECREF(list);
5185 return NULL;
5186}
5187
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005188static
5189PyObject *rsplit_whitespace(PyUnicodeObject *self,
5190 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005191 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 register Py_ssize_t i;
5194 register Py_ssize_t j;
5195 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005196 PyObject *str;
5197
5198 for (i = j = len - 1; i >= 0; ) {
5199 /* find a token */
5200 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5201 i--;
5202 j = i;
5203 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5204 i--;
5205 if (j > i) {
5206 if (maxcount-- <= 0)
5207 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005208 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005209 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5210 i--;
5211 j = i;
5212 }
5213 }
5214 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005215 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005216 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005217 if (PyList_Reverse(list) < 0)
5218 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005219 return list;
5220
5221 onError:
5222 Py_DECREF(list);
5223 return NULL;
5224}
5225
5226static
5227PyObject *rsplit_char(PyUnicodeObject *self,
5228 PyObject *list,
5229 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 register Py_ssize_t i;
5233 register Py_ssize_t j;
5234 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005235 PyObject *str;
5236
5237 for (i = j = len - 1; i >= 0; ) {
5238 if (self->str[i] == ch) {
5239 if (maxcount-- <= 0)
5240 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005241 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005242 j = i = i - 1;
5243 } else
5244 i--;
5245 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005246 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005247 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005248 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005249 if (PyList_Reverse(list) < 0)
5250 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005251 return list;
5252
5253 onError:
5254 Py_DECREF(list);
5255 return NULL;
5256}
5257
5258static
5259PyObject *rsplit_substring(PyUnicodeObject *self,
5260 PyObject *list,
5261 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005262 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 register Py_ssize_t i;
5265 register Py_ssize_t j;
5266 Py_ssize_t len = self->length;
5267 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005268 PyObject *str;
5269
5270 for (i = len - sublen, j = len; i >= 0; ) {
5271 if (Py_UNICODE_MATCH(self, i, substring)) {
5272 if (maxcount-- <= 0)
5273 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005274 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005275 j = i;
5276 i -= sublen;
5277 } else
5278 i--;
5279 }
5280 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005281 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005282 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005283 if (PyList_Reverse(list) < 0)
5284 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005285 return list;
5286
5287 onError:
5288 Py_DECREF(list);
5289 return NULL;
5290}
5291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292#undef SPLIT_APPEND
5293
5294static
5295PyObject *split(PyUnicodeObject *self,
5296 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298{
5299 PyObject *list;
5300
5301 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005302 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303
5304 list = PyList_New(0);
5305 if (!list)
5306 return NULL;
5307
5308 if (substring == NULL)
5309 return split_whitespace(self,list,maxcount);
5310
5311 else if (substring->length == 1)
5312 return split_char(self,list,substring->str[0],maxcount);
5313
5314 else if (substring->length == 0) {
5315 Py_DECREF(list);
5316 PyErr_SetString(PyExc_ValueError, "empty separator");
5317 return NULL;
5318 }
5319 else
5320 return split_substring(self,list,substring,maxcount);
5321}
5322
Tim Petersced69f82003-09-16 20:30:58 +00005323static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005324PyObject *rsplit(PyUnicodeObject *self,
5325 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005327{
5328 PyObject *list;
5329
5330 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005331 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005332
5333 list = PyList_New(0);
5334 if (!list)
5335 return NULL;
5336
5337 if (substring == NULL)
5338 return rsplit_whitespace(self,list,maxcount);
5339
5340 else if (substring->length == 1)
5341 return rsplit_char(self,list,substring->str[0],maxcount);
5342
5343 else if (substring->length == 0) {
5344 Py_DECREF(list);
5345 PyErr_SetString(PyExc_ValueError, "empty separator");
5346 return NULL;
5347 }
5348 else
5349 return rsplit_substring(self,list,substring,maxcount);
5350}
5351
5352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353PyObject *replace(PyUnicodeObject *self,
5354 PyUnicodeObject *str1,
5355 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357{
5358 PyUnicodeObject *u;
5359
5360 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005361 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
Thomas Wouters477c8d52006-05-27 19:21:47 +00005363 if (str1->length == str2->length) {
5364 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005365 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005366 if (str1->length == 1) {
5367 /* replace characters */
5368 Py_UNICODE u1, u2;
5369 if (!findchar(self->str, self->length, str1->str[0]))
5370 goto nothing;
5371 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5372 if (!u)
5373 return NULL;
5374 Py_UNICODE_COPY(u->str, self->str, self->length);
5375 u1 = str1->str[0];
5376 u2 = str2->str[0];
5377 for (i = 0; i < u->length; i++)
5378 if (u->str[i] == u1) {
5379 if (--maxcount < 0)
5380 break;
5381 u->str[i] = u2;
5382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005384 i = fastsearch(
5385 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005387 if (i < 0)
5388 goto nothing;
5389 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5390 if (!u)
5391 return NULL;
5392 Py_UNICODE_COPY(u->str, self->str, self->length);
5393 while (i <= self->length - str1->length)
5394 if (Py_UNICODE_MATCH(self, i, str1)) {
5395 if (--maxcount < 0)
5396 break;
5397 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5398 i += str1->length;
5399 } else
5400 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005403
5404 Py_ssize_t n, i, j, e;
5405 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 Py_UNICODE *p;
5407
5408 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 if (n > maxcount)
5411 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005412 if (n == 0)
5413 goto nothing;
5414 /* new_size = self->length + n * (str2->length - str1->length)); */
5415 delta = (str2->length - str1->length);
5416 if (delta == 0) {
5417 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 product = n * (str2->length - str1->length);
5420 if ((product / (str2->length - str1->length)) != n) {
5421 PyErr_SetString(PyExc_OverflowError,
5422 "replace string is too long");
5423 return NULL;
5424 }
5425 new_size = self->length + product;
5426 if (new_size < 0) {
5427 PyErr_SetString(PyExc_OverflowError,
5428 "replace string is too long");
5429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 }
5431 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005432 u = _PyUnicode_New(new_size);
5433 if (!u)
5434 return NULL;
5435 i = 0;
5436 p = u->str;
5437 e = self->length - str1->length;
5438 if (str1->length > 0) {
5439 while (n-- > 0) {
5440 /* look for next match */
5441 j = i;
5442 while (j <= e) {
5443 if (Py_UNICODE_MATCH(self, j, str1))
5444 break;
5445 j++;
5446 }
5447 if (j > i) {
5448 if (j > e)
5449 break;
5450 /* copy unchanged part [i:j] */
5451 Py_UNICODE_COPY(p, self->str+i, j-i);
5452 p += j - i;
5453 }
5454 /* copy substitution string */
5455 if (str2->length > 0) {
5456 Py_UNICODE_COPY(p, str2->str, str2->length);
5457 p += str2->length;
5458 }
5459 i = j + str1->length;
5460 }
5461 if (i < self->length)
5462 /* copy tail [i:] */
5463 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5464 } else {
5465 /* interleave */
5466 while (n > 0) {
5467 Py_UNICODE_COPY(p, str2->str, str2->length);
5468 p += str2->length;
5469 if (--n <= 0)
5470 break;
5471 *p++ = self->str[i++];
5472 }
5473 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005477
5478nothing:
5479 /* nothing to replace; return original string (when possible) */
5480 if (PyUnicode_CheckExact(self)) {
5481 Py_INCREF(self);
5482 return (PyObject *) self;
5483 }
5484 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485}
5486
5487/* --- Unicode Object Methods --------------------------------------------- */
5488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005489PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490"S.title() -> unicode\n\
5491\n\
5492Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005493characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
5495static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005496unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 return fixup(self, fixtitle);
5499}
5500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005501PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502"S.capitalize() -> unicode\n\
5503\n\
5504Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005505have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506
5507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005508unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 return fixup(self, fixcapitalize);
5511}
5512
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, runs fixcapitalize over every word and
   joins the words with PyUnicode_Join(NULL, ...).  Returns NULL if the
   split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5550
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005551/* Argument converter. Coerces to a single unicode character */
5552
5553static int
5554convert_uc(PyObject *obj, void *addr)
5555{
5556 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5557 PyObject *uniobj;
5558 Py_UNICODE *unistr;
5559
5560 uniobj = PyUnicode_FromObject(obj);
5561 if (uniobj == NULL) {
5562 PyErr_SetString(PyExc_TypeError,
5563 "The fill character cannot be converted to Unicode");
5564 return 0;
5565 }
5566 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5567 PyErr_SetString(PyExc_TypeError,
5568 "The fill character must be exactly one character long");
5569 Py_DECREF(uniobj);
5570 return 0;
5571 }
5572 unistr = PyUnicode_AS_UNICODE(uniobj);
5573 *fillcharloc = unistr[0];
5574 Py_DECREF(uniobj);
5575 return 1;
5576}
5577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005578PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005579"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005581Return S centered in a Unicode string of length width. Padding is\n\
5582done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
5584static PyObject *
5585unicode_center(PyUnicodeObject *self, PyObject *args)
5586{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 Py_ssize_t marg, left;
5588 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005589 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590
Thomas Woutersde017742006-02-16 19:34:37 +00005591 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 return NULL;
5593
Tim Peters7a29bd52001-09-12 03:03:31 +00005594 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 Py_INCREF(self);
5596 return (PyObject*) self;
5597 }
5598
5599 marg = width - self->length;
5600 left = marg / 2 + (marg & width & 1);
5601
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005602 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603}
5604
Marc-André Lemburge5034372000-08-08 08:04:29 +00005605#if 0
5606
5607/* This code should go into some future Unicode collation support
5608 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005609 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005610
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005611/* speedy UTF-16 code point order comparison */
5612/* gleaned from: */
5613/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5614
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005615static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005616{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005617 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005618 0, 0, 0, 0, 0, 0, 0, 0,
5619 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005620 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005621};
5622
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623static int
5624unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005626 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 Py_UNICODE *s1 = str1->str;
5629 Py_UNICODE *s2 = str2->str;
5630
5631 len1 = str1->length;
5632 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005633
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005635 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005636
5637 c1 = *s1++;
5638 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005639
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005640 if (c1 > (1<<11) * 26)
5641 c1 += utf16Fixup[c1>>11];
5642 if (c2 > (1<<11) * 26)
5643 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005644 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005645
5646 if (c1 != c2)
5647 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005648
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005649 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 }
5651
5652 return (len1 < len2) ? -1 : (len1 != len2);
5653}
5654
Marc-André Lemburge5034372000-08-08 08:04:29 +00005655#else
5656
5657static int
5658unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005661
5662 Py_UNICODE *s1 = str1->str;
5663 Py_UNICODE *s2 = str2->str;
5664
5665 len1 = str1->length;
5666 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005667
Marc-André Lemburge5034372000-08-08 08:04:29 +00005668 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005669 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005670
Fredrik Lundh45714e92001-06-26 16:39:36 +00005671 c1 = *s1++;
5672 c2 = *s2++;
5673
5674 if (c1 != c2)
5675 return (c1 < c2) ? -1 : 1;
5676
Marc-André Lemburge5034372000-08-08 08:04:29 +00005677 len1--; len2--;
5678 }
5679
5680 return (len1 < len2) ? -1 : (len1 != len2);
5681}
5682
5683#endif
5684
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685int PyUnicode_Compare(PyObject *left,
5686 PyObject *right)
5687{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005688 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5689 return unicode_compare((PyUnicodeObject *)left,
5690 (PyUnicodeObject *)right);
5691 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5692 (PyUnicode_Check(left) && PyString_Check(right))) {
5693 if (PyUnicode_Check(left))
5694 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5695 if (PyUnicode_Check(right))
5696 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5697 assert(PyString_Check(left));
5698 assert(PyString_Check(right));
5699 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005701 PyErr_Format(PyExc_TypeError,
5702 "Can't compare %.100s and %.100s",
5703 left->ob_type->tp_name,
5704 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return -1;
5706}
5707
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005708PyObject *PyUnicode_RichCompare(PyObject *left,
5709 PyObject *right,
5710 int op)
5711{
5712 int result;
5713
5714 result = PyUnicode_Compare(left, right);
5715 if (result == -1 && PyErr_Occurred())
5716 goto onError;
5717
5718 /* Convert the return value to a Boolean */
5719 switch (op) {
5720 case Py_EQ:
5721 result = (result == 0);
5722 break;
5723 case Py_NE:
5724 result = (result != 0);
5725 break;
5726 case Py_LE:
5727 result = (result <= 0);
5728 break;
5729 case Py_GE:
5730 result = (result >= 0);
5731 break;
5732 case Py_LT:
5733 result = (result == -1);
5734 break;
5735 case Py_GT:
5736 result = (result == 1);
5737 break;
5738 }
5739 return PyBool_FromLong(result);
5740
5741 onError:
5742
5743 /* Standard case
5744
5745 Type errors mean that PyUnicode_FromObject() could not convert
5746 one of the arguments (usually the right hand side) to Unicode,
5747 ie. we can't handle the comparison request. However, it is
5748 possible that the other object knows a comparison method, which
5749 is why we return Py_NotImplemented to give the other object a
5750 chance.
5751
5752 */
5753 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5754 PyErr_Clear();
5755 Py_INCREF(Py_NotImplemented);
5756 return Py_NotImplemented;
5757 }
5758 if (op != Py_EQ && op != Py_NE)
5759 return NULL;
5760
5761 /* Equality comparison.
5762
5763 This is a special case: we silence any PyExc_UnicodeDecodeError
5764 and instead turn it into a PyErr_UnicodeWarning.
5765
5766 */
5767 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5768 return NULL;
5769 PyErr_Clear();
5770 if (PyErr_Warn(PyExc_UnicodeWarning,
5771 (op == Py_EQ) ?
5772 "Unicode equal comparison "
5773 "failed to convert both arguments to Unicode - "
5774 "interpreting them as being unequal" :
5775 "Unicode unequal comparison "
5776 "failed to convert both arguments to Unicode - "
5777 "interpreting them as being unequal"
5778 ) < 0)
5779 return NULL;
5780 result = (op == Py_NE);
5781 return PyBool_FromLong(result);
5782}
5783
Guido van Rossum403d68b2000-03-13 15:55:09 +00005784int PyUnicode_Contains(PyObject *container,
5785 PyObject *element)
5786{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005787 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005788 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005789
5790 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005791 sub = PyUnicode_FromObject(element);
5792 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005793 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005794 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005795 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005796 }
5797
Thomas Wouters477c8d52006-05-27 19:21:47 +00005798 str = PyUnicode_FromObject(container);
5799 if (!str) {
5800 Py_DECREF(sub);
5801 return -1;
5802 }
5803
5804 result = stringlib_contains_obj(str, sub);
5805
5806 Py_DECREF(str);
5807 Py_DECREF(sub);
5808
Guido van Rossum403d68b2000-03-13 15:55:09 +00005809 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005810}
5811
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812/* Concat to string or Unicode object giving a new Unicode object. */
5813
5814PyObject *PyUnicode_Concat(PyObject *left,
5815 PyObject *right)
5816{
5817 PyUnicodeObject *u = NULL, *v = NULL, *w;
5818
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005819 if (PyBytes_Check(left) || PyBytes_Check(right))
5820 return PyBytes_Concat(left, right);
5821
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 /* Coerce the two arguments */
5823 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5824 if (u == NULL)
5825 goto onError;
5826 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5827 if (v == NULL)
5828 goto onError;
5829
5830 /* Shortcuts */
5831 if (v == unicode_empty) {
5832 Py_DECREF(v);
5833 return (PyObject *)u;
5834 }
5835 if (u == unicode_empty) {
5836 Py_DECREF(u);
5837 return (PyObject *)v;
5838 }
5839
5840 /* Concat the two Unicode strings */
5841 w = _PyUnicode_New(u->length + v->length);
5842 if (w == NULL)
5843 goto onError;
5844 Py_UNICODE_COPY(w->str, u->str, u->length);
5845 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5846
5847 Py_DECREF(u);
5848 Py_DECREF(v);
5849 return (PyObject *)w;
5850
5851onError:
5852 Py_XDECREF(u);
5853 Py_XDECREF(v);
5854 return NULL;
5855}
5856
Walter Dörwald1ab83302007-05-18 17:15:44 +00005857void
5858PyUnicode_Append(PyObject **pleft, PyObject *right)
5859{
5860 PyObject *new;
5861 if (*pleft == NULL)
5862 return;
5863 if (right == NULL || !PyUnicode_Check(*pleft)) {
5864 Py_DECREF(*pleft);
5865 *pleft = NULL;
5866 return;
5867 }
5868 new = PyUnicode_Concat(*pleft, right);
5869 Py_DECREF(*pleft);
5870 *pleft = new;
5871}
5872
5873void
5874PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
5875{
5876 PyUnicode_Append(pleft, right);
5877 Py_XDECREF(right);
5878}
5879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005880PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881"S.count(sub[, start[, end]]) -> int\n\
5882\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005883Return the number of non-overlapping occurrences of substring sub in\n\
5884Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005885interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
5887static PyObject *
5888unicode_count(PyUnicodeObject *self, PyObject *args)
5889{
5890 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005891 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005892 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 PyObject *result;
5894
Guido van Rossumb8872e62000-05-09 14:14:27 +00005895 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5896 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
5898
5899 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005900 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 if (substring == NULL)
5902 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005903
Thomas Wouters477c8d52006-05-27 19:21:47 +00005904 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Thomas Wouters477c8d52006-05-27 19:21:47 +00005906 result = PyInt_FromSsize_t(
5907 stringlib_count(self->str + start, end - start,
5908 substring->str, substring->length)
5909 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
5911 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005912
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return result;
5914}
5915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005916PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005917"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005919Encodes S using the codec registered for encoding. encoding defaults\n\
5920to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005921handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5923'xmlcharrefreplace' as well as any other name registered with\n\
5924codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
5926static PyObject *
5927unicode_encode(PyUnicodeObject *self, PyObject *args)
5928{
5929 char *encoding = NULL;
5930 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005931 PyObject *v;
5932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5934 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005935 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005936 if (v == NULL)
5937 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005938 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005939 if (PyString_Check(v)) {
5940 /* Old codec, turn it into bytes */
5941 PyObject *b = PyBytes_FromObject(v);
5942 Py_DECREF(v);
5943 return b;
5944 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005945 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005946 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005947 "(type=%.400s)",
5948 v->ob_type->tp_name);
5949 Py_DECREF(v);
5950 return NULL;
5951 }
5952 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005953
5954 onError:
5955 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005956}
5957
5958PyDoc_STRVAR(decode__doc__,
5959"S.decode([encoding[,errors]]) -> string or unicode\n\
5960\n\
5961Decodes S using the codec registered for encoding. encoding defaults\n\
5962to the default encoding. errors may be given to set a different error\n\
5963handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5964a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5965as well as any other name registerd with codecs.register_error that is\n\
5966able to handle UnicodeDecodeErrors.");
5967
5968static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005969unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005970{
5971 char *encoding = NULL;
5972 char *errors = NULL;
5973 PyObject *v;
5974
5975 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5976 return NULL;
5977 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005978 if (v == NULL)
5979 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005980 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5981 PyErr_Format(PyExc_TypeError,
5982 "decoder did not return a string/unicode object "
5983 "(type=%.400s)",
5984 v->ob_type->tp_name);
5985 Py_DECREF(v);
5986 return NULL;
5987 }
5988 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005989
5990 onError:
5991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992}
5993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005994PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995"S.expandtabs([tabsize]) -> unicode\n\
5996\n\
5997Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005998If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
6000static PyObject*
6001unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6002{
6003 Py_UNICODE *e;
6004 Py_UNICODE *p;
6005 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006006 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 PyUnicodeObject *u;
6008 int tabsize = 8;
6009
6010 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6011 return NULL;
6012
Thomas Wouters7e474022000-07-16 12:04:32 +00006013 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 i = j = 0;
6015 e = self->str + self->length;
6016 for (p = self->str; p < e; p++)
6017 if (*p == '\t') {
6018 if (tabsize > 0)
6019 j += tabsize - (j % tabsize);
6020 }
6021 else {
6022 j++;
6023 if (*p == '\n' || *p == '\r') {
6024 i += j;
6025 j = 0;
6026 }
6027 }
6028
6029 /* Second pass: create output string and fill it */
6030 u = _PyUnicode_New(i + j);
6031 if (!u)
6032 return NULL;
6033
6034 j = 0;
6035 q = u->str;
6036
6037 for (p = self->str; p < e; p++)
6038 if (*p == '\t') {
6039 if (tabsize > 0) {
6040 i = tabsize - (j % tabsize);
6041 j += i;
6042 while (i--)
6043 *q++ = ' ';
6044 }
6045 }
6046 else {
6047 j++;
6048 *q++ = *p;
6049 if (*p == '\n' || *p == '\r')
6050 j = 0;
6051 }
6052
6053 return (PyObject*) u;
6054}
6055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006056PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057"S.find(sub [,start [,end]]) -> int\n\
6058\n\
6059Return the lowest index in S where substring sub is found,\n\
6060such that sub is contained within s[start,end]. Optional\n\
6061arguments start and end are interpreted as in slice notation.\n\
6062\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
6065static PyObject *
6066unicode_find(PyUnicodeObject *self, PyObject *args)
6067{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006068 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006069 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006070 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006071 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
Guido van Rossumb8872e62000-05-09 14:14:27 +00006073 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6074 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006076 substring = PyUnicode_FromObject(substring);
6077 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 return NULL;
6079
Thomas Wouters477c8d52006-05-27 19:21:47 +00006080 result = stringlib_find_slice(
6081 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6082 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6083 start, end
6084 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
6086 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006087
6088 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089}
6090
6091static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006092unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093{
6094 if (index < 0 || index >= self->length) {
6095 PyErr_SetString(PyExc_IndexError, "string index out of range");
6096 return NULL;
6097 }
6098
6099 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6100}
6101
6102static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006103unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006105 /* Since Unicode objects compare equal to their UTF-8 string
6106 counterparts, we hash the UTF-8 string. */
6107 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6108 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109}
6110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006111PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112"S.index(sub [,start [,end]]) -> int\n\
6113\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006114Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
6116static PyObject *
6117unicode_index(PyUnicodeObject *self, PyObject *args)
6118{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006119 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006120 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006122 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Guido van Rossumb8872e62000-05-09 14:14:27 +00006124 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 substring = PyUnicode_FromObject(substring);
6128 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return NULL;
6130
Thomas Wouters477c8d52006-05-27 19:21:47 +00006131 result = stringlib_find_slice(
6132 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6133 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6134 start, end
6135 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
6137 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006138
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 if (result < 0) {
6140 PyErr_SetString(PyExc_ValueError, "substring not found");
6141 return NULL;
6142 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006143
Martin v. Löwis18e16552006-02-15 17:27:45 +00006144 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145}
6146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006147PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006150Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
6153static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006154unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
6156 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6157 register const Py_UNICODE *e;
6158 int cased;
6159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 /* Shortcut for single character strings */
6161 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006162 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006164 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006165 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006167
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 e = p + PyUnicode_GET_SIZE(self);
6169 cased = 0;
6170 for (; p < e; p++) {
6171 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006172
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006174 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 else if (!cased && Py_UNICODE_ISLOWER(ch))
6176 cased = 1;
6177 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006178 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179}
6180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006181PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006182"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006184Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006185at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
6187static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006188unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
6190 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6191 register const Py_UNICODE *e;
6192 int cased;
6193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 /* Shortcut for single character strings */
6195 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006196 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006198 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006199 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006200 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006201
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 e = p + PyUnicode_GET_SIZE(self);
6203 cased = 0;
6204 for (; p < e; p++) {
6205 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006208 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 else if (!cased && Py_UNICODE_ISUPPER(ch))
6210 cased = 1;
6211 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006212 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006215PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006216"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006218Return True if S is a titlecased string and there is at least one\n\
6219character in S, i.e. upper- and titlecase characters may only\n\
6220follow uncased characters and lowercase characters only cased ones.\n\
6221Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222
6223static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006224unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225{
6226 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6227 register const Py_UNICODE *e;
6228 int cased, previous_is_cased;
6229
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 /* Shortcut for single character strings */
6231 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006232 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6233 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006235 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006236 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006237 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006238
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 e = p + PyUnicode_GET_SIZE(self);
6240 cased = 0;
6241 previous_is_cased = 0;
6242 for (; p < e; p++) {
6243 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006244
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6246 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006247 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 previous_is_cased = 1;
6249 cased = 1;
6250 }
6251 else if (Py_UNICODE_ISLOWER(ch)) {
6252 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006253 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 previous_is_cased = 1;
6255 cased = 1;
6256 }
6257 else
6258 previous_is_cased = 0;
6259 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006260 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261}
6262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006263PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006264"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006266Return True if all characters in S are whitespace\n\
6267and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
6269static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006270unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
6272 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6273 register const Py_UNICODE *e;
6274
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 /* Shortcut for single character strings */
6276 if (PyUnicode_GET_SIZE(self) == 1 &&
6277 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006278 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006280 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006281 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006282 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006283
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 e = p + PyUnicode_GET_SIZE(self);
6285 for (; p < e; p++) {
6286 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006287 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006289 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290}
6291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006292PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006293"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006294\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006295Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006296and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006297
6298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006299unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006300{
6301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6302 register const Py_UNICODE *e;
6303
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006304 /* Shortcut for single character strings */
6305 if (PyUnicode_GET_SIZE(self) == 1 &&
6306 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006307 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006308
6309 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006310 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006311 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006312
6313 e = p + PyUnicode_GET_SIZE(self);
6314 for (; p < e; p++) {
6315 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006316 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006317 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006318 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006319}
6320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006321PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006322"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006323\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006324Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006325and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006326
6327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006328unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006329{
6330 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6331 register const Py_UNICODE *e;
6332
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006333 /* Shortcut for single character strings */
6334 if (PyUnicode_GET_SIZE(self) == 1 &&
6335 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006336 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006337
6338 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006339 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006340 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006341
6342 e = p + PyUnicode_GET_SIZE(self);
6343 for (; p < e; p++) {
6344 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006345 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006346 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006347 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006348}
6349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006350PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006351"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006353Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006354False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
6356static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006357unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
6359 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6360 register const Py_UNICODE *e;
6361
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 /* Shortcut for single character strings */
6363 if (PyUnicode_GET_SIZE(self) == 1 &&
6364 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006365 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006367 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006368 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006369 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 e = p + PyUnicode_GET_SIZE(self);
6372 for (; p < e; p++) {
6373 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006374 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006376 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377}
6378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006379PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006380"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006382Return True if all characters in S are digits\n\
6383and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
6385static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006386unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387{
6388 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6389 register const Py_UNICODE *e;
6390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 /* Shortcut for single character strings */
6392 if (PyUnicode_GET_SIZE(self) == 1 &&
6393 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006396 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006397 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006398 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006399
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 e = p + PyUnicode_GET_SIZE(self);
6401 for (; p < e; p++) {
6402 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006403 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006405 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006408PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006409"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006412False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
6414static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006415unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
6417 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6418 register const Py_UNICODE *e;
6419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 /* Shortcut for single character strings */
6421 if (PyUnicode_GET_SIZE(self) == 1 &&
6422 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006425 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006426 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 e = p + PyUnicode_GET_SIZE(self);
6430 for (; p < e; p++) {
6431 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006432 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006434 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435}
6436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006437PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438"S.join(sequence) -> unicode\n\
6439\n\
6440Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006441sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
6443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006444unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006446 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447}
6448
Martin v. Löwis18e16552006-02-15 17:27:45 +00006449static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450unicode_length(PyUnicodeObject *self)
6451{
6452 return self->length;
6453}
6454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006455PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006456"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457\n\
6458Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006459done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
6461static PyObject *
6462unicode_ljust(PyUnicodeObject *self, PyObject *args)
6463{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006464 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006465 Py_UNICODE fillchar = ' ';
6466
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006467 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 return NULL;
6469
Tim Peters7a29bd52001-09-12 03:03:31 +00006470 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 Py_INCREF(self);
6472 return (PyObject*) self;
6473 }
6474
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006475 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476}
6477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006478PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479"S.lower() -> unicode\n\
6480\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
6483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006484unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 return fixup(self, fixlower);
6487}
6488
/* Strip modes; the values double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
6497
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006498/* externally visible for str.strip(unicode) */
6499PyObject *
6500_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6501{
6502 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006504 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6506 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006507
Thomas Wouters477c8d52006-05-27 19:21:47 +00006508 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6509
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006510 i = 0;
6511 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6513 i++;
6514 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006515 }
6516
6517 j = len;
6518 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 do {
6520 j--;
6521 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6522 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006523 }
6524
6525 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 Py_INCREF(self);
6527 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006528 }
6529 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006531}
6532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
6534static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006535do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006537 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006538 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006539
6540 i = 0;
6541 if (striptype != RIGHTSTRIP) {
6542 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6543 i++;
6544 }
6545 }
6546
6547 j = len;
6548 if (striptype != LEFTSTRIP) {
6549 do {
6550 j--;
6551 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6552 j++;
6553 }
6554
6555 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6556 Py_INCREF(self);
6557 return (PyObject*)self;
6558 }
6559 else
6560 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006563
6564static PyObject *
6565do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6566{
6567 PyObject *sep = NULL;
6568
6569 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6570 return NULL;
6571
6572 if (sep != NULL && sep != Py_None) {
6573 if (PyUnicode_Check(sep))
6574 return _PyUnicode_XStrip(self, striptype, sep);
6575 else if (PyString_Check(sep)) {
6576 PyObject *res;
6577 sep = PyUnicode_FromObject(sep);
6578 if (sep==NULL)
6579 return NULL;
6580 res = _PyUnicode_XStrip(self, striptype, sep);
6581 Py_DECREF(sep);
6582 return res;
6583 }
6584 else {
6585 PyErr_Format(PyExc_TypeError,
6586 "%s arg must be None, unicode or str",
6587 STRIPNAME(striptype));
6588 return NULL;
6589 }
6590 }
6591
6592 return do_strip(self, striptype);
6593}
6594
6595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006597"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006598\n\
6599Return a copy of the string S with leading and trailing\n\
6600whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006601If chars is given and not None, remove characters in chars instead.\n\
6602If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006603
6604static PyObject *
6605unicode_strip(PyUnicodeObject *self, PyObject *args)
6606{
6607 if (PyTuple_GET_SIZE(args) == 0)
6608 return do_strip(self, BOTHSTRIP); /* Common case */
6609 else
6610 return do_argstrip(self, BOTHSTRIP, args);
6611}
6612
6613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006614PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006615"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006616\n\
6617Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006618If chars is given and not None, remove characters in chars instead.\n\
6619If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006620
6621static PyObject *
6622unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6623{
6624 if (PyTuple_GET_SIZE(args) == 0)
6625 return do_strip(self, LEFTSTRIP); /* Common case */
6626 else
6627 return do_argstrip(self, LEFTSTRIP, args);
6628}
6629
6630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006631PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006632"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006633\n\
6634Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006635If chars is given and not None, remove characters in chars instead.\n\
6636If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006637
6638static PyObject *
6639unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6640{
6641 if (PyTuple_GET_SIZE(args) == 0)
6642 return do_strip(self, RIGHTSTRIP); /* Common case */
6643 else
6644 return do_argstrip(self, RIGHTSTRIP, args);
6645}
6646
6647
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006649unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650{
6651 PyUnicodeObject *u;
6652 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006653 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006654 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656 if (len < 0)
6657 len = 0;
6658
Tim Peters7a29bd52001-09-12 03:03:31 +00006659 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 /* no repeat, return original string */
6661 Py_INCREF(str);
6662 return (PyObject*) str;
6663 }
Tim Peters8f422462000-09-09 06:13:41 +00006664
6665 /* ensure # of chars needed doesn't overflow int and # of bytes
6666 * needed doesn't overflow size_t
6667 */
6668 nchars = len * str->length;
6669 if (len && nchars / len != str->length) {
6670 PyErr_SetString(PyExc_OverflowError,
6671 "repeated string is too long");
6672 return NULL;
6673 }
6674 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6675 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6676 PyErr_SetString(PyExc_OverflowError,
6677 "repeated string is too long");
6678 return NULL;
6679 }
6680 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 if (!u)
6682 return NULL;
6683
6684 p = u->str;
6685
Thomas Wouters477c8d52006-05-27 19:21:47 +00006686 if (str->length == 1 && len > 0) {
6687 Py_UNICODE_FILL(p, str->str[0], len);
6688 } else {
6689 Py_ssize_t done = 0; /* number of characters copied this far */
6690 if (done < nchars) {
6691 Py_UNICODE_COPY(p, str->str, str->length);
6692 done = str->length;
6693 }
6694 while (done < nchars) {
6695 int n = (done <= nchars-done) ? done : nchars-done;
6696 Py_UNICODE_COPY(p+done, p, n);
6697 done += n;
6698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 }
6700
6701 return (PyObject*) u;
6702}
6703
6704PyObject *PyUnicode_Replace(PyObject *obj,
6705 PyObject *subobj,
6706 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006707 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
6709 PyObject *self;
6710 PyObject *str1;
6711 PyObject *str2;
6712 PyObject *result;
6713
6714 self = PyUnicode_FromObject(obj);
6715 if (self == NULL)
6716 return NULL;
6717 str1 = PyUnicode_FromObject(subobj);
6718 if (str1 == NULL) {
6719 Py_DECREF(self);
6720 return NULL;
6721 }
6722 str2 = PyUnicode_FromObject(replobj);
6723 if (str2 == NULL) {
6724 Py_DECREF(self);
6725 Py_DECREF(str1);
6726 return NULL;
6727 }
Tim Petersced69f82003-09-16 20:30:58 +00006728 result = replace((PyUnicodeObject *)self,
6729 (PyUnicodeObject *)str1,
6730 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 maxcount);
6732 Py_DECREF(self);
6733 Py_DECREF(str1);
6734 Py_DECREF(str2);
6735 return result;
6736}
6737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006738PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739"S.replace (old, new[, maxsplit]) -> unicode\n\
6740\n\
6741Return a copy of S with all occurrences of substring\n\
6742old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006743given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744
6745static PyObject*
6746unicode_replace(PyUnicodeObject *self, PyObject *args)
6747{
6748 PyUnicodeObject *str1;
6749 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006750 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 PyObject *result;
6752
Martin v. Löwis18e16552006-02-15 17:27:45 +00006753 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 return NULL;
6755 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6756 if (str1 == NULL)
6757 return NULL;
6758 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006759 if (str2 == NULL) {
6760 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763
6764 result = replace(self, str1, str2, maxcount);
6765
6766 Py_DECREF(str1);
6767 Py_DECREF(str2);
6768 return result;
6769}
6770
6771static
6772PyObject *unicode_repr(PyObject *unicode)
6773{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006774 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006775 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006776 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6777 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6778
6779 /* XXX(nnorwitz): rather than over-allocating, it would be
6780 better to choose a different scheme. Perhaps scan the
6781 first N-chars of the string and allocate based on that size.
6782 */
6783 /* Initial allocation is based on the longest-possible unichr
6784 escape.
6785
6786 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6787 unichr, so in this case it's the longest unichr escape. In
6788 narrow (UTF-16) builds this is five chars per source unichr
6789 since there are two unichrs in the surrogate pair, so in narrow
6790 (UTF-16) builds it's not the longest unichr escape.
6791
6792 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6793 so in the narrow (UTF-16) build case it's the longest unichr
6794 escape.
6795 */
6796
Walter Dörwald1ab83302007-05-18 17:15:44 +00006797 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006798 2 /* quotes */
6799#ifdef Py_UNICODE_WIDE
6800 + 10*size
6801#else
6802 + 6*size
6803#endif
6804 + 1);
6805 if (repr == NULL)
6806 return NULL;
6807
Walter Dörwald1ab83302007-05-18 17:15:44 +00006808 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006809
6810 /* Add quote */
6811 *p++ = (findchar(s, size, '\'') &&
6812 !findchar(s, size, '"')) ? '"' : '\'';
6813 while (size-- > 0) {
6814 Py_UNICODE ch = *s++;
6815
6816 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006817 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00006818 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006819 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006820 continue;
6821 }
6822
6823#ifdef Py_UNICODE_WIDE
6824 /* Map 21-bit characters to '\U00xxxxxx' */
6825 else if (ch >= 0x10000) {
6826 *p++ = '\\';
6827 *p++ = 'U';
6828 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6829 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6830 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6831 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6832 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6833 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6834 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6835 *p++ = hexdigits[ch & 0x0000000F];
6836 continue;
6837 }
6838#else
6839 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6840 else if (ch >= 0xD800 && ch < 0xDC00) {
6841 Py_UNICODE ch2;
6842 Py_UCS4 ucs;
6843
6844 ch2 = *s++;
6845 size--;
6846 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6847 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6848 *p++ = '\\';
6849 *p++ = 'U';
6850 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6851 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6852 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6853 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6854 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6855 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6856 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6857 *p++ = hexdigits[ucs & 0x0000000F];
6858 continue;
6859 }
6860 /* Fall through: isolated surrogates are copied as-is */
6861 s--;
6862 size++;
6863 }
6864#endif
6865
6866 /* Map 16-bit characters to '\uxxxx' */
6867 if (ch >= 256) {
6868 *p++ = '\\';
6869 *p++ = 'u';
6870 *p++ = hexdigits[(ch >> 12) & 0x000F];
6871 *p++ = hexdigits[(ch >> 8) & 0x000F];
6872 *p++ = hexdigits[(ch >> 4) & 0x000F];
6873 *p++ = hexdigits[ch & 0x000F];
6874 }
6875
6876 /* Map special whitespace to '\t', \n', '\r' */
6877 else if (ch == '\t') {
6878 *p++ = '\\';
6879 *p++ = 't';
6880 }
6881 else if (ch == '\n') {
6882 *p++ = '\\';
6883 *p++ = 'n';
6884 }
6885 else if (ch == '\r') {
6886 *p++ = '\\';
6887 *p++ = 'r';
6888 }
6889
6890 /* Map non-printable US ASCII to '\xhh' */
6891 else if (ch < ' ' || ch >= 0x7F) {
6892 *p++ = '\\';
6893 *p++ = 'x';
6894 *p++ = hexdigits[(ch >> 4) & 0x000F];
6895 *p++ = hexdigits[ch & 0x000F];
6896 }
6897
6898 /* Copy everything else as-is */
6899 else
6900 *p++ = (char) ch;
6901 }
6902 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006903 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00006904
6905 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006906 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00006907 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911"S.rfind(sub [,start [,end]]) -> int\n\
6912\n\
6913Return the highest index in S where substring sub is found,\n\
6914such that sub is contained within s[start,end]. Optional\n\
6915arguments start and end are interpreted as in slice notation.\n\
6916\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006917Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
6919static PyObject *
6920unicode_rfind(PyUnicodeObject *self, PyObject *args)
6921{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006922 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006923 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006924 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006925 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
Guido van Rossumb8872e62000-05-09 14:14:27 +00006927 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6928 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006930 substring = PyUnicode_FromObject(substring);
6931 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 return NULL;
6933
Thomas Wouters477c8d52006-05-27 19:21:47 +00006934 result = stringlib_rfind_slice(
6935 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6936 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6937 start, end
6938 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939
6940 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006941
6942 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943}
6944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006945PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946"S.rindex(sub [,start [,end]]) -> int\n\
6947\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject *
6951unicode_rindex(PyUnicodeObject *self, PyObject *args)
6952{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006953 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006954 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006955 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006956 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
Guido van Rossumb8872e62000-05-09 14:14:27 +00006958 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6959 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006961 substring = PyUnicode_FromObject(substring);
6962 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 return NULL;
6964
Thomas Wouters477c8d52006-05-27 19:21:47 +00006965 result = stringlib_rfind_slice(
6966 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6967 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6968 start, end
6969 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
6971 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006972
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (result < 0) {
6974 PyErr_SetString(PyExc_ValueError, "substring not found");
6975 return NULL;
6976 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006977 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006981"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982\n\
6983Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006984done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
6986static PyObject *
6987unicode_rjust(PyUnicodeObject *self, PyObject *args)
6988{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006989 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006990 Py_UNICODE fillchar = ' ';
6991
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006992 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 return NULL;
6994
Tim Peters7a29bd52001-09-12 03:03:31 +00006995 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 Py_INCREF(self);
6997 return (PyObject*) self;
6998 }
6999
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007000 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001}
7002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007004unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005{
7006 /* standard clamping */
7007 if (start < 0)
7008 start = 0;
7009 if (end < 0)
7010 end = 0;
7011 if (end > self->length)
7012 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007013 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 /* full slice, return original string */
7015 Py_INCREF(self);
7016 return (PyObject*) self;
7017 }
7018 if (start > end)
7019 start = end;
7020 /* copy slice */
7021 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7022 end - start);
7023}
7024
7025PyObject *PyUnicode_Split(PyObject *s,
7026 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007027 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
7029 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007030
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 s = PyUnicode_FromObject(s);
7032 if (s == NULL)
7033 return NULL;
7034 if (sep != NULL) {
7035 sep = PyUnicode_FromObject(sep);
7036 if (sep == NULL) {
7037 Py_DECREF(s);
7038 return NULL;
7039 }
7040 }
7041
7042 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7043
7044 Py_DECREF(s);
7045 Py_XDECREF(sep);
7046 return result;
7047}
7048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007049PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050"S.split([sep [,maxsplit]]) -> list of strings\n\
7051\n\
7052Return a list of the words in S, using sep as the\n\
7053delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007054splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007055any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056
7057static PyObject*
7058unicode_split(PyUnicodeObject *self, PyObject *args)
7059{
7060 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007061 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062
Martin v. Löwis18e16552006-02-15 17:27:45 +00007063 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return NULL;
7065
7066 if (substring == Py_None)
7067 return split(self, NULL, maxcount);
7068 else if (PyUnicode_Check(substring))
7069 return split(self, (PyUnicodeObject *)substring, maxcount);
7070 else
7071 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7072}
7073
Thomas Wouters477c8d52006-05-27 19:21:47 +00007074PyObject *
7075PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7076{
7077 PyObject* str_obj;
7078 PyObject* sep_obj;
7079 PyObject* out;
7080
7081 str_obj = PyUnicode_FromObject(str_in);
7082 if (!str_obj)
7083 return NULL;
7084 sep_obj = PyUnicode_FromObject(sep_in);
7085 if (!sep_obj) {
7086 Py_DECREF(str_obj);
7087 return NULL;
7088 }
7089
7090 out = stringlib_partition(
7091 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7092 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7093 );
7094
7095 Py_DECREF(sep_obj);
7096 Py_DECREF(str_obj);
7097
7098 return out;
7099}
7100
7101
7102PyObject *
7103PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7104{
7105 PyObject* str_obj;
7106 PyObject* sep_obj;
7107 PyObject* out;
7108
7109 str_obj = PyUnicode_FromObject(str_in);
7110 if (!str_obj)
7111 return NULL;
7112 sep_obj = PyUnicode_FromObject(sep_in);
7113 if (!sep_obj) {
7114 Py_DECREF(str_obj);
7115 return NULL;
7116 }
7117
7118 out = stringlib_rpartition(
7119 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7120 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7121 );
7122
7123 Py_DECREF(sep_obj);
7124 Py_DECREF(str_obj);
7125
7126 return out;
7127}
7128
7129PyDoc_STRVAR(partition__doc__,
7130"S.partition(sep) -> (head, sep, tail)\n\
7131\n\
7132Searches for the separator sep in S, and returns the part before it,\n\
7133the separator itself, and the part after it. If the separator is not\n\
7134found, returns S and two empty strings.");
7135
7136static PyObject*
7137unicode_partition(PyUnicodeObject *self, PyObject *separator)
7138{
7139 return PyUnicode_Partition((PyObject *)self, separator);
7140}
7141
7142PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007143"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007144\n\
7145Searches for the separator sep in S, starting at the end of S, and returns\n\
7146the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007147separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007148
7149static PyObject*
7150unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7151{
7152 return PyUnicode_RPartition((PyObject *)self, separator);
7153}
7154
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007155PyObject *PyUnicode_RSplit(PyObject *s,
7156 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007157 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007158{
7159 PyObject *result;
7160
7161 s = PyUnicode_FromObject(s);
7162 if (s == NULL)
7163 return NULL;
7164 if (sep != NULL) {
7165 sep = PyUnicode_FromObject(sep);
7166 if (sep == NULL) {
7167 Py_DECREF(s);
7168 return NULL;
7169 }
7170 }
7171
7172 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7173
7174 Py_DECREF(s);
7175 Py_XDECREF(sep);
7176 return result;
7177}
7178
7179PyDoc_STRVAR(rsplit__doc__,
7180"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7181\n\
7182Return a list of the words in S, using sep as the\n\
7183delimiter string, starting at the end of the string and\n\
7184working to the front. If maxsplit is given, at most maxsplit\n\
7185splits are done. If sep is not specified, any whitespace string\n\
7186is a separator.");
7187
7188static PyObject*
7189unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7190{
7191 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007193
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007195 return NULL;
7196
7197 if (substring == Py_None)
7198 return rsplit(self, NULL, maxcount);
7199 else if (PyUnicode_Check(substring))
7200 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7201 else
7202 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7203}
7204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007205PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007206"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207\n\
7208Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007209Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212static PyObject*
7213unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7214{
Guido van Rossum86662912000-04-11 15:38:46 +00007215 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216
Guido van Rossum86662912000-04-11 15:38:46 +00007217 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 return NULL;
7219
Guido van Rossum86662912000-04-11 15:38:46 +00007220 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221}
7222
7223static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007224PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007226 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
7227 Py_XINCREF(res);
7228 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229}
7230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007231PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232"S.swapcase() -> unicode\n\
7233\n\
7234Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007238unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 return fixup(self, fixswapcase);
7241}
7242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007243PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244"S.translate(table) -> unicode\n\
7245\n\
7246Return a copy of the string S, where all characters have been mapped\n\
7247through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007248Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7249Unmapped characters are left untouched. Characters mapped to None\n\
7250are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
7252static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007253unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
Tim Petersced69f82003-09-16 20:30:58 +00007255 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007257 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 "ignore");
7259}
7260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262"S.upper() -> unicode\n\
7263\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007264Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
7266static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007267unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 return fixup(self, fixupper);
7270}
7271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007272PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273"S.zfill(width) -> unicode\n\
7274\n\
7275Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007276of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
7278static PyObject *
7279unicode_zfill(PyUnicodeObject *self, PyObject *args)
7280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007281 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 PyUnicodeObject *u;
7283
Martin v. Löwis18e16552006-02-15 17:27:45 +00007284 Py_ssize_t width;
7285 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 return NULL;
7287
7288 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007289 if (PyUnicode_CheckExact(self)) {
7290 Py_INCREF(self);
7291 return (PyObject*) self;
7292 }
7293 else
7294 return PyUnicode_FromUnicode(
7295 PyUnicode_AS_UNICODE(self),
7296 PyUnicode_GET_SIZE(self)
7297 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 }
7299
7300 fill = width - self->length;
7301
7302 u = pad(self, fill, 0, '0');
7303
Walter Dörwald068325e2002-04-15 13:36:47 +00007304 if (u == NULL)
7305 return NULL;
7306
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 if (u->str[fill] == '+' || u->str[fill] == '-') {
7308 /* move sign to beginning of string */
7309 u->str[0] = u->str[fill];
7310 u->str[fill] = '0';
7311 }
7312
7313 return (PyObject*) u;
7314}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
#if 0
/* Compiled-out helper: report unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007324PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007325"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007327Return True if S starts with the specified prefix, False otherwise.\n\
7328With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329With optional end, stop comparing S at that position.\n\
7330prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
7332static PyObject *
7333unicode_startswith(PyUnicodeObject *self,
7334 PyObject *args)
7335{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007338 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007339 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007343 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345 if (PyTuple_Check(subobj)) {
7346 Py_ssize_t i;
7347 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7348 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7349 PyTuple_GET_ITEM(subobj, i));
7350 if (substring == NULL)
7351 return NULL;
7352 result = tailmatch(self, substring, start, end, -1);
7353 Py_DECREF(substring);
7354 if (result) {
7355 Py_RETURN_TRUE;
7356 }
7357 }
7358 /* nothing matched */
7359 Py_RETURN_FALSE;
7360 }
7361 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363 return NULL;
7364 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367}
7368
7369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007370PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007371"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007373Return True if S ends with the specified suffix, False otherwise.\n\
7374With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375With optional end, stop comparing S at that position.\n\
7376suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
7378static PyObject *
7379unicode_endswith(PyUnicodeObject *self,
7380 PyObject *args)
7381{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007384 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007385 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7389 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391 if (PyTuple_Check(subobj)) {
7392 Py_ssize_t i;
7393 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7394 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7395 PyTuple_GET_ITEM(subobj, i));
7396 if (substring == NULL)
7397 return NULL;
7398 result = tailmatch(self, substring, start, end, +1);
7399 Py_DECREF(substring);
7400 if (result) {
7401 Py_RETURN_TRUE;
7402 }
7403 }
7404 Py_RETURN_FALSE;
7405 }
7406 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007410 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413}
7414
7415
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007416
7417static PyObject *
7418unicode_getnewargs(PyUnicodeObject *v)
7419{
7420 return Py_BuildValue("(u#)", v->str, v->length);
7421}
7422
7423
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424static PyMethodDef unicode_methods[] = {
7425
7426 /* Order is according to common usage: often used methods should
7427 appear first, since lookup is done sequentially. */
7428
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007429 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7430 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7431 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007432 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007433 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7434 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7435 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7436 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7437 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7438 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7439 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007440 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007441 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7442 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7443 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007444 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007445 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007446/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7447 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7448 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7449 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007450 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007451 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007452 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007453 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007454 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7455 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7456 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7457 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7458 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7459 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7460 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7461 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7462 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7463 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7464 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7465 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7466 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7467 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007468 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007469#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007470 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471#endif
7472
7473#if 0
7474 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007475 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476#endif
7477
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007478 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 {NULL, NULL}
7480};
7481
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007482static PyObject *
7483unicode_mod(PyObject *v, PyObject *w)
7484{
7485 if (!PyUnicode_Check(v)) {
7486 Py_INCREF(Py_NotImplemented);
7487 return Py_NotImplemented;
7488 }
7489 return PyUnicode_Format(v, w);
7490}
7491
7492static PyNumberMethods unicode_as_number = {
7493 0, /*nb_add*/
7494 0, /*nb_subtract*/
7495 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007496 unicode_mod, /*nb_remainder*/
7497};
7498
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007500 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007501 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7503 (ssizeargfunc) unicode_getitem, /* sq_item */
7504 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 0, /* sq_ass_item */
7506 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007507 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508};
7509
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007510static PyObject*
7511unicode_subscript(PyUnicodeObject* self, PyObject* item)
7512{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007513 if (PyIndex_Check(item)) {
7514 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007515 if (i == -1 && PyErr_Occurred())
7516 return NULL;
7517 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007518 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007519 return unicode_getitem(self, i);
7520 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007521 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007522 Py_UNICODE* source_buf;
7523 Py_UNICODE* result_buf;
7524 PyObject* result;
7525
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007526 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007527 &start, &stop, &step, &slicelength) < 0) {
7528 return NULL;
7529 }
7530
7531 if (slicelength <= 0) {
7532 return PyUnicode_FromUnicode(NULL, 0);
7533 } else {
7534 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007535 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7536 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007537
7538 if (result_buf == NULL)
7539 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007540
7541 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7542 result_buf[i] = source_buf[cur];
7543 }
Tim Petersced69f82003-09-16 20:30:58 +00007544
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007545 result = PyUnicode_FromUnicode(result_buf, slicelength);
7546 PyMem_FREE(result_buf);
7547 return result;
7548 }
7549 } else {
7550 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7551 return NULL;
7552 }
7553}
7554
7555static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007556 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007557 (binaryfunc)unicode_subscript, /* mp_subscript */
7558 (objobjargproc)0, /* mp_ass_subscript */
7559};
7560
Martin v. Löwis18e16552006-02-15 17:27:45 +00007561static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 const void **ptr)
7565{
7566 if (index != 0) {
7567 PyErr_SetString(PyExc_SystemError,
7568 "accessing non-existent unicode segment");
7569 return -1;
7570 }
7571 *ptr = (void *) self->str;
7572 return PyUnicode_GET_DATA_SIZE(self);
7573}
7574
Martin v. Löwis18e16552006-02-15 17:27:45 +00007575static Py_ssize_t
7576unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 const void **ptr)
7578{
7579 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007580 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 return -1;
7582}
7583
7584static int
7585unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007586 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
7588 if (lenp)
7589 *lenp = PyUnicode_GET_DATA_SIZE(self);
7590 return 1;
7591}
7592
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007593static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007595 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 const void **ptr)
7597{
7598 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007599
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 if (index != 0) {
7601 PyErr_SetString(PyExc_SystemError,
7602 "accessing non-existent unicode segment");
7603 return -1;
7604 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007605 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 if (str == NULL)
7607 return -1;
7608 *ptr = (void *) PyString_AS_STRING(str);
7609 return PyString_GET_SIZE(str);
7610}
7611
7612/* Helpers for PyUnicode_Format() */
7613
7614static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007617 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 if (argidx < arglen) {
7619 (*p_argidx)++;
7620 if (arglen < 0)
7621 return args;
7622 else
7623 return PyTuple_GetItem(args, argidx);
7624 }
7625 PyErr_SetString(PyExc_TypeError,
7626 "not enough arguments for format string");
7627 return NULL;
7628}
7629
/* Bit flags for format conversions; each occupies its own bit so they can
   be OR-ed together.  F_ALT selects the '#' form in formatfloat().
   NOTE(review): the remaining names suggest the '-', '+', ' ' and '0'
   printf-style flags -- confirm against PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
7635
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007637strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007639 register Py_ssize_t i;
7640 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 for (i = len - 1; i >= 0; i--)
7642 buffer[i] = (Py_UNICODE) charbuffer[i];
7643
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return len;
7645}
7646
Neal Norwitzfc76d632006-01-10 06:03:13 +00007647static int
7648doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7649{
Tim Peters15231542006-02-16 01:08:01 +00007650 Py_ssize_t result;
7651
Neal Norwitzfc76d632006-01-10 06:03:13 +00007652 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007653 result = strtounicode(buffer, (char *)buffer);
7654 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007655}
7656
7657static int
7658longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7659{
Tim Peters15231542006-02-16 01:08:01 +00007660 Py_ssize_t result;
7661
Neal Norwitzfc76d632006-01-10 06:03:13 +00007662 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007663 result = strtounicode(buffer, (char *)buffer);
7664 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007665}
7666
Guido van Rossum078151d2002-08-11 04:24:12 +00007667/* XXX To save some code duplication, formatfloat/long/int could have been
7668 shared with stringobject.c, converting from 8-bit to Unicode after the
7669 formatting is done. */
7670
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671static int
7672formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007673 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 int flags,
7675 int prec,
7676 int type,
7677 PyObject *v)
7678{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007679 /* fmt = '%#.' + `prec` + `type`
7680 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 char fmt[20];
7682 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007683
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 x = PyFloat_AsDouble(v);
7685 if (x == -1.0 && PyErr_Occurred())
7686 return -1;
7687 if (prec < 0)
7688 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7690 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007691 /* Worst case length calc to ensure no buffer overrun:
7692
7693 'g' formats:
7694 fmt = %#.<prec>g
7695 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7696 for any double rep.)
7697 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7698
7699 'f' formats:
7700 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7701 len = 1 + 50 + 1 + prec = 52 + prec
7702
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007703 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007704 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007705
7706 */
7707 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7708 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007709 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007710 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007711 return -1;
7712 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007713 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7714 (flags&F_ALT) ? "#" : "",
7715 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007716 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717}
7718
Tim Peters38fd5b62000-09-21 05:43:11 +00007719static PyObject*
7720formatlong(PyObject *val, int flags, int prec, int type)
7721{
7722 char *buf;
7723 int i, len;
7724 PyObject *str; /* temporary string object. */
7725 PyUnicodeObject *result;
7726
7727 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7728 if (!str)
7729 return NULL;
7730 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007731 if (!result) {
7732 Py_DECREF(str);
7733 return NULL;
7734 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007735 for (i = 0; i < len; i++)
7736 result->str[i] = buf[i];
7737 result->str[len] = 0;
7738 Py_DECREF(str);
7739 return (PyObject*)result;
7740}
7741
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742static int
7743formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007744 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 int flags,
7746 int prec,
7747 int type,
7748 PyObject *v)
7749{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007750 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007751 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7752 * + 1 + 1
7753 * = 24
7754 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007755 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007756 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 long x;
7758
7759 x = PyInt_AsLong(v);
7760 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007761 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007762 if (x < 0 && type == 'u') {
7763 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007764 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007765 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7766 sign = "-";
7767 else
7768 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007770 prec = 1;
7771
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007772 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7773 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007774 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007775 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007776 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007777 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007778 return -1;
7779 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007780
7781 if ((flags & F_ALT) &&
7782 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007783 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007784 * of issues that cause pain:
7785 * - when 0 is being converted, the C standard leaves off
7786 * the '0x' or '0X', which is inconsistent with other
7787 * %#x/%#X conversions and inconsistent with Python's
7788 * hex() function
7789 * - there are platforms that violate the standard and
7790 * convert 0 with the '0x' or '0X'
7791 * (Metrowerks, Compaq Tru64)
7792 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007793 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007794 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007795 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007796 * We can achieve the desired consistency by inserting our
7797 * own '0x' or '0X' prefix, and substituting %x/%X in place
7798 * of %#x/%#X.
7799 *
7800 * Note that this is the same approach as used in
7801 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007802 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007803 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7804 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007805 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007806 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007807 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7808 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007809 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007810 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007811 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007812 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007813 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007814 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815}
7816
7817static int
7818formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007819 size_t buflen,
7820 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007822 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007823 if (PyUnicode_Check(v)) {
7824 if (PyUnicode_GET_SIZE(v) != 1)
7825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007829 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007830 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007831 goto onError;
7832 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
7835 else {
7836 /* Integer input truncated to a character */
7837 long x;
7838 x = PyInt_AsLong(v);
7839 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007840 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007841#ifdef Py_UNICODE_WIDE
7842 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007843 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007844 "%c arg not in range(0x110000) "
7845 "(wide Python build)");
7846 return -1;
7847 }
7848#else
7849 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007850 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007851 "%c arg not in range(0x10000) "
7852 "(narrow Python build)");
7853 return -1;
7854 }
7855#endif
7856 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 }
7858 buf[1] = '\0';
7859 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007860
7861 onError:
7862 PyErr_SetString(PyExc_TypeError,
7863 "%c requires int or char");
7864 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865}
7866
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007867/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7868
7869 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7870 chars are formatted. XXX This is a magic number. Each formatting
7871 routine does bounds checking to ensure no overflow, but a better
7872 solution may be to malloc a buffer of appropriate size for each
7873 format. For now, the current solution is sufficient.
7874*/
7875#define FORMATBUFLEN (size_t)120
7876
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877PyObject *PyUnicode_Format(PyObject *format,
7878 PyObject *args)
7879{
7880 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007881 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 int args_owned = 0;
7883 PyUnicodeObject *result = NULL;
7884 PyObject *dict = NULL;
7885 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007886
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 if (format == NULL || args == NULL) {
7888 PyErr_BadInternalCall();
7889 return NULL;
7890 }
7891 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007892 if (uformat == NULL)
7893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 fmt = PyUnicode_AS_UNICODE(uformat);
7895 fmtcnt = PyUnicode_GET_SIZE(uformat);
7896
7897 reslen = rescnt = fmtcnt + 100;
7898 result = _PyUnicode_New(reslen);
7899 if (result == NULL)
7900 goto onError;
7901 res = PyUnicode_AS_UNICODE(result);
7902
7903 if (PyTuple_Check(args)) {
7904 arglen = PyTuple_Size(args);
7905 argidx = 0;
7906 }
7907 else {
7908 arglen = -1;
7909 argidx = -2;
7910 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007911 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7912 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 dict = args;
7914
7915 while (--fmtcnt >= 0) {
7916 if (*fmt != '%') {
7917 if (--rescnt < 0) {
7918 rescnt = fmtcnt + 100;
7919 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007920 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7923 --rescnt;
7924 }
7925 *res++ = *fmt++;
7926 }
7927 else {
7928 /* Got a format specifier */
7929 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007930 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 Py_UNICODE c = '\0';
7933 Py_UNICODE fill;
7934 PyObject *v = NULL;
7935 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007936 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007938 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007939 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940
7941 fmt++;
7942 if (*fmt == '(') {
7943 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007944 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 PyObject *key;
7946 int pcount = 1;
7947
7948 if (dict == NULL) {
7949 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007950 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 goto onError;
7952 }
7953 ++fmt;
7954 --fmtcnt;
7955 keystart = fmt;
7956 /* Skip over balanced parentheses */
7957 while (pcount > 0 && --fmtcnt >= 0) {
7958 if (*fmt == ')')
7959 --pcount;
7960 else if (*fmt == '(')
7961 ++pcount;
7962 fmt++;
7963 }
7964 keylen = fmt - keystart - 1;
7965 if (fmtcnt < 0 || pcount > 0) {
7966 PyErr_SetString(PyExc_ValueError,
7967 "incomplete format key");
7968 goto onError;
7969 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007970#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007971 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 then looked up since Python uses strings to hold
7973 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007974 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 key = PyUnicode_EncodeUTF8(keystart,
7976 keylen,
7977 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007978#else
7979 key = PyUnicode_FromUnicode(keystart, keylen);
7980#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (key == NULL)
7982 goto onError;
7983 if (args_owned) {
7984 Py_DECREF(args);
7985 args_owned = 0;
7986 }
7987 args = PyObject_GetItem(dict, key);
7988 Py_DECREF(key);
7989 if (args == NULL) {
7990 goto onError;
7991 }
7992 args_owned = 1;
7993 arglen = -1;
7994 argidx = -2;
7995 }
7996 while (--fmtcnt >= 0) {
7997 switch (c = *fmt++) {
7998 case '-': flags |= F_LJUST; continue;
7999 case '+': flags |= F_SIGN; continue;
8000 case ' ': flags |= F_BLANK; continue;
8001 case '#': flags |= F_ALT; continue;
8002 case '0': flags |= F_ZERO; continue;
8003 }
8004 break;
8005 }
8006 if (c == '*') {
8007 v = getnextarg(args, arglen, &argidx);
8008 if (v == NULL)
8009 goto onError;
8010 if (!PyInt_Check(v)) {
8011 PyErr_SetString(PyExc_TypeError,
8012 "* wants int");
8013 goto onError;
8014 }
8015 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008016 if (width == -1 && PyErr_Occurred())
8017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 if (width < 0) {
8019 flags |= F_LJUST;
8020 width = -width;
8021 }
8022 if (--fmtcnt >= 0)
8023 c = *fmt++;
8024 }
8025 else if (c >= '0' && c <= '9') {
8026 width = c - '0';
8027 while (--fmtcnt >= 0) {
8028 c = *fmt++;
8029 if (c < '0' || c > '9')
8030 break;
8031 if ((width*10) / 10 != width) {
8032 PyErr_SetString(PyExc_ValueError,
8033 "width too big");
8034 goto onError;
8035 }
8036 width = width*10 + (c - '0');
8037 }
8038 }
8039 if (c == '.') {
8040 prec = 0;
8041 if (--fmtcnt >= 0)
8042 c = *fmt++;
8043 if (c == '*') {
8044 v = getnextarg(args, arglen, &argidx);
8045 if (v == NULL)
8046 goto onError;
8047 if (!PyInt_Check(v)) {
8048 PyErr_SetString(PyExc_TypeError,
8049 "* wants int");
8050 goto onError;
8051 }
8052 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008053 if (prec == -1 && PyErr_Occurred())
8054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 if (prec < 0)
8056 prec = 0;
8057 if (--fmtcnt >= 0)
8058 c = *fmt++;
8059 }
8060 else if (c >= '0' && c <= '9') {
8061 prec = c - '0';
8062 while (--fmtcnt >= 0) {
8063 c = Py_CHARMASK(*fmt++);
8064 if (c < '0' || c > '9')
8065 break;
8066 if ((prec*10) / 10 != prec) {
8067 PyErr_SetString(PyExc_ValueError,
8068 "prec too big");
8069 goto onError;
8070 }
8071 prec = prec*10 + (c - '0');
8072 }
8073 }
8074 } /* prec */
8075 if (fmtcnt >= 0) {
8076 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 if (--fmtcnt >= 0)
8078 c = *fmt++;
8079 }
8080 }
8081 if (fmtcnt < 0) {
8082 PyErr_SetString(PyExc_ValueError,
8083 "incomplete format");
8084 goto onError;
8085 }
8086 if (c != '%') {
8087 v = getnextarg(args, arglen, &argidx);
8088 if (v == NULL)
8089 goto onError;
8090 }
8091 sign = 0;
8092 fill = ' ';
8093 switch (c) {
8094
8095 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008096 pbuf = formatbuf;
8097 /* presume that buffer length is at least 1 */
8098 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 len = 1;
8100 break;
8101
8102 case 's':
8103 case 'r':
8104 if (PyUnicode_Check(v) && c == 's') {
8105 temp = v;
8106 Py_INCREF(temp);
8107 }
8108 else {
8109 PyObject *unicode;
8110 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008111 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 else
8113 temp = PyObject_Repr(v);
8114 if (temp == NULL)
8115 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008116 if (PyUnicode_Check(temp))
8117 /* nothing to do */;
8118 else if (PyString_Check(temp)) {
8119 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008120 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008122 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008124 Py_DECREF(temp);
8125 temp = unicode;
8126 if (temp == NULL)
8127 goto onError;
8128 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008129 else {
8130 Py_DECREF(temp);
8131 PyErr_SetString(PyExc_TypeError,
8132 "%s argument has non-string str()");
8133 goto onError;
8134 }
8135 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008136 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 len = PyUnicode_GET_SIZE(temp);
8138 if (prec >= 0 && len > prec)
8139 len = prec;
8140 break;
8141
8142 case 'i':
8143 case 'd':
8144 case 'u':
8145 case 'o':
8146 case 'x':
8147 case 'X':
8148 if (c == 'i')
8149 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008150 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008151 temp = formatlong(v, flags, prec, c);
8152 if (!temp)
8153 goto onError;
8154 pbuf = PyUnicode_AS_UNICODE(temp);
8155 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008156 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008158 else {
8159 pbuf = formatbuf;
8160 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8161 flags, prec, c, v);
8162 if (len < 0)
8163 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008164 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008165 }
8166 if (flags & F_ZERO)
8167 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 break;
8169
8170 case 'e':
8171 case 'E':
8172 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008173 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 case 'g':
8175 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008176 if (c == 'F')
8177 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008178 pbuf = formatbuf;
8179 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8180 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 if (len < 0)
8182 goto onError;
8183 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008184 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 fill = '0';
8186 break;
8187
8188 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008189 pbuf = formatbuf;
8190 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 if (len < 0)
8192 goto onError;
8193 break;
8194
8195 default:
8196 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008197 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008198 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008199 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008200 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008201 (Py_ssize_t)(fmt - 1 -
8202 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 goto onError;
8204 }
8205 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008206 if (*pbuf == '-' || *pbuf == '+') {
8207 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 len--;
8209 }
8210 else if (flags & F_SIGN)
8211 sign = '+';
8212 else if (flags & F_BLANK)
8213 sign = ' ';
8214 else
8215 sign = 0;
8216 }
8217 if (width < len)
8218 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008219 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 reslen -= rescnt;
8221 rescnt = width + fmtcnt + 100;
8222 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008223 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008224 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008225 PyErr_NoMemory();
8226 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008227 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008228 if (_PyUnicode_Resize(&result, reslen) < 0) {
8229 Py_XDECREF(temp);
8230 goto onError;
8231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 res = PyUnicode_AS_UNICODE(result)
8233 + reslen - rescnt;
8234 }
8235 if (sign) {
8236 if (fill != ' ')
8237 *res++ = sign;
8238 rescnt--;
8239 if (width > len)
8240 width--;
8241 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008242 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8243 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008244 assert(pbuf[1] == c);
8245 if (fill != ' ') {
8246 *res++ = *pbuf++;
8247 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008248 }
Tim Petersfff53252001-04-12 18:38:48 +00008249 rescnt -= 2;
8250 width -= 2;
8251 if (width < 0)
8252 width = 0;
8253 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 if (width > len && !(flags & F_LJUST)) {
8256 do {
8257 --rescnt;
8258 *res++ = fill;
8259 } while (--width > len);
8260 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008261 if (fill == ' ') {
8262 if (sign)
8263 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008264 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008265 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008266 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008267 *res++ = *pbuf++;
8268 *res++ = *pbuf++;
8269 }
8270 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008271 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 res += len;
8273 rescnt -= len;
8274 while (--width >= len) {
8275 --rescnt;
8276 *res++ = ' ';
8277 }
8278 if (dict && (argidx < arglen) && c != '%') {
8279 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008280 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008281 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 goto onError;
8283 }
8284 Py_XDECREF(temp);
8285 } /* '%' */
8286 } /* until end */
8287 if (argidx < arglen && !dict) {
8288 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008289 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 goto onError;
8291 }
8292
Thomas Woutersa96affe2006-03-12 00:29:36 +00008293 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 if (args_owned) {
8296 Py_DECREF(args);
8297 }
8298 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 return (PyObject *)result;
8300
8301 onError:
8302 Py_XDECREF(result);
8303 Py_DECREF(uformat);
8304 if (args_owned) {
8305 Py_DECREF(args);
8306 }
8307 return NULL;
8308}
8309
/* Buffer-interface slot table for the Unicode type; referenced from the
   tp_as_buffer slot of PyUnicode_Type.  The four handlers are defined
   elsewhere in this file. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8316
Jeremy Hylton938ace62002-07-17 16:30:39 +00008317static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008318unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8319
Tim Peters6d6c1a32001-08-02 04:15:00 +00008320static PyObject *
8321unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8322{
8323 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008324 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008325 char *encoding = NULL;
8326 char *errors = NULL;
8327
Guido van Rossume023fe02001-08-30 03:12:59 +00008328 if (type != &PyUnicode_Type)
8329 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008330 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8331 kwlist, &x, &encoding, &errors))
8332 return NULL;
8333 if (x == NULL)
8334 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008335 if (encoding == NULL && errors == NULL)
8336 return PyObject_Unicode(x);
8337 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008338 return PyUnicode_FromEncodedObject(x, encoding, errors);
8339}
8340
Guido van Rossume023fe02001-08-30 03:12:59 +00008341static PyObject *
8342unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8343{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008344 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008345 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008346
8347 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8348 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8349 if (tmp == NULL)
8350 return NULL;
8351 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008352 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008353 if (pnew == NULL) {
8354 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008355 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008356 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008357 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8358 if (pnew->str == NULL) {
8359 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008360 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008361 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008362 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008363 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008364 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8365 pnew->length = n;
8366 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008367 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008368 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008369}
8370
/* Docstring of the Unicode type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008377
/* Forward declaration: referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for Unicode strings.  Slots left as 0 are not set here. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare (unset; see tp_richcompare) */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8424
8425/* Initialize the Unicode implementation */
8426
Thomas Wouters78890102000-07-22 19:25:51 +00008427void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008429 int i;
8430
Thomas Wouters477c8d52006-05-27 19:21:47 +00008431 /* XXX - move this array to unicodectype.c ? */
8432 Py_UNICODE linebreak[] = {
8433 0x000A, /* LINE FEED */
8434 0x000D, /* CARRIAGE RETURN */
8435 0x001C, /* FILE SEPARATOR */
8436 0x001D, /* GROUP SEPARATOR */
8437 0x001E, /* RECORD SEPARATOR */
8438 0x0085, /* NEXT LINE */
8439 0x2028, /* LINE SEPARATOR */
8440 0x2029, /* PARAGRAPH SEPARATOR */
8441 };
8442
Fred Drakee4315f52000-05-09 19:53:39 +00008443 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008444 unicode_freelist = NULL;
8445 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008447 if (!unicode_empty)
8448 return;
8449
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008450 for (i = 0; i < 256; i++)
8451 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008452 if (PyType_Ready(&PyUnicode_Type) < 0)
8453 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008454
8455 /* initialize the linebreak bloom filter */
8456 bloom_linebreak = make_bloom_mask(
8457 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8458 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008459
8460 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461}
8462
8463/* Finalize the Unicode implementation */
8464
8465void
Thomas Wouters78890102000-07-22 19:25:51 +00008466_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008468 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008469 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008471 Py_XDECREF(unicode_empty);
8472 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008473
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008474 for (i = 0; i < 256; i++) {
8475 if (unicode_latin1[i]) {
8476 Py_DECREF(unicode_latin1[i]);
8477 unicode_latin1[i] = NULL;
8478 }
8479 }
8480
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008481 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 PyUnicodeObject *v = u;
8483 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008484 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008485 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008486 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008487 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008489 unicode_freelist = NULL;
8490 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008492
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008493
8494
8495/********************* Unicode Iterator **************************/
8496
8497typedef struct {
8498 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008499 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008500 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8501} unicodeiterobject;
8502
8503static void
8504unicodeiter_dealloc(unicodeiterobject *it)
8505{
8506 _PyObject_GC_UNTRACK(it);
8507 Py_XDECREF(it->it_seq);
8508 PyObject_GC_Del(it);
8509}
8510
8511static int
8512unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8513{
8514 Py_VISIT(it->it_seq);
8515 return 0;
8516}
8517
8518static PyObject *
8519unicodeiter_next(unicodeiterobject *it)
8520{
8521 PyUnicodeObject *seq;
8522 PyObject *item;
8523
8524 assert(it != NULL);
8525 seq = it->it_seq;
8526 if (seq == NULL)
8527 return NULL;
8528 assert(PyUnicode_Check(seq));
8529
8530 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008531 item = PyUnicode_FromUnicode(
8532 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008533 if (item != NULL)
8534 ++it->it_index;
8535 return item;
8536 }
8537
8538 Py_DECREF(seq);
8539 it->it_seq = NULL;
8540 return NULL;
8541}
8542
8543static PyObject *
8544unicodeiter_len(unicodeiterobject *it)
8545{
8546 Py_ssize_t len = 0;
8547 if (it->it_seq)
8548 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8549 return PyInt_FromSsize_t(len);
8550}
8551
8552PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8553
8554static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008555 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8556 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008557 {NULL, NULL} /* sentinel */
8558};
8559
8560PyTypeObject PyUnicodeIter_Type = {
8561 PyObject_HEAD_INIT(&PyType_Type)
8562 0, /* ob_size */
8563 "unicodeiterator", /* tp_name */
8564 sizeof(unicodeiterobject), /* tp_basicsize */
8565 0, /* tp_itemsize */
8566 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008567 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008568 0, /* tp_print */
8569 0, /* tp_getattr */
8570 0, /* tp_setattr */
8571 0, /* tp_compare */
8572 0, /* tp_repr */
8573 0, /* tp_as_number */
8574 0, /* tp_as_sequence */
8575 0, /* tp_as_mapping */
8576 0, /* tp_hash */
8577 0, /* tp_call */
8578 0, /* tp_str */
8579 PyObject_GenericGetAttr, /* tp_getattro */
8580 0, /* tp_setattro */
8581 0, /* tp_as_buffer */
8582 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8583 0, /* tp_doc */
8584 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8585 0, /* tp_clear */
8586 0, /* tp_richcompare */
8587 0, /* tp_weaklistoffset */
8588 PyObject_SelfIter, /* tp_iter */
8589 (iternextfunc)unicodeiter_next, /* tp_iternext */
8590 unicodeiter_methods, /* tp_methods */
8591 0,
8592};
8593
8594static PyObject *
8595unicode_iter(PyObject *seq)
8596{
8597 unicodeiterobject *it;
8598
8599 if (!PyUnicode_Check(seq)) {
8600 PyErr_BadInternalCall();
8601 return NULL;
8602 }
8603 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8604 if (it == NULL)
8605 return NULL;
8606 it->it_index = 0;
8607 Py_INCREF(seq);
8608 it->it_seq = (PyUnicodeObject *)seq;
8609 _PyObject_GC_TRACK(it);
8610 return (PyObject *)it;
8611}
8612
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008613#ifdef __cplusplus
8614}
8615#endif
8616
8617
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008618/*
8619Local variables:
8620c-basic-offset: 4
8621indent-tabs-mode: nil
8622End:
8623*/