blob: e77b65dd2c495b5a564b873020f2ac3344961353 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their character buffer allocated
   as long as their length is below this limit; longer buffers are
   released.  This reduces malloc() overhead for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things simple
   a single bitmask is used; the low 5 bits of each character select the
   bit index (0..31). */

/* bloom_linebreak is not filled in here; according to the original note
   it is set up by the Unicode init code further down in this file. */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.
   The shifted constant is 1UL rather than 1: shifting a plain int left
   by 31 overflows a 32-bit signed int, which is undefined behaviour.
   The mask argument is parenthesized so that compound expressions can
   be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter with the bloom mask, then the exact linebreak test. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Membership test with a bloom-mask pre-filter.  The whole expansion is
   parenthesized so the macro behaves as a single operand when combined
   with !, || or other operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* File-internal convenience wrapper around PyUnicode_Resize() that
   takes care of the PyObject ** cast.  For use in unicodeobject.c only! */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldd2034312007-05-18 16:29:38 +0000396PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000397{
398 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000399 /* If the Unicode data is known at construction time, we can apply
400 some optimizations which share commonly used objects. */
401 if (u != NULL) {
402
403 /* Optimization for empty strings */
404 if (size == 0 && unicode_empty != NULL) {
405 Py_INCREF(unicode_empty);
406 return (PyObject *)unicode_empty;
407 }
408
Walter Dörwald071b9da2007-05-05 14:21:20 +0000409 /* Single characters are shared when using this constructor */
410 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000411 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000412 if (!unicode) {
413 unicode = _PyUnicode_New(1);
414 if (!unicode)
415 return NULL;
416 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 }
419 Py_INCREF(unicode);
420 return (PyObject *)unicode;
421 }
422 }
423
Walter Dörwald55507312007-05-18 13:12:10 +0000424 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000425 if (!unicode)
426 return NULL;
427
428 /* Copy the Unicode data into the new object */
429 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000430 Py_UNICODE *p = unicode->str;
431 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 ;
433 }
434
435 return (PyObject *)unicode;
436}
437
Walter Dörwaldd2034312007-05-18 16:29:38 +0000438PyObject *PyUnicode_FromString(const char *u)
439{
440 size_t size = strlen(u);
441 if (size > PY_SSIZE_T_MAX) {
442 PyErr_SetString(PyExc_OverflowError, "input too long");
443 return NULL;
444 }
445
446 return PyUnicode_FromStringAndSize(u, size);
447}
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_WCHAR_H
450
451PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
456 if (w == NULL) {
457 PyErr_BadInternalCall();
458 return NULL;
459 }
460
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the wchar_t data into the new object */
466#ifdef HAVE_USABLE_WCHAR_T
467 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000468#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 {
470 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000471 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000473 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 *u++ = *w++;
475 }
476#endif
477
478 return (PyObject *)unicode;
479}
480
Walter Dörwaldd2034312007-05-18 16:29:38 +0000481#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
482
483PyObject *
484PyUnicode_FromFormatV(const char *format, va_list vargs)
485{
486 va_list count;
487 Py_ssize_t n = 0;
488 const char* f;
489 Py_UNICODE *s;
490 PyObject *string;
491 /* used by sprintf */
492 char buffer[21];
493 const char *copy;
494
495#ifdef VA_LIST_IS_ARRAY
496 Py_MEMCPY(count, vargs, sizeof(va_list));
497#else
498#ifdef __va_copy
499 __va_copy(count, vargs);
500#else
501 count = vargs;
502#endif
503#endif
504 /* step 1: figure out how large a buffer we need */
505 for (f = format; *f; f++) {
506 if (*f == '%') {
507 const char* p = f;
508 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
509 ;
510
511 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
512 * they don't affect the amount of space we reserve.
513 */
514 if ((*f == 'l' || *f == 'z') &&
515 (f[1] == 'd' || f[1] == 'u'))
516 ++f;
517
518 switch (*f) {
519 case 'c':
520 (void)va_arg(count, int);
521 /* fall through... */
522 case '%':
523 n++;
524 break;
525 case 'd': case 'u': case 'i': case 'x':
526 (void) va_arg(count, int);
527 /* 20 bytes is enough to hold a 64-bit
528 integer. Decimal takes the most space.
529 This isn't enough for octal. */
530 n += 20;
531 break;
532 case 's':
533 n += strlen(va_arg(count, char*));
534 break;
535 case 'U':
536 {
537 PyObject *obj = va_arg(count, PyObject *);
538 assert(obj && PyUnicode_Check(obj));
539 n += PyUnicode_GET_SIZE(obj);
540 break;
541 }
542 case 'p':
543 (void) va_arg(count, int);
544 /* maximum 64-bit pointer representation:
545 * 0xffffffffffffffff
546 * so 19 characters is enough.
547 * XXX I count 18 -- what's the extra for?
548 */
549 n += 19;
550 break;
551 default:
552 /* if we stumble upon an unknown
553 formatting code, copy the rest of
554 the format string to the output
555 string. (we cannot just skip the
556 code, since there's no way to know
557 what's in the argument list) */
558 n += strlen(p);
559 goto expand;
560 }
561 } else
562 n++;
563 }
564 expand:
565 /* step 2: fill the buffer */
566 /* Since we've analyzed how much space we need for the worst case,
567 we don't have to resize the string. */
568 string = PyUnicode_FromUnicode(NULL, n);
569 if (!string)
570 return NULL;
571
572 s = PyUnicode_AS_UNICODE(string);
573
574 for (f = format; *f; f++) {
575 if (*f == '%') {
576 const char* p = f++;
577 int longflag = 0;
578 int size_tflag = 0;
579 /* parse the width.precision part (we're only
580 interested in the precision value, if any) */
581 n = 0;
582 while (isdigit(Py_CHARMASK(*f)))
583 n = (n*10) + *f++ - '0';
584 if (*f == '.') {
585 f++;
586 n = 0;
587 while (isdigit(Py_CHARMASK(*f)))
588 n = (n*10) + *f++ - '0';
589 }
590 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
591 f++;
592 /* handle the long flag, but only for %ld and %lu.
593 others can be added when necessary. */
594 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
595 longflag = 1;
596 ++f;
597 }
598 /* handle the size_t flag. */
599 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
600 size_tflag = 1;
601 ++f;
602 }
603
604 switch (*f) {
605 case 'c':
606 *s++ = va_arg(vargs, int);
607 break;
608 case 'd':
609 if (longflag)
610 sprintf(buffer, "%ld", va_arg(vargs, long));
611 else if (size_tflag)
612 sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
613 va_arg(vargs, Py_ssize_t));
614 else
615 sprintf(buffer, "%d", va_arg(vargs, int));
616 appendstring(buffer);
617 break;
618 case 'u':
619 if (longflag)
620 sprintf(buffer, "%lu",
621 va_arg(vargs, unsigned long));
622 else if (size_tflag)
623 sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
624 va_arg(vargs, size_t));
625 else
626 sprintf(buffer, "%u",
627 va_arg(vargs, unsigned int));
628 appendstring(buffer);
629 break;
630 case 'i':
631 sprintf(buffer, "%i", va_arg(vargs, int));
632 appendstring(buffer);
633 break;
634 case 'x':
635 sprintf(buffer, "%x", va_arg(vargs, int));
636 appendstring(buffer);
637 break;
638 case 's':
639 p = va_arg(vargs, char*);
640 appendstring(p);
641 break;
642 case 'U':
643 {
644 PyObject *obj = va_arg(vargs, PyObject *);
645 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
646 Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
647 Py_ssize_t upos;
648 for (upos = 0; upos<usize;)
649 *s++ = ucopy[upos++];
650 break;
651 }
652 case 'p':
653 sprintf(buffer, "%p", va_arg(vargs, void*));
654 /* %p is ill-defined: ensure leading 0x. */
655 if (buffer[1] == 'X')
656 buffer[1] = 'x';
657 else if (buffer[1] != 'x') {
658 memmove(buffer+2, buffer, strlen(buffer)+1);
659 buffer[0] = '0';
660 buffer[1] = 'x';
661 }
662 appendstring(buffer);
663 break;
664 case '%':
665 *s++ = '%';
666 break;
667 default:
668 appendstring(p);
669 goto end;
670 }
671 } else
672 *s++ = *f;
673 }
674
675 end:
676 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
677 return string;
678}
679
680#undef appendstring
681
682PyObject *
683PyUnicode_FromFormat(const char *format, ...)
684{
685 PyObject* ret;
686 va_list vargs;
687
688#ifdef HAVE_STDARG_PROTOTYPES
689 va_start(vargs, format);
690#else
691 va_start(vargs);
692#endif
693 ret = PyUnicode_FromFormatV(format, vargs);
694 va_end(vargs);
695 return ret;
696}
697
Martin v. Löwis18e16552006-02-15 17:27:45 +0000698Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
699 wchar_t *w,
700 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701{
702 if (unicode == NULL) {
703 PyErr_BadInternalCall();
704 return -1;
705 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000706
707 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000709 size = PyUnicode_GET_SIZE(unicode) + 1;
710
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711#ifdef HAVE_USABLE_WCHAR_T
712 memcpy(w, unicode->str, size * sizeof(wchar_t));
713#else
714 {
715 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000716 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000718 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 *w++ = *u++;
720 }
721#endif
722
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000723 if (size > PyUnicode_GET_SIZE(unicode))
724 return PyUnicode_GET_SIZE(unicode);
725 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 return size;
727}
728
729#endif
730
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000731PyObject *PyUnicode_FromOrdinal(int ordinal)
732{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000733 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000734
735#ifdef Py_UNICODE_WIDE
736 if (ordinal < 0 || ordinal > 0x10ffff) {
737 PyErr_SetString(PyExc_ValueError,
738 "unichr() arg not in range(0x110000) "
739 "(wide Python build)");
740 return NULL;
741 }
742#else
743 if (ordinal < 0 || ordinal > 0xffff) {
744 PyErr_SetString(PyExc_ValueError,
745 "unichr() arg not in range(0x10000) "
746 "(narrow Python build)");
747 return NULL;
748 }
749#endif
750
Hye-Shik Chang40574832004-04-06 07:24:51 +0000751 s[0] = (Py_UNICODE)ordinal;
752 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000753}
754
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755PyObject *PyUnicode_FromObject(register PyObject *obj)
756{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000757 /* XXX Perhaps we should make this API an alias of
758 PyObject_Unicode() instead ?! */
759 if (PyUnicode_CheckExact(obj)) {
760 Py_INCREF(obj);
761 return obj;
762 }
763 if (PyUnicode_Check(obj)) {
764 /* For a Unicode subtype that's not a Unicode object,
765 return a true Unicode object with the same data. */
766 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
767 PyUnicode_GET_SIZE(obj));
768 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000769 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
770}
771
772PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
773 const char *encoding,
774 const char *errors)
775{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000776 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000777 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000778 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000779
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 if (obj == NULL) {
781 PyErr_BadInternalCall();
782 return NULL;
783 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000784
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000785#if 0
786 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000787 that no encodings is given and then redirect to
788 PyObject_Unicode() which then applies the additional logic for
789 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000790
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000791 NOTE: This API should really only be used for object which
792 represent *encoded* Unicode !
793
794 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000795 if (PyUnicode_Check(obj)) {
796 if (encoding) {
797 PyErr_SetString(PyExc_TypeError,
798 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000799 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000800 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000801 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000802 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000803#else
804 if (PyUnicode_Check(obj)) {
805 PyErr_SetString(PyExc_TypeError,
806 "decoding Unicode is not supported");
807 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000808 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000809#endif
810
811 /* Coerce object */
812 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000813 s = PyString_AS_STRING(obj);
814 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000815 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000816 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
817 /* Overwrite the error message with something more useful in
818 case of a TypeError. */
819 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000820 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000821 "coercing to Unicode: need string or buffer, "
822 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000823 obj->ob_type->tp_name);
824 goto onError;
825 }
Tim Petersced69f82003-09-16 20:30:58 +0000826
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000827 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828 if (len == 0) {
829 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000830 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 }
Tim Petersced69f82003-09-16 20:30:58 +0000832 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000833 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000834
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000835 return v;
836
837 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839}
840
841PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000842 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843 const char *encoding,
844 const char *errors)
845{
846 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847
848 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000849 encoding = PyUnicode_GetDefaultEncoding();
850
851 /* Shortcuts for common default encodings */
852 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000853 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000854 else if (strcmp(encoding, "latin-1") == 0)
855 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000856#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
857 else if (strcmp(encoding, "mbcs") == 0)
858 return PyUnicode_DecodeMBCS(s, size, errors);
859#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000860 else if (strcmp(encoding, "ascii") == 0)
861 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
863 /* Decode via the codec registry */
864 buffer = PyBuffer_FromMemory((void *)s, size);
865 if (buffer == NULL)
866 goto onError;
867 unicode = PyCodec_Decode(buffer, encoding, errors);
868 if (unicode == NULL)
869 goto onError;
870 if (!PyUnicode_Check(unicode)) {
871 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000872 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 unicode->ob_type->tp_name);
874 Py_DECREF(unicode);
875 goto onError;
876 }
877 Py_DECREF(buffer);
878 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000879
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 onError:
881 Py_XDECREF(buffer);
882 return NULL;
883}
884
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000885PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
886 const char *encoding,
887 const char *errors)
888{
889 PyObject *v;
890
891 if (!PyUnicode_Check(unicode)) {
892 PyErr_BadArgument();
893 goto onError;
894 }
895
896 if (encoding == NULL)
897 encoding = PyUnicode_GetDefaultEncoding();
898
899 /* Decode via the codec registry */
900 v = PyCodec_Decode(unicode, encoding, errors);
901 if (v == NULL)
902 goto onError;
903 return v;
904
905 onError:
906 return NULL;
907}
908
Guido van Rossumd57fd912000-03-10 22:53:23 +0000909PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000910 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 const char *encoding,
912 const char *errors)
913{
914 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916 unicode = PyUnicode_FromUnicode(s, size);
917 if (unicode == NULL)
918 return NULL;
919 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
920 Py_DECREF(unicode);
921 return v;
922}
923
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000924PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
925 const char *encoding,
926 const char *errors)
927{
928 PyObject *v;
929
930 if (!PyUnicode_Check(unicode)) {
931 PyErr_BadArgument();
932 goto onError;
933 }
934
935 if (encoding == NULL)
936 encoding = PyUnicode_GetDefaultEncoding();
937
938 /* Encode via the codec registry */
939 v = PyCodec_Encode(unicode, encoding, errors);
940 if (v == NULL)
941 goto onError;
942 return v;
943
944 onError:
945 return NULL;
946}
947
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
949 const char *encoding,
950 const char *errors)
951{
952 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000953
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954 if (!PyUnicode_Check(unicode)) {
955 PyErr_BadArgument();
956 goto onError;
957 }
Fred Drakee4315f52000-05-09 19:53:39 +0000958
Tim Petersced69f82003-09-16 20:30:58 +0000959 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000960 encoding = PyUnicode_GetDefaultEncoding();
961
962 /* Shortcuts for common default encodings */
963 if (errors == NULL) {
964 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000965 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000966 else if (strcmp(encoding, "latin-1") == 0)
967 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000968#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
969 else if (strcmp(encoding, "mbcs") == 0)
970 return PyUnicode_AsMBCSString(unicode);
971#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000972 else if (strcmp(encoding, "ascii") == 0)
973 return PyUnicode_AsASCIIString(unicode);
974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975
976 /* Encode via the codec registry */
977 v = PyCodec_Encode(unicode, encoding, errors);
978 if (v == NULL)
979 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000980 if (!PyBytes_Check(v)) {
981 if (PyString_Check(v)) {
982 /* Old codec, turn it into bytes */
983 PyObject *b = PyBytes_FromObject(v);
984 Py_DECREF(v);
985 return b;
986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000988 "encoder did not return a bytes object "
989 "(type=%.400s, encoding=%.20s, errors=%.20s)",
990 v->ob_type->tp_name,
991 encoding ? encoding : "NULL",
992 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993 Py_DECREF(v);
994 goto onError;
995 }
996 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000997
Guido van Rossumd57fd912000-03-10 22:53:23 +0000998 onError:
999 return NULL;
1000}
1001
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001002PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1003 const char *errors)
1004{
1005 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001006 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001007 if (v)
1008 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001009 if (errors != NULL)
1010 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1011 if (errors == NULL) {
1012 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1013 PyUnicode_GET_SIZE(unicode),
1014 NULL);
1015 }
1016 else {
1017 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1018 }
1019 if (!b)
1020 return NULL;
1021 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1022 PyBytes_Size(b));
1023 Py_DECREF(b);
1024 if (!errors) {
1025 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001026 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001027 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001028 return v;
1029}
1030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1032{
1033 if (!PyUnicode_Check(unicode)) {
1034 PyErr_BadArgument();
1035 goto onError;
1036 }
1037 return PyUnicode_AS_UNICODE(unicode);
1038
1039 onError:
1040 return NULL;
1041}
1042
Martin v. Löwis18e16552006-02-15 17:27:45 +00001043Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044{
1045 if (!PyUnicode_Check(unicode)) {
1046 PyErr_BadArgument();
1047 goto onError;
1048 }
1049 return PyUnicode_GET_SIZE(unicode);
1050
1051 onError:
1052 return -1;
1053}
1054
Thomas Wouters78890102000-07-22 19:25:51 +00001055const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001056{
1057 return unicode_default_encoding;
1058}
1059
1060int PyUnicode_SetDefaultEncoding(const char *encoding)
1061{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001062 if (strcmp(encoding, unicode_default_encoding) != 0) {
1063 PyErr_Format(PyExc_ValueError,
1064 "Can only set default encoding to %s",
1065 unicode_default_encoding);
1066 return -1;
1067 }
Fred Drakee4315f52000-05-09 19:53:39 +00001068 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001069}
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071/* error handling callback helper:
1072 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001073 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001074 and adjust various state variables.
1075 return 0 on success, -1 on error
1076*/
1077
1078static
1079int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1080 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1082 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001083{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001085
1086 PyObject *restuple = NULL;
1087 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1089 Py_ssize_t requiredsize;
1090 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001091 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 int res = -1;
1094
1095 if (*errorHandler == NULL) {
1096 *errorHandler = PyCodec_LookupError(errors);
1097 if (*errorHandler == NULL)
1098 goto onError;
1099 }
1100
1101 if (*exceptionObject == NULL) {
1102 *exceptionObject = PyUnicodeDecodeError_Create(
1103 encoding, input, insize, *startinpos, *endinpos, reason);
1104 if (*exceptionObject == NULL)
1105 goto onError;
1106 }
1107 else {
1108 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1109 goto onError;
1110 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1111 goto onError;
1112 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1113 goto onError;
1114 }
1115
1116 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1117 if (restuple == NULL)
1118 goto onError;
1119 if (!PyTuple_Check(restuple)) {
1120 PyErr_Format(PyExc_TypeError, &argparse[4]);
1121 goto onError;
1122 }
1123 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1124 goto onError;
1125 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001126 newpos = insize+newpos;
1127 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001128 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001129 goto onError;
1130 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001131
1132 /* need more space? (at least enough for what we
1133 have+the replacement+the rest of the string (starting
1134 at the new input position), so we won't have to check space
1135 when there are no errors in the rest of the string) */
1136 repptr = PyUnicode_AS_UNICODE(repunicode);
1137 repsize = PyUnicode_GET_SIZE(repunicode);
1138 requiredsize = *outpos + repsize + insize-newpos;
1139 if (requiredsize > outsize) {
1140 if (requiredsize<2*outsize)
1141 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001142 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001143 goto onError;
1144 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1145 }
1146 *endinpos = newpos;
1147 *inptr = input + newpos;
1148 Py_UNICODE_COPY(*outptr, repptr, repsize);
1149 *outptr += repsize;
1150 *outpos += repsize;
1151 /* we made it! */
1152 res = 0;
1153
1154 onError:
1155 Py_XDECREF(restuple);
1156 return res;
1157}
1158
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159/* --- UTF-7 Codec -------------------------------------------------------- */
1160
1161/* see RFC2152 for details */
1162
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   whether a character is special, i.e. cannot be directly encoded.

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1181
/* SPECIAL(c, encodeO, encodeWS): true if code unit c has to be encoded
   in a shift sequence.  Anything outside 1..127 is special; inside that
   range utf7_special decides, with the optional classes (whitespace,
   Set O) counted as special only when the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a character of the base64 alphabet.
   isalnum() is only defined for arguments representable as unsigned
   char (or EOF), while callers pass wider code units, so anything
   outside 0..127 is rejected before isalnum() is consulted. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* UB64(c): six-bit value of base64 character c (inverse of B64);
   only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the topmost six as a base64 character through out and decrease
   bits accordingly.  out and bits are modified. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the topmost sixteen as one code unit.

   - If surrogate is set, an error was already reported for the
     previous unit; this unit is dropped without further checks and
     the flag is cleared.
   - A unit in 0xDC00..0xDFFF cannot be represented here: surrogate is
     set, errmsg is assigned and control jumps to utf7Error.
   - Any other unit is stored through out.

   The macro relies on `errmsg` and the label `utf7Error` existing in
   the function that expands it. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001224
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001225PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001226 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001227 const char *errors)
1228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001230 Py_ssize_t startinpos;
1231 Py_ssize_t endinpos;
1232 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001233 const char *e;
1234 PyUnicodeObject *unicode;
1235 Py_UNICODE *p;
1236 const char *errmsg = "";
1237 int inShift = 0;
1238 unsigned int bitsleft = 0;
1239 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001240 int surrogate = 0;
1241 PyObject *errorHandler = NULL;
1242 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001243
1244 unicode = _PyUnicode_New(size);
1245 if (!unicode)
1246 return NULL;
1247 if (size == 0)
1248 return (PyObject *)unicode;
1249
1250 p = unicode->str;
1251 e = s + size;
1252
1253 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254 Py_UNICODE ch;
1255 restart:
1256 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001257
1258 if (inShift) {
1259 if ((ch == '-') || !B64CHAR(ch)) {
1260 inShift = 0;
1261 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001262
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001263 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1264 if (bitsleft >= 6) {
1265 /* The shift sequence has a partial character in it. If
1266 bitsleft < 6 then we could just classify it as padding
1267 but that is not the case here */
1268
1269 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001270 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001271 }
1272 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001273 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001274 here so indicate the potential of a misencoded character. */
1275
1276 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1277 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1278 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001279 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001280 }
1281
1282 if (ch == '-') {
1283 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001284 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001285 inShift = 1;
1286 }
1287 } else if (SPECIAL(ch,0,0)) {
1288 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001289 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001290 } else {
1291 *p++ = ch;
1292 }
1293 } else {
1294 charsleft = (charsleft << 6) | UB64(ch);
1295 bitsleft += 6;
1296 s++;
1297 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1298 }
1299 }
1300 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001302 s++;
1303 if (s < e && *s == '-') {
1304 s++;
1305 *p++ = '+';
1306 } else
1307 {
1308 inShift = 1;
1309 bitsleft = 0;
1310 }
1311 }
1312 else if (SPECIAL(ch,0,0)) {
1313 errmsg = "unexpected special character";
1314 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001315 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001316 }
1317 else {
1318 *p++ = ch;
1319 s++;
1320 }
1321 continue;
1322 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001323 outpos = p-PyUnicode_AS_UNICODE(unicode);
1324 endinpos = s-starts;
1325 if (unicode_decode_call_errorhandler(
1326 errors, &errorHandler,
1327 "utf7", errmsg,
1328 starts, size, &startinpos, &endinpos, &exc, &s,
1329 (PyObject **)&unicode, &outpos, &p))
1330 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001331 }
1332
1333 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 outpos = p-PyUnicode_AS_UNICODE(unicode);
1335 endinpos = size;
1336 if (unicode_decode_call_errorhandler(
1337 errors, &errorHandler,
1338 "utf7", "unterminated shift sequence",
1339 starts, size, &startinpos, &endinpos, &exc, &s,
1340 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001341 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001342 if (s < e)
1343 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001344 }
1345
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001346 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001347 goto onError;
1348
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 Py_XDECREF(errorHandler);
1350 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001351 return (PyObject *)unicode;
1352
1353onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 Py_XDECREF(errorHandler);
1355 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001356 Py_DECREF(unicode);
1357 return NULL;
1358}
1359
1360
1361PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001363 int encodeSetO,
1364 int encodeWhiteSpace,
1365 const char *errors)
1366{
1367 PyObject *v;
1368 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001369 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001370 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001371 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001372 unsigned int bitsleft = 0;
1373 unsigned long charsleft = 0;
1374 char * out;
1375 char * start;
1376
1377 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001378 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001379
Walter Dörwald51ab4142007-05-05 14:43:36 +00001380 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001381 if (v == NULL)
1382 return NULL;
1383
Walter Dörwald51ab4142007-05-05 14:43:36 +00001384 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001385 for (;i < size; ++i) {
1386 Py_UNICODE ch = s[i];
1387
1388 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001389 if (ch == '+') {
1390 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001391 *out++ = '-';
1392 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1393 charsleft = ch;
1394 bitsleft = 16;
1395 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001396 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001397 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001398 } else {
1399 *out++ = (char) ch;
1400 }
1401 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001402 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1403 *out++ = B64(charsleft << (6-bitsleft));
1404 charsleft = 0;
1405 bitsleft = 0;
1406 /* Characters not in the BASE64 set implicitly unshift the sequence
1407 so no '-' is required, except if the character is itself a '-' */
1408 if (B64CHAR(ch) || ch == '-') {
1409 *out++ = '-';
1410 }
1411 inShift = 0;
1412 *out++ = (char) ch;
1413 } else {
1414 bitsleft += 16;
1415 charsleft = (charsleft << 16) | ch;
1416 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1417
1418 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001419 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 or '-' then the shift sequence will be terminated implicitly and we
1421 don't have to insert a '-'. */
1422
1423 if (bitsleft == 0) {
1424 if (i + 1 < size) {
1425 Py_UNICODE ch2 = s[i+1];
1426
1427 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001428
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001429 } else if (B64CHAR(ch2) || ch2 == '-') {
1430 *out++ = '-';
1431 inShift = 0;
1432 } else {
1433 inShift = 0;
1434 }
1435
1436 }
1437 else {
1438 *out++ = '-';
1439 inShift = 0;
1440 }
1441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001443 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001444 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001445 if (bitsleft) {
1446 *out++= B64(charsleft << (6-bitsleft) );
1447 *out++ = '-';
1448 }
1449
Walter Dörwald51ab4142007-05-05 14:43:36 +00001450 if (PyBytes_Resize(v, out - start)) {
1451 Py_DECREF(v);
1452 return NULL;
1453 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001454 return v;
1455}
1456
/* The UTF-7 helper macros are not needed past this point. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464/* --- UTF-8 Codec -------------------------------------------------------- */
1465
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five, six bytes; 0xFE and 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1487
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001489 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 const char *errors)
1491{
Walter Dörwald69652032004-09-07 20:24:22 +00001492 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1493}
1494
1495PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001496 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001497 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001498 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001502 Py_ssize_t startinpos;
1503 Py_ssize_t endinpos;
1504 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 const char *e;
1506 PyUnicodeObject *unicode;
1507 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001508 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 PyObject *errorHandler = NULL;
1510 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511
1512 /* Note: size will always be longer than the resulting Unicode
1513 character count */
1514 unicode = _PyUnicode_New(size);
1515 if (!unicode)
1516 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001517 if (size == 0) {
1518 if (consumed)
1519 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522
1523 /* Unpack UTF-8 encoded data */
1524 p = unicode->str;
1525 e = s + size;
1526
1527 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529
1530 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001531 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 s++;
1533 continue;
1534 }
1535
1536 n = utf8_code_length[ch];
1537
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001539 if (consumed)
1540 break;
1541 else {
1542 errmsg = "unexpected end of data";
1543 startinpos = s-starts;
1544 endinpos = size;
1545 goto utf8Error;
1546 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548
1549 switch (n) {
1550
1551 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001552 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 startinpos = s-starts;
1554 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001555 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556
1557 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001558 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 startinpos = s-starts;
1560 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001561 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562
1563 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001564 if ((s[1] & 0xc0) != 0x80) {
1565 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001566 startinpos = s-starts;
1567 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001568 goto utf8Error;
1569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001571 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 startinpos = s-starts;
1573 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001574 errmsg = "illegal encoding";
1575 goto utf8Error;
1576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001578 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 break;
1580
1581 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001582 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001583 (s[2] & 0xc0) != 0x80) {
1584 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001585 startinpos = s-starts;
1586 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001587 goto utf8Error;
1588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001590 if (ch < 0x0800) {
1591 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001592 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001593
1594 XXX For wide builds (UCS-4) we should probably try
1595 to recombine the surrogates into a single code
1596 unit.
1597 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001599 startinpos = s-starts;
1600 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001601 goto utf8Error;
1602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001604 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001605 break;
1606
1607 case 4:
1608 if ((s[1] & 0xc0) != 0x80 ||
1609 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001610 (s[3] & 0xc0) != 0x80) {
1611 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 startinpos = s-starts;
1613 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001614 goto utf8Error;
1615 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001616 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1617 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1618 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001619 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001620 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001621 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001622 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001623 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001624 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001625 startinpos = s-starts;
1626 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 goto utf8Error;
1628 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001629#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001630 *p++ = (Py_UNICODE)ch;
1631#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001632 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001633
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001634 /* translate from 10000..10FFFF to 0..FFFF */
1635 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001636
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001637 /* high surrogate = top 10 bits added to D800 */
1638 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001639
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001640 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001641 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 break;
1644
1645 default:
1646 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001647 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 startinpos = s-starts;
1649 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001650 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 }
1652 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001654
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001655 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 outpos = p-PyUnicode_AS_UNICODE(unicode);
1657 if (unicode_decode_call_errorhandler(
1658 errors, &errorHandler,
1659 "utf8", errmsg,
1660 starts, size, &startinpos, &endinpos, &exc, &s,
1661 (PyObject **)&unicode, &outpos, &p))
1662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663 }
Walter Dörwald69652032004-09-07 20:24:22 +00001664 if (consumed)
1665 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666
1667 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001668 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 goto onError;
1670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 Py_XDECREF(errorHandler);
1672 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 return (PyObject *)unicode;
1674
1675onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 Py_XDECREF(errorHandler);
1677 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 Py_DECREF(unicode);
1679 return NULL;
1680}
1681
Tim Peters602f7402002-04-27 18:03:26 +00001682/* Allocation strategy: if the string is short, convert into a stack buffer
1683 and allocate exactly as much space needed at the end. Else allocate the
1684 maximum possible needed (4 result bytes per Unicode character), and return
1685 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001686*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001687PyObject *
1688PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691{
Tim Peters602f7402002-04-27 18:03:26 +00001692#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001693
Martin v. Löwis18e16552006-02-15 17:27:45 +00001694 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001695 PyObject *v; /* result string object */
1696 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001697 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001698 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001699 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001700
Tim Peters602f7402002-04-27 18:03:26 +00001701 assert(s != NULL);
1702 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703
Tim Peters602f7402002-04-27 18:03:26 +00001704 if (size <= MAX_SHORT_UNICHARS) {
1705 /* Write into the stack buffer; nallocated can't overflow.
1706 * At the end, we'll allocate exactly as much heap space as it
1707 * turns out we need.
1708 */
1709 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1710 v = NULL; /* will allocate after we're done */
1711 p = stackbuf;
1712 }
1713 else {
1714 /* Overallocate on the heap, and give the excess back at the end. */
1715 nallocated = size * 4;
1716 if (nallocated / 4 != size) /* overflow! */
1717 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001718 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001719 if (v == NULL)
1720 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001721 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001722 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001723
Tim Peters602f7402002-04-27 18:03:26 +00001724 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001725 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001726
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001727 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001728 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001730
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001732 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001733 *p++ = (char)(0xc0 | (ch >> 6));
1734 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001735 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001736 else {
Tim Peters602f7402002-04-27 18:03:26 +00001737 /* Encode UCS2 Unicode ordinals */
1738 if (ch < 0x10000) {
1739 /* Special case: check for high surrogate */
1740 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1741 Py_UCS4 ch2 = s[i];
1742 /* Check for low surrogate and combine the two to
1743 form a UCS4 value */
1744 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001745 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001746 i++;
1747 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001748 }
Tim Peters602f7402002-04-27 18:03:26 +00001749 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001750 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001751 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001752 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1753 *p++ = (char)(0x80 | (ch & 0x3f));
1754 continue;
1755 }
1756encodeUCS4:
1757 /* Encode UCS4 Unicode ordinals */
1758 *p++ = (char)(0xf0 | (ch >> 18));
1759 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1760 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1761 *p++ = (char)(0x80 | (ch & 0x3f));
1762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001764
Tim Peters602f7402002-04-27 18:03:26 +00001765 if (v == NULL) {
1766 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001767 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001768 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001769 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001770 }
1771 else {
1772 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001773 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001774 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001775 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001778
Tim Peters602f7402002-04-27 18:03:26 +00001779#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780}
1781
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1783{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if (!PyUnicode_Check(unicode)) {
1785 PyErr_BadArgument();
1786 return NULL;
1787 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001788 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1789 PyUnicode_GET_SIZE(unicode),
1790 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791}
1792
1793/* --- UTF-16 Codec ------------------------------------------------------- */
1794
Tim Peters772747b2001-08-09 22:21:55 +00001795PyObject *
1796PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001797 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001798 const char *errors,
1799 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800{
Walter Dörwald69652032004-09-07 20:24:22 +00001801 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1802}
1803
1804PyObject *
1805PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001806 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001807 const char *errors,
1808 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001809 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001812 Py_ssize_t startinpos;
1813 Py_ssize_t endinpos;
1814 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 PyUnicodeObject *unicode;
1816 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001817 const unsigned char *q, *e;
1818 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001819 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001820 /* Offsets from q for retrieving byte pairs in the right order. */
1821#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1822 int ihi = 1, ilo = 0;
1823#else
1824 int ihi = 0, ilo = 1;
1825#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 PyObject *errorHandler = NULL;
1827 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828
1829 /* Note: size will always be longer than the resulting Unicode
1830 character count */
1831 unicode = _PyUnicode_New(size);
1832 if (!unicode)
1833 return NULL;
1834 if (size == 0)
1835 return (PyObject *)unicode;
1836
1837 /* Unpack UTF-16 encoded data */
1838 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001839 q = (unsigned char *)s;
1840 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841
1842 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001843 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001845 /* Check for BOM marks (U+FEFF) in the input and adjust current
1846 byte order setting accordingly. In native mode, the leading BOM
1847 mark is skipped, in all other modes, it is copied to the output
1848 stream as-is (giving a ZWNBSP character). */
1849 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001850 if (size >= 2) {
1851 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001852#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001853 if (bom == 0xFEFF) {
1854 q += 2;
1855 bo = -1;
1856 }
1857 else if (bom == 0xFFFE) {
1858 q += 2;
1859 bo = 1;
1860 }
Tim Petersced69f82003-09-16 20:30:58 +00001861#else
Walter Dörwald69652032004-09-07 20:24:22 +00001862 if (bom == 0xFEFF) {
1863 q += 2;
1864 bo = 1;
1865 }
1866 else if (bom == 0xFFFE) {
1867 q += 2;
1868 bo = -1;
1869 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001870#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001871 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
Tim Peters772747b2001-08-09 22:21:55 +00001874 if (bo == -1) {
1875 /* force LE */
1876 ihi = 1;
1877 ilo = 0;
1878 }
1879 else if (bo == 1) {
1880 /* force BE */
1881 ihi = 0;
1882 ilo = 1;
1883 }
1884
1885 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001887 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001889 if (consumed)
1890 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 errmsg = "truncated data";
1892 startinpos = ((const char *)q)-starts;
1893 endinpos = ((const char *)e)-starts;
1894 goto utf16Error;
1895 /* The remaining input chars are ignored if the callback
1896 chooses to skip the input */
1897 }
1898 ch = (q[ihi] << 8) | q[ilo];
1899
Tim Peters772747b2001-08-09 22:21:55 +00001900 q += 2;
1901
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 if (ch < 0xD800 || ch > 0xDFFF) {
1903 *p++ = ch;
1904 continue;
1905 }
1906
1907 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001908 if (q >= e) {
1909 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001910 startinpos = (((const char *)q)-2)-starts;
1911 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001912 goto utf16Error;
1913 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001914 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001915 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1916 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001917 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001918#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001919 *p++ = ch;
1920 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001921#else
1922 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001923#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001924 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925 }
1926 else {
1927 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001928 startinpos = (((const char *)q)-4)-starts;
1929 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001930 goto utf16Error;
1931 }
1932
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001934 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 startinpos = (((const char *)q)-2)-starts;
1936 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 /* Fall through to report the error */
1938
1939 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 outpos = p-PyUnicode_AS_UNICODE(unicode);
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "utf16", errmsg,
1944 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1945 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 }
1948
1949 if (byteorder)
1950 *byteorder = bo;
1951
Walter Dörwald69652032004-09-07 20:24:22 +00001952 if (consumed)
1953 *consumed = (const char *)q-starts;
1954
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001956 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 goto onError;
1958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 Py_XDECREF(errorHandler);
1960 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 return (PyObject *)unicode;
1962
1963onError:
1964 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 Py_XDECREF(errorHandler);
1966 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 return NULL;
1968}
1969
Tim Peters772747b2001-08-09 22:21:55 +00001970PyObject *
1971PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001972 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001973 const char *errors,
1974 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975{
1976 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001977 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001978#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001979 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001980#else
1981 const int pairs = 0;
1982#endif
Tim Peters772747b2001-08-09 22:21:55 +00001983 /* Offsets from p for storing byte pairs in the right order. */
1984#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1985 int ihi = 1, ilo = 0;
1986#else
1987 int ihi = 0, ilo = 1;
1988#endif
1989
1990#define STORECHAR(CH) \
1991 do { \
1992 p[ihi] = ((CH) >> 8) & 0xff; \
1993 p[ilo] = (CH) & 0xff; \
1994 p += 2; \
1995 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001997#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001998 for (i = pairs = 0; i < size; i++)
1999 if (s[i] >= 0x10000)
2000 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002001#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002002 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002003 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 if (v == NULL)
2005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006
Walter Dörwald3cc34522007-05-04 10:48:27 +00002007 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002009 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002010 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002011 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002012
2013 if (byteorder == -1) {
2014 /* force LE */
2015 ihi = 1;
2016 ilo = 0;
2017 }
2018 else if (byteorder == 1) {
2019 /* force BE */
2020 ihi = 0;
2021 ilo = 1;
2022 }
2023
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002024 while (size-- > 0) {
2025 Py_UNICODE ch = *s++;
2026 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002027#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002028 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002029 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2030 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002032#endif
Tim Peters772747b2001-08-09 22:21:55 +00002033 STORECHAR(ch);
2034 if (ch2)
2035 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002038#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039}
2040
2041PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2042{
2043 if (!PyUnicode_Check(unicode)) {
2044 PyErr_BadArgument();
2045 return NULL;
2046 }
2047 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2048 PyUnicode_GET_SIZE(unicode),
2049 NULL,
2050 0);
2051}
2052
2053/* --- Unicode Escape Codec ----------------------------------------------- */
2054
Fredrik Lundh06d12682001-01-24 07:59:11 +00002055static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002056
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002058 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 const char *errors)
2060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002062 Py_ssize_t startinpos;
2063 Py_ssize_t endinpos;
2064 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002065 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002069 char* message;
2070 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 PyObject *errorHandler = NULL;
2072 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002073
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 /* Escaped strings will always be longer than the resulting
2075 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 length after conversion to the true value.
2077 (but if the error callback returns a long replacement string
2078 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 v = _PyUnicode_New(size);
2080 if (v == NULL)
2081 goto onError;
2082 if (size == 0)
2083 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002087
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088 while (s < end) {
2089 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002090 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092
2093 /* Non-escape characters are interpreted as Unicode ordinals */
2094 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002095 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 continue;
2097 }
2098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 /* \ - Escapes */
2101 s++;
2102 switch (*s++) {
2103
2104 /* \x escapes */
2105 case '\n': break;
2106 case '\\': *p++ = '\\'; break;
2107 case '\'': *p++ = '\''; break;
2108 case '\"': *p++ = '\"'; break;
2109 case 'b': *p++ = '\b'; break;
2110 case 'f': *p++ = '\014'; break; /* FF */
2111 case 't': *p++ = '\t'; break;
2112 case 'n': *p++ = '\n'; break;
2113 case 'r': *p++ = '\r'; break;
2114 case 'v': *p++ = '\013'; break; /* VT */
2115 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2116
2117 /* \OOO (octal) escapes */
2118 case '0': case '1': case '2': case '3':
2119 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002120 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002122 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002124 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002126 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 break;
2128
Fredrik Lundhccc74732001-02-18 22:13:49 +00002129 /* hex escapes */
2130 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002132 digits = 2;
2133 message = "truncated \\xXX escape";
2134 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135
Fredrik Lundhccc74732001-02-18 22:13:49 +00002136 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002138 digits = 4;
2139 message = "truncated \\uXXXX escape";
2140 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141
Fredrik Lundhccc74732001-02-18 22:13:49 +00002142 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002143 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002144 digits = 8;
2145 message = "truncated \\UXXXXXXXX escape";
2146 hexescape:
2147 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002148 outpos = p-PyUnicode_AS_UNICODE(v);
2149 if (s+digits>end) {
2150 endinpos = size;
2151 if (unicode_decode_call_errorhandler(
2152 errors, &errorHandler,
2153 "unicodeescape", "end of string in escape sequence",
2154 starts, size, &startinpos, &endinpos, &exc, &s,
2155 (PyObject **)&v, &outpos, &p))
2156 goto onError;
2157 goto nextByte;
2158 }
2159 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002160 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002161 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 endinpos = (s+i+1)-starts;
2163 if (unicode_decode_call_errorhandler(
2164 errors, &errorHandler,
2165 "unicodeescape", message,
2166 starts, size, &startinpos, &endinpos, &exc, &s,
2167 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002168 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002170 }
2171 chr = (chr<<4) & ~0xF;
2172 if (c >= '0' && c <= '9')
2173 chr += c - '0';
2174 else if (c >= 'a' && c <= 'f')
2175 chr += 10 + c - 'a';
2176 else
2177 chr += 10 + c - 'A';
2178 }
2179 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002180 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 /* _decoding_error will have already written into the
2182 target buffer. */
2183 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002184 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002185 /* when we get here, chr is a 32-bit unicode character */
2186 if (chr <= 0xffff)
2187 /* UCS-2 character */
2188 *p++ = (Py_UNICODE) chr;
2189 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002190 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002191 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002193 *p++ = chr;
2194#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002195 chr -= 0x10000L;
2196 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002197 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002198#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002199 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 endinpos = s-starts;
2201 outpos = p-PyUnicode_AS_UNICODE(v);
2202 if (unicode_decode_call_errorhandler(
2203 errors, &errorHandler,
2204 "unicodeescape", "illegal Unicode character",
2205 starts, size, &startinpos, &endinpos, &exc, &s,
2206 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002207 goto onError;
2208 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002209 break;
2210
2211 /* \N{name} */
2212 case 'N':
2213 message = "malformed \\N character escape";
2214 if (ucnhash_CAPI == NULL) {
2215 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002216 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002217 m = PyImport_ImportModule("unicodedata");
2218 if (m == NULL)
2219 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002220 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002221 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002222 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002223 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002224 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002225 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002226 if (ucnhash_CAPI == NULL)
2227 goto ucnhashError;
2228 }
2229 if (*s == '{') {
2230 const char *start = s+1;
2231 /* look for the closing brace */
2232 while (*s != '}' && s < end)
2233 s++;
2234 if (s > start && s < end && *s == '}') {
2235 /* found a name. look it up in the unicode database */
2236 message = "unknown Unicode character name";
2237 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002238 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002239 goto store;
2240 }
2241 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 endinpos = s-starts;
2243 outpos = p-PyUnicode_AS_UNICODE(v);
2244 if (unicode_decode_call_errorhandler(
2245 errors, &errorHandler,
2246 "unicodeescape", message,
2247 starts, size, &startinpos, &endinpos, &exc, &s,
2248 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002249 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002250 break;
2251
2252 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002253 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 message = "\\ at end of string";
2255 s--;
2256 endinpos = s-starts;
2257 outpos = p-PyUnicode_AS_UNICODE(v);
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "unicodeescape", message,
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002263 goto onError;
2264 }
2265 else {
2266 *p++ = '\\';
2267 *p++ = (unsigned char)s[-1];
2268 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002269 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 nextByte:
2272 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002274 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002276 Py_XDECREF(errorHandler);
2277 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002279
Fredrik Lundhccc74732001-02-18 22:13:49 +00002280ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002281 PyErr_SetString(
2282 PyExc_UnicodeError,
2283 "\\N escapes not supported (can't load unicodedata module)"
2284 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002285 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 Py_XDECREF(errorHandler);
2287 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002288 return NULL;
2289
Fredrik Lundhccc74732001-02-18 22:13:49 +00002290onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 Py_XDECREF(errorHandler);
2293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 return NULL;
2295}
2296
2297/* Return a Unicode-Escape string version of the Unicode object.
2298
2299 If quotes is true, the string is enclosed in u"" or u'' quotes as
2300 appropriate.
2301
2302*/
2303
Thomas Wouters477c8d52006-05-27 19:21:47 +00002304Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2305 Py_ssize_t size,
2306 Py_UNICODE ch)
2307{
2308 /* like wcschr, but doesn't stop at NULL characters */
2309
2310 while (size-- > 0) {
2311 if (*s == ch)
2312 return s;
2313 s++;
2314 }
2315
2316 return NULL;
2317}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002318
Walter Dörwald79e913e2007-05-12 11:08:06 +00002319static const char *hexdigits = "0123456789abcdef";
2320
2321PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2322 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323{
2324 PyObject *repr;
2325 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326
Thomas Wouters89f507f2006-12-13 04:49:30 +00002327 /* XXX(nnorwitz): rather than over-allocating, it would be
2328 better to choose a different scheme. Perhaps scan the
2329 first N-chars of the string and allocate based on that size.
2330 */
2331 /* Initial allocation is based on the longest-possible unichr
2332 escape.
2333
2334 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2335 unichr, so in this case it's the longest unichr escape. In
2336 narrow (UTF-16) builds this is five chars per source unichr
2337 since there are two unichrs in the surrogate pair, so in narrow
2338 (UTF-16) builds it's not the longest unichr escape.
2339
2340 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2341 so in the narrow (UTF-16) build case it's the longest unichr
2342 escape.
2343 */
2344
Walter Dörwald79e913e2007-05-12 11:08:06 +00002345 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002346#ifdef Py_UNICODE_WIDE
2347 + 10*size
2348#else
2349 + 6*size
2350#endif
2351 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 if (repr == NULL)
2353 return NULL;
2354
Walter Dörwald79e913e2007-05-12 11:08:06 +00002355 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 while (size-- > 0) {
2358 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002359
Walter Dörwald79e913e2007-05-12 11:08:06 +00002360 /* Escape backslashes */
2361 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 *p++ = '\\';
2363 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002364 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002365 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002366
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002367#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002368 /* Map 21-bit characters to '\U00xxxxxx' */
2369 else if (ch >= 0x10000) {
2370 *p++ = '\\';
2371 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002372 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2373 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2374 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2375 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2376 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2377 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2378 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2379 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002380 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002381 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002382#else
2383 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002384 else if (ch >= 0xD800 && ch < 0xDC00) {
2385 Py_UNICODE ch2;
2386 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002388 ch2 = *s++;
2389 size--;
2390 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2391 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2392 *p++ = '\\';
2393 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002394 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2395 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2396 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2397 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2398 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2399 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2400 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2401 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002402 continue;
2403 }
2404 /* Fall through: isolated surrogates are copied as-is */
2405 s--;
2406 size++;
2407 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002408#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002409
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002411 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 *p++ = '\\';
2413 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002414 *p++ = hexdigits[(ch >> 12) & 0x000F];
2415 *p++ = hexdigits[(ch >> 8) & 0x000F];
2416 *p++ = hexdigits[(ch >> 4) & 0x000F];
2417 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002419
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002420 /* Map special whitespace to '\t', \n', '\r' */
2421 else if (ch == '\t') {
2422 *p++ = '\\';
2423 *p++ = 't';
2424 }
2425 else if (ch == '\n') {
2426 *p++ = '\\';
2427 *p++ = 'n';
2428 }
2429 else if (ch == '\r') {
2430 *p++ = '\\';
2431 *p++ = 'r';
2432 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002433
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002434 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002435 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002437 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002438 *p++ = hexdigits[(ch >> 4) & 0x000F];
2439 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002440 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002441
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 /* Copy everything else as-is */
2443 else
2444 *p++ = (char) ch;
2445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446
2447 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002448 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2449 Py_DECREF(repr);
2450 return NULL;
2451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 return repr;
2453}
2454
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2456{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002457 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 if (!PyUnicode_Check(unicode)) {
2459 PyErr_BadArgument();
2460 return NULL;
2461 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002462 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2463 PyUnicode_GET_SIZE(unicode));
2464
2465 if (!s)
2466 return NULL;
2467 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2468 PyBytes_GET_SIZE(s));
2469 Py_DECREF(s);
2470 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471}
2472
2473/* --- Raw Unicode Escape Codec ------------------------------------------- */
2474
2475PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002476 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 const char *errors)
2478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002480 Py_ssize_t startinpos;
2481 Py_ssize_t endinpos;
2482 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 const char *end;
2486 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 PyObject *errorHandler = NULL;
2488 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002489
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 /* Escaped strings will always be longer than the resulting
2491 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 length after conversion to the true value. (But decoding error
2493 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 v = _PyUnicode_New(size);
2495 if (v == NULL)
2496 goto onError;
2497 if (size == 0)
2498 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 end = s + size;
2501 while (s < end) {
2502 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002503 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002505 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 /* Non-escape characters are interpreted as Unicode ordinals */
2508 if (*s != '\\') {
2509 *p++ = (unsigned char)*s++;
2510 continue;
2511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002512 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
2514 /* \u-escapes are only interpreted iff the number of leading
2515 backslashes if odd */
2516 bs = s;
2517 for (;s < end;) {
2518 if (*s != '\\')
2519 break;
2520 *p++ = (unsigned char)*s++;
2521 }
2522 if (((s - bs) & 1) == 0 ||
2523 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002524 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 continue;
2526 }
2527 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002528 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 s++;
2530
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002531 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002533 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 endinpos = s-starts;
2537 if (unicode_decode_call_errorhandler(
2538 errors, &errorHandler,
2539 "rawunicodeescape", "truncated \\uXXXX",
2540 starts, size, &startinpos, &endinpos, &exc, &s,
2541 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 }
2545 x = (x<<4) & ~0xF;
2546 if (c >= '0' && c <= '9')
2547 x += c - '0';
2548 else if (c >= 'a' && c <= 'f')
2549 x += 10 + c - 'a';
2550 else
2551 x += 10 + c - 'A';
2552 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002553#ifndef Py_UNICODE_WIDE
2554 if (x > 0x10000) {
2555 if (unicode_decode_call_errorhandler(
2556 errors, &errorHandler,
2557 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2558 starts, size, &startinpos, &endinpos, &exc, &s,
2559 (PyObject **)&v, &outpos, &p))
2560 goto onError;
2561 }
2562#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 *p++ = x;
2564 nextByte:
2565 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002567 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002568 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002569 Py_XDECREF(errorHandler);
2570 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002572
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 onError:
2574 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002575 Py_XDECREF(errorHandler);
2576 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577 return NULL;
2578}
2579
2580PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582{
2583 PyObject *repr;
2584 char *p;
2585 char *q;
2586
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002587#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002588 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002589#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002590 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002591#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 if (repr == NULL)
2593 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002594 if (size == 0)
2595 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596
Walter Dörwald711005d2007-05-12 12:03:26 +00002597 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 while (size-- > 0) {
2599 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002600#ifdef Py_UNICODE_WIDE
2601 /* Map 32-bit characters to '\Uxxxxxxxx' */
2602 if (ch >= 0x10000) {
2603 *p++ = '\\';
2604 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002605 *p++ = hexdigits[(ch >> 28) & 0xf];
2606 *p++ = hexdigits[(ch >> 24) & 0xf];
2607 *p++ = hexdigits[(ch >> 20) & 0xf];
2608 *p++ = hexdigits[(ch >> 16) & 0xf];
2609 *p++ = hexdigits[(ch >> 12) & 0xf];
2610 *p++ = hexdigits[(ch >> 8) & 0xf];
2611 *p++ = hexdigits[(ch >> 4) & 0xf];
2612 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002613 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002614 else
2615#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 /* Map 16-bit characters to '\uxxxx' */
2617 if (ch >= 256) {
2618 *p++ = '\\';
2619 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002620 *p++ = hexdigits[(ch >> 12) & 0xf];
2621 *p++ = hexdigits[(ch >> 8) & 0xf];
2622 *p++ = hexdigits[(ch >> 4) & 0xf];
2623 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 }
2625 /* Copy everything else as-is */
2626 else
2627 *p++ = (char) ch;
2628 }
2629 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002630 if (PyBytes_Resize(repr, p - q)) {
2631 Py_DECREF(repr);
2632 return NULL;
2633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 return repr;
2635}
2636
2637PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2638{
Walter Dörwald711005d2007-05-12 12:03:26 +00002639 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002641 PyErr_BadArgument();
2642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002644 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2645 PyUnicode_GET_SIZE(unicode));
2646
2647 if (!s)
2648 return NULL;
2649 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2650 PyBytes_GET_SIZE(s));
2651 Py_DECREF(s);
2652 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653}
2654
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002655/* --- Unicode Internal Codec ------------------------------------------- */
2656
2657PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002658 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002659 const char *errors)
2660{
2661 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002662 Py_ssize_t startinpos;
2663 Py_ssize_t endinpos;
2664 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002665 PyUnicodeObject *v;
2666 Py_UNICODE *p;
2667 const char *end;
2668 const char *reason;
2669 PyObject *errorHandler = NULL;
2670 PyObject *exc = NULL;
2671
Neal Norwitzd43069c2006-01-08 01:12:10 +00002672#ifdef Py_UNICODE_WIDE
2673 Py_UNICODE unimax = PyUnicode_GetMax();
2674#endif
2675
Thomas Wouters89f507f2006-12-13 04:49:30 +00002676 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002677 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2678 if (v == NULL)
2679 goto onError;
2680 if (PyUnicode_GetSize((PyObject *)v) == 0)
2681 return (PyObject *)v;
2682 p = PyUnicode_AS_UNICODE(v);
2683 end = s + size;
2684
2685 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002686 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002687 /* We have to sanity check the raw data, otherwise doom looms for
2688 some malformed UCS-4 data. */
2689 if (
2690 #ifdef Py_UNICODE_WIDE
2691 *p > unimax || *p < 0 ||
2692 #endif
2693 end-s < Py_UNICODE_SIZE
2694 )
2695 {
2696 startinpos = s - starts;
2697 if (end-s < Py_UNICODE_SIZE) {
2698 endinpos = end-starts;
2699 reason = "truncated input";
2700 }
2701 else {
2702 endinpos = s - starts + Py_UNICODE_SIZE;
2703 reason = "illegal code point (> 0x10FFFF)";
2704 }
2705 outpos = p - PyUnicode_AS_UNICODE(v);
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "unicode_internal", reason,
2709 starts, size, &startinpos, &endinpos, &exc, &s,
2710 (PyObject **)&v, &outpos, &p)) {
2711 goto onError;
2712 }
2713 }
2714 else {
2715 p++;
2716 s += Py_UNICODE_SIZE;
2717 }
2718 }
2719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002720 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002721 goto onError;
2722 Py_XDECREF(errorHandler);
2723 Py_XDECREF(exc);
2724 return (PyObject *)v;
2725
2726 onError:
2727 Py_XDECREF(v);
2728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
2730 return NULL;
2731}
2732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733/* --- Latin-1 Codec ------------------------------------------------------ */
2734
2735PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002736 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 const char *errors)
2738{
2739 PyUnicodeObject *v;
2740 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002741
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002743 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002744 Py_UNICODE r = *(unsigned char*)s;
2745 return PyUnicode_FromUnicode(&r, 1);
2746 }
2747
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 v = _PyUnicode_New(size);
2749 if (v == NULL)
2750 goto onError;
2751 if (size == 0)
2752 return (PyObject *)v;
2753 p = PyUnicode_AS_UNICODE(v);
2754 while (size-- > 0)
2755 *p++ = (unsigned char)*s++;
2756 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002757
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 onError:
2759 Py_XDECREF(v);
2760 return NULL;
2761}
2762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763/* create or adjust a UnicodeEncodeError */
2764static void make_encode_exception(PyObject **exceptionObject,
2765 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 const Py_UNICODE *unicode, Py_ssize_t size,
2767 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 if (*exceptionObject == NULL) {
2771 *exceptionObject = PyUnicodeEncodeError_Create(
2772 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2776 goto onError;
2777 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2778 goto onError;
2779 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2780 goto onError;
2781 return;
2782 onError:
2783 Py_DECREF(*exceptionObject);
2784 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 }
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* raises a UnicodeEncodeError */
2789static void raise_encode_exception(PyObject **exceptionObject,
2790 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002791 const Py_UNICODE *unicode, Py_ssize_t size,
2792 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 const char *reason)
2794{
2795 make_encode_exception(exceptionObject,
2796 encoding, unicode, size, startpos, endpos, reason);
2797 if (*exceptionObject != NULL)
2798 PyCodec_StrictErrors(*exceptionObject);
2799}
2800
2801/* error handling callback helper:
2802 build arguments, call the callback and check the arguments,
2803 put the result into newpos and return the replacement string, which
2804 has to be freed by the caller */
2805static PyObject *unicode_encode_call_errorhandler(const char *errors,
2806 PyObject **errorHandler,
2807 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002808 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2809 Py_ssize_t startpos, Py_ssize_t endpos,
2810 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002812 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813
2814 PyObject *restuple;
2815 PyObject *resunicode;
2816
2817 if (*errorHandler == NULL) {
2818 *errorHandler = PyCodec_LookupError(errors);
2819 if (*errorHandler == NULL)
2820 return NULL;
2821 }
2822
2823 make_encode_exception(exceptionObject,
2824 encoding, unicode, size, startpos, endpos, reason);
2825 if (*exceptionObject == NULL)
2826 return NULL;
2827
2828 restuple = PyObject_CallFunctionObjArgs(
2829 *errorHandler, *exceptionObject, NULL);
2830 if (restuple == NULL)
2831 return NULL;
2832 if (!PyTuple_Check(restuple)) {
2833 PyErr_Format(PyExc_TypeError, &argparse[4]);
2834 Py_DECREF(restuple);
2835 return NULL;
2836 }
2837 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2838 &resunicode, newpos)) {
2839 Py_DECREF(restuple);
2840 return NULL;
2841 }
2842 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002843 *newpos = size+*newpos;
2844 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002846 Py_DECREF(restuple);
2847 return NULL;
2848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 Py_INCREF(resunicode);
2850 Py_DECREF(restuple);
2851 return resunicode;
2852}
2853
2854static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 const char *errors,
2857 int limit)
2858{
2859 /* output object */
2860 PyObject *res;
2861 /* pointers to the beginning and end+1 of input */
2862 const Py_UNICODE *startp = p;
2863 const Py_UNICODE *endp = p + size;
2864 /* pointer to the beginning of the unencodable characters */
2865 /* const Py_UNICODE *badp = NULL; */
2866 /* pointer into the output */
2867 char *str;
2868 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002869 Py_ssize_t respos = 0;
2870 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002871 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2872 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 PyObject *errorHandler = NULL;
2874 PyObject *exc = NULL;
2875 /* the following variable is used for caching string comparisons
2876 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2877 int known_errorHandler = -1;
2878
2879 /* allocate enough for a simple encoding without
2880 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002881 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 if (res == NULL)
2883 goto onError;
2884 if (size == 0)
2885 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002886 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 ressize = size;
2888
2889 while (p<endp) {
2890 Py_UNICODE c = *p;
2891
2892 /* can we encode this? */
2893 if (c<limit) {
2894 /* no overflow check, because we know that the space is enough */
2895 *str++ = (char)c;
2896 ++p;
2897 }
2898 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002899 Py_ssize_t unicodepos = p-startp;
2900 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002901 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002902 Py_ssize_t repsize;
2903 Py_ssize_t newpos;
2904 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002905 Py_UNICODE *uni2;
2906 /* startpos for collecting unencodable chars */
2907 const Py_UNICODE *collstart = p;
2908 const Py_UNICODE *collend = p;
2909 /* find all unecodable characters */
2910 while ((collend < endp) && ((*collend)>=limit))
2911 ++collend;
2912 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2913 if (known_errorHandler==-1) {
2914 if ((errors==NULL) || (!strcmp(errors, "strict")))
2915 known_errorHandler = 1;
2916 else if (!strcmp(errors, "replace"))
2917 known_errorHandler = 2;
2918 else if (!strcmp(errors, "ignore"))
2919 known_errorHandler = 3;
2920 else if (!strcmp(errors, "xmlcharrefreplace"))
2921 known_errorHandler = 4;
2922 else
2923 known_errorHandler = 0;
2924 }
2925 switch (known_errorHandler) {
2926 case 1: /* strict */
2927 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2928 goto onError;
2929 case 2: /* replace */
2930 while (collstart++<collend)
2931 *str++ = '?'; /* fall through */
2932 case 3: /* ignore */
2933 p = collend;
2934 break;
2935 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002936 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 /* determine replacement size (temporarily (mis)uses p) */
2938 for (p = collstart, repsize = 0; p < collend; ++p) {
2939 if (*p<10)
2940 repsize += 2+1+1;
2941 else if (*p<100)
2942 repsize += 2+2+1;
2943 else if (*p<1000)
2944 repsize += 2+3+1;
2945 else if (*p<10000)
2946 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002947#ifndef Py_UNICODE_WIDE
2948 else
2949 repsize += 2+5+1;
2950#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 else if (*p<100000)
2952 repsize += 2+5+1;
2953 else if (*p<1000000)
2954 repsize += 2+6+1;
2955 else
2956 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002957#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 }
2959 requiredsize = respos+repsize+(endp-collend);
2960 if (requiredsize > ressize) {
2961 if (requiredsize<2*ressize)
2962 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002963 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002965 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 ressize = requiredsize;
2967 }
2968 /* generate replacement (temporarily (mis)uses p) */
2969 for (p = collstart; p < collend; ++p) {
2970 str += sprintf(str, "&#%d;", (int)*p);
2971 }
2972 p = collend;
2973 break;
2974 default:
2975 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2976 encoding, reason, startp, size, &exc,
2977 collstart-startp, collend-startp, &newpos);
2978 if (repunicode == NULL)
2979 goto onError;
2980 /* need more space? (at least enough for what we
2981 have+the replacement+the rest of the string, so
2982 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002983 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 repsize = PyUnicode_GET_SIZE(repunicode);
2985 requiredsize = respos+repsize+(endp-collend);
2986 if (requiredsize > ressize) {
2987 if (requiredsize<2*ressize)
2988 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002989 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 Py_DECREF(repunicode);
2991 goto onError;
2992 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002993 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 ressize = requiredsize;
2995 }
2996 /* check if there is anything unencodable in the replacement
2997 and copy it to the output */
2998 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2999 c = *uni2;
3000 if (c >= limit) {
3001 raise_encode_exception(&exc, encoding, startp, size,
3002 unicodepos, unicodepos+1, reason);
3003 Py_DECREF(repunicode);
3004 goto onError;
3005 }
3006 *str = (char)c;
3007 }
3008 p = startp + newpos;
3009 Py_DECREF(repunicode);
3010 }
3011 }
3012 }
3013 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003014 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015 if (respos<ressize)
3016 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003017 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 Py_XDECREF(errorHandler);
3019 Py_XDECREF(exc);
3020 return res;
3021
3022 onError:
3023 Py_XDECREF(res);
3024 Py_XDECREF(errorHandler);
3025 Py_XDECREF(exc);
3026 return NULL;
3027}
3028
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003030 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 const char *errors)
3032{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034}
3035
3036PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3037{
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 return NULL;
3041 }
3042 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3043 PyUnicode_GET_SIZE(unicode),
3044 NULL);
3045}
3046
3047/* --- 7-bit ASCII Codec -------------------------------------------------- */
3048
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 const char *errors)
3052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 PyUnicodeObject *v;
3055 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003056 Py_ssize_t startinpos;
3057 Py_ssize_t endinpos;
3058 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *e;
3060 PyObject *errorHandler = NULL;
3061 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003062
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003064 if (size == 1 && *(unsigned char*)s < 128) {
3065 Py_UNICODE r = *(unsigned char*)s;
3066 return PyUnicode_FromUnicode(&r, 1);
3067 }
Tim Petersced69f82003-09-16 20:30:58 +00003068
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 v = _PyUnicode_New(size);
3070 if (v == NULL)
3071 goto onError;
3072 if (size == 0)
3073 return (PyObject *)v;
3074 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 e = s + size;
3076 while (s < e) {
3077 register unsigned char c = (unsigned char)*s;
3078 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 ++s;
3081 }
3082 else {
3083 startinpos = s-starts;
3084 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003085 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 if (unicode_decode_call_errorhandler(
3087 errors, &errorHandler,
3088 "ascii", "ordinal not in range(128)",
3089 starts, size, &startinpos, &endinpos, &exc, &s,
3090 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003094 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003095 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003096 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 Py_XDECREF(errorHandler);
3098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003100
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 onError:
3102 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 Py_XDECREF(errorHandler);
3104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 return NULL;
3106}
3107
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003109 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 const char *errors)
3111{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113}
3114
3115PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3116{
3117 if (!PyUnicode_Check(unicode)) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 NULL);
3124}
3125
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003126#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003127
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003128/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003129
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003130#if SIZEOF_INT < SIZEOF_SSIZE_T
3131#define NEED_RETRY
3132#endif
3133
3134/* XXX This code is limited to "true" double-byte encodings, as
3135 a) it assumes an incomplete character consists of a single byte, and
3136 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3137 encodings, see IsDBCSLeadByteEx documentation. */
3138
/* Decide whether the byte at s[offset] should be treated as a DBCS
   lead byte.

   Returns 0 when IsDBCSLeadByte() rejects that byte.  Otherwise the
   preceding character is located with CharPrev() and 1 is returned if
   any of the following holds (checked in this order):
     - CharPrev() returned the position itself,
     - the first byte of the preceding character is not a lead byte,
     - the preceding character is exactly two bytes long.
   In every other case 0 is returned. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (pos - before == 2);
}
3149
3150/*
3151 * Decode MBCS string into unicode object. If 'final' is set, converts
3152 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3153 */
3154static int decode_mbcs(PyUnicodeObject **v,
3155 const char *s, /* MBCS string */
3156 int size, /* sizeof MBCS string */
3157 int final)
3158{
3159 Py_UNICODE *p;
3160 Py_ssize_t n = 0;
3161 int usize = 0;
3162
3163 assert(size >= 0);
3164
3165 /* Skip trailing lead-byte unless 'final' is set */
3166 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3167 --size;
3168
3169 /* First get the size of the result */
3170 if (size > 0) {
3171 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3172 if (usize == 0) {
3173 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3174 return -1;
3175 }
3176 }
3177
3178 if (*v == NULL) {
3179 /* Create unicode object */
3180 *v = _PyUnicode_New(usize);
3181 if (*v == NULL)
3182 return -1;
3183 }
3184 else {
3185 /* Extend unicode object */
3186 n = PyUnicode_GET_SIZE(*v);
3187 if (_PyUnicode_Resize(v, n + usize) < 0)
3188 return -1;
3189 }
3190
3191 /* Do the conversion */
3192 if (size > 0) {
3193 p = PyUnicode_AS_UNICODE(*v) + n;
3194 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3195 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3196 return -1;
3197 }
3198 }
3199
3200 return size;
3201}
3202
3203PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3204 Py_ssize_t size,
3205 const char *errors,
3206 Py_ssize_t *consumed)
3207{
3208 PyUnicodeObject *v = NULL;
3209 int done;
3210
3211 if (consumed)
3212 *consumed = 0;
3213
3214#ifdef NEED_RETRY
3215 retry:
3216 if (size > INT_MAX)
3217 done = decode_mbcs(&v, s, INT_MAX, 0);
3218 else
3219#endif
3220 done = decode_mbcs(&v, s, (int)size, !consumed);
3221
3222 if (done < 0) {
3223 Py_XDECREF(v);
3224 return NULL;
3225 }
3226
3227 if (consumed)
3228 *consumed += done;
3229
3230#ifdef NEED_RETRY
3231 if (size > INT_MAX) {
3232 s += done;
3233 size -= done;
3234 goto retry;
3235 }
3236#endif
3237
3238 return (PyObject *)v;
3239}
3240
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003241PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003242 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003243 const char *errors)
3244{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003245 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3246}
3247
3248/*
3249 * Convert unicode into string object (MBCS).
3250 * Returns 0 if succeed, -1 otherwise.
3251 */
3252static int encode_mbcs(PyObject **repr,
3253 const Py_UNICODE *p, /* unicode */
3254 int size) /* size of unicode */
3255{
3256 int mbcssize = 0;
3257 Py_ssize_t n = 0;
3258
3259 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003260
3261 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003262 if (size > 0) {
3263 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3264 if (mbcssize == 0) {
3265 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3266 return -1;
3267 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003268 }
3269
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003270 if (*repr == NULL) {
3271 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003272 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003273 if (*repr == NULL)
3274 return -1;
3275 }
3276 else {
3277 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003278 n = PyBytes_Size(*repr);
3279 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003280 return -1;
3281 }
3282
3283 /* Do the conversion */
3284 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003285 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003286 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3287 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3288 return -1;
3289 }
3290 }
3291
3292 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003293}
3294
3295PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003296 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003297 const char *errors)
3298{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003299 PyObject *repr = NULL;
3300 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003302#ifdef NEED_RETRY
3303 retry:
3304 if (size > INT_MAX)
3305 ret = encode_mbcs(&repr, p, INT_MAX);
3306 else
3307#endif
3308 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003309
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003310 if (ret < 0) {
3311 Py_XDECREF(repr);
3312 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003313 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003314
3315#ifdef NEED_RETRY
3316 if (size > INT_MAX) {
3317 p += INT_MAX;
3318 size -= INT_MAX;
3319 goto retry;
3320 }
3321#endif
3322
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003323 return repr;
3324}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003325
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003326PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3327{
3328 if (!PyUnicode_Check(unicode)) {
3329 PyErr_BadArgument();
3330 return NULL;
3331 }
3332 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3333 PyUnicode_GET_SIZE(unicode),
3334 NULL);
3335}
3336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003337#undef NEED_RETRY
3338
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003339#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341/* --- Character Mapping Codec -------------------------------------------- */
3342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 PyObject *mapping,
3346 const char *errors)
3347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003349 Py_ssize_t startinpos;
3350 Py_ssize_t endinpos;
3351 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 PyUnicodeObject *v;
3354 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003355 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 PyObject *errorHandler = NULL;
3357 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003358 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003359 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003360
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 /* Default to Latin-1 */
3362 if (mapping == NULL)
3363 return PyUnicode_DecodeLatin1(s, size, errors);
3364
3365 v = _PyUnicode_New(size);
3366 if (v == NULL)
3367 goto onError;
3368 if (size == 0)
3369 return (PyObject *)v;
3370 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003372 if (PyUnicode_CheckExact(mapping)) {
3373 mapstring = PyUnicode_AS_UNICODE(mapping);
3374 maplen = PyUnicode_GET_SIZE(mapping);
3375 while (s < e) {
3376 unsigned char ch = *s;
3377 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003379 if (ch < maplen)
3380 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003382 if (x == 0xfffe) {
3383 /* undefined mapping */
3384 outpos = p-PyUnicode_AS_UNICODE(v);
3385 startinpos = s-starts;
3386 endinpos = startinpos+1;
3387 if (unicode_decode_call_errorhandler(
3388 errors, &errorHandler,
3389 "charmap", "character maps to <undefined>",
3390 starts, size, &startinpos, &endinpos, &exc, &s,
3391 (PyObject **)&v, &outpos, &p)) {
3392 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003393 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003394 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003395 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003396 *p++ = x;
3397 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003399 }
3400 else {
3401 while (s < e) {
3402 unsigned char ch = *s;
3403 PyObject *w, *x;
3404
3405 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3406 w = PyInt_FromLong((long)ch);
3407 if (w == NULL)
3408 goto onError;
3409 x = PyObject_GetItem(mapping, w);
3410 Py_DECREF(w);
3411 if (x == NULL) {
3412 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3413 /* No mapping found means: mapping is undefined. */
3414 PyErr_Clear();
3415 x = Py_None;
3416 Py_INCREF(x);
3417 } else
3418 goto onError;
3419 }
3420
3421 /* Apply mapping */
3422 if (PyInt_Check(x)) {
3423 long value = PyInt_AS_LONG(x);
3424 if (value < 0 || value > 65535) {
3425 PyErr_SetString(PyExc_TypeError,
3426 "character mapping must be in range(65536)");
3427 Py_DECREF(x);
3428 goto onError;
3429 }
3430 *p++ = (Py_UNICODE)value;
3431 }
3432 else if (x == Py_None) {
3433 /* undefined mapping */
3434 outpos = p-PyUnicode_AS_UNICODE(v);
3435 startinpos = s-starts;
3436 endinpos = startinpos+1;
3437 if (unicode_decode_call_errorhandler(
3438 errors, &errorHandler,
3439 "charmap", "character maps to <undefined>",
3440 starts, size, &startinpos, &endinpos, &exc, &s,
3441 (PyObject **)&v, &outpos, &p)) {
3442 Py_DECREF(x);
3443 goto onError;
3444 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003445 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003446 continue;
3447 }
3448 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003449 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003450
3451 if (targetsize == 1)
3452 /* 1-1 mapping */
3453 *p++ = *PyUnicode_AS_UNICODE(x);
3454
3455 else if (targetsize > 1) {
3456 /* 1-n mapping */
3457 if (targetsize > extrachars) {
3458 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003459 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3460 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003461 (targetsize << 2);
3462 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003463 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003464 if (_PyUnicode_Resize(&v,
3465 PyUnicode_GET_SIZE(v) + needed) < 0) {
3466 Py_DECREF(x);
3467 goto onError;
3468 }
3469 p = PyUnicode_AS_UNICODE(v) + oldpos;
3470 }
3471 Py_UNICODE_COPY(p,
3472 PyUnicode_AS_UNICODE(x),
3473 targetsize);
3474 p += targetsize;
3475 extrachars -= targetsize;
3476 }
3477 /* 1-0 mapping: skip the character */
3478 }
3479 else {
3480 /* wrong return value */
3481 PyErr_SetString(PyExc_TypeError,
3482 "character mapping must return integer, None or unicode");
3483 Py_DECREF(x);
3484 goto onError;
3485 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003487 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
3490 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003491 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 Py_XDECREF(errorHandler);
3494 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003496
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 Py_XDECREF(errorHandler);
3499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 Py_XDECREF(v);
3501 return NULL;
3502}
3503
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003504/* Charmap encoding: the lookup table */
3505
3506struct encoding_map{
3507 PyObject_HEAD
3508 unsigned char level1[32];
3509 int count2, count3;
3510 unsigned char level23[1];
3511};
3512
3513static PyObject*
3514encoding_map_size(PyObject *obj, PyObject* args)
3515{
3516 struct encoding_map *map = (struct encoding_map*)obj;
3517 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3518 128*map->count3);
3519}
3520
3521static PyMethodDef encoding_map_methods[] = {
3522 {"size", encoding_map_size, METH_NOARGS,
3523 PyDoc_STR("Return the size (in bytes) of this object") },
3524 { 0 }
3525};
3526
3527static void
3528encoding_map_dealloc(PyObject* o)
3529{
3530 PyObject_FREE(o);
3531}
3532
3533static PyTypeObject EncodingMapType = {
3534 PyObject_HEAD_INIT(NULL)
3535 0, /*ob_size*/
3536 "EncodingMap", /*tp_name*/
3537 sizeof(struct encoding_map), /*tp_basicsize*/
3538 0, /*tp_itemsize*/
3539 /* methods */
3540 encoding_map_dealloc, /*tp_dealloc*/
3541 0, /*tp_print*/
3542 0, /*tp_getattr*/
3543 0, /*tp_setattr*/
3544 0, /*tp_compare*/
3545 0, /*tp_repr*/
3546 0, /*tp_as_number*/
3547 0, /*tp_as_sequence*/
3548 0, /*tp_as_mapping*/
3549 0, /*tp_hash*/
3550 0, /*tp_call*/
3551 0, /*tp_str*/
3552 0, /*tp_getattro*/
3553 0, /*tp_setattro*/
3554 0, /*tp_as_buffer*/
3555 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3556 0, /*tp_doc*/
3557 0, /*tp_traverse*/
3558 0, /*tp_clear*/
3559 0, /*tp_richcompare*/
3560 0, /*tp_weaklistoffset*/
3561 0, /*tp_iter*/
3562 0, /*tp_iternext*/
3563 encoding_map_methods, /*tp_methods*/
3564 0, /*tp_members*/
3565 0, /*tp_getset*/
3566 0, /*tp_base*/
3567 0, /*tp_dict*/
3568 0, /*tp_descr_get*/
3569 0, /*tp_descr_set*/
3570 0, /*tp_dictoffset*/
3571 0, /*tp_init*/
3572 0, /*tp_alloc*/
3573 0, /*tp_new*/
3574 0, /*tp_free*/
3575 0, /*tp_is_gc*/
3576};
3577
3578PyObject*
3579PyUnicode_BuildEncodingMap(PyObject* string)
3580{
3581 Py_UNICODE *decode;
3582 PyObject *result;
3583 struct encoding_map *mresult;
3584 int i;
3585 int need_dict = 0;
3586 unsigned char level1[32];
3587 unsigned char level2[512];
3588 unsigned char *mlevel1, *mlevel2, *mlevel3;
3589 int count2 = 0, count3 = 0;
3590
3591 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3592 PyErr_BadArgument();
3593 return NULL;
3594 }
3595 decode = PyUnicode_AS_UNICODE(string);
3596 memset(level1, 0xFF, sizeof level1);
3597 memset(level2, 0xFF, sizeof level2);
3598
3599 /* If there isn't a one-to-one mapping of NULL to \0,
3600 or if there are non-BMP characters, we need to use
3601 a mapping dictionary. */
3602 if (decode[0] != 0)
3603 need_dict = 1;
3604 for (i = 1; i < 256; i++) {
3605 int l1, l2;
3606 if (decode[i] == 0
3607 #ifdef Py_UNICODE_WIDE
3608 || decode[i] > 0xFFFF
3609 #endif
3610 ) {
3611 need_dict = 1;
3612 break;
3613 }
3614 if (decode[i] == 0xFFFE)
3615 /* unmapped character */
3616 continue;
3617 l1 = decode[i] >> 11;
3618 l2 = decode[i] >> 7;
3619 if (level1[l1] == 0xFF)
3620 level1[l1] = count2++;
3621 if (level2[l2] == 0xFF)
3622 level2[l2] = count3++;
3623 }
3624
3625 if (count2 >= 0xFF || count3 >= 0xFF)
3626 need_dict = 1;
3627
3628 if (need_dict) {
3629 PyObject *result = PyDict_New();
3630 PyObject *key, *value;
3631 if (!result)
3632 return NULL;
3633 for (i = 0; i < 256; i++) {
3634 key = value = NULL;
3635 key = PyInt_FromLong(decode[i]);
3636 value = PyInt_FromLong(i);
3637 if (!key || !value)
3638 goto failed1;
3639 if (PyDict_SetItem(result, key, value) == -1)
3640 goto failed1;
3641 Py_DECREF(key);
3642 Py_DECREF(value);
3643 }
3644 return result;
3645 failed1:
3646 Py_XDECREF(key);
3647 Py_XDECREF(value);
3648 Py_DECREF(result);
3649 return NULL;
3650 }
3651
3652 /* Create a three-level trie */
3653 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3654 16*count2 + 128*count3 - 1);
3655 if (!result)
3656 return PyErr_NoMemory();
3657 PyObject_Init(result, &EncodingMapType);
3658 mresult = (struct encoding_map*)result;
3659 mresult->count2 = count2;
3660 mresult->count3 = count3;
3661 mlevel1 = mresult->level1;
3662 mlevel2 = mresult->level23;
3663 mlevel3 = mresult->level23 + 16*count2;
3664 memcpy(mlevel1, level1, 32);
3665 memset(mlevel2, 0xFF, 16*count2);
3666 memset(mlevel3, 0, 128*count3);
3667 count3 = 0;
3668 for (i = 1; i < 256; i++) {
3669 int o1, o2, o3, i2, i3;
3670 if (decode[i] == 0xFFFE)
3671 /* unmapped character */
3672 continue;
3673 o1 = decode[i]>>11;
3674 o2 = (decode[i]>>7) & 0xF;
3675 i2 = 16*mlevel1[o1] + o2;
3676 if (mlevel2[i2] == 0xFF)
3677 mlevel2[i2] = count3++;
3678 o3 = decode[i] & 0x7F;
3679 i3 = 128*mlevel2[i2] + o3;
3680 mlevel3[i3] = i;
3681 }
3682 return result;
3683}
3684
3685static int
3686encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3687{
3688 struct encoding_map *map = (struct encoding_map*)mapping;
3689 int l1 = c>>11;
3690 int l2 = (c>>7) & 0xF;
3691 int l3 = c & 0x7F;
3692 int i;
3693
3694#ifdef Py_UNICODE_WIDE
3695 if (c > 0xFFFF) {
3696 return -1;
3697 }
3698#endif
3699 if (c == 0)
3700 return 0;
3701 /* level 1*/
3702 i = map->level1[l1];
3703 if (i == 0xFF) {
3704 return -1;
3705 }
3706 /* level 2*/
3707 i = map->level23[16*i+l2];
3708 if (i == 0xFF) {
3709 return -1;
3710 }
3711 /* level 3 */
3712 i = map->level23[16*map->count2 + 128*i + l3];
3713 if (i == 0) {
3714 return -1;
3715 }
3716 return i;
3717}
3718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719/* Lookup the character ch in the mapping. If the character
3720 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003721 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 PyObject *w = PyInt_FromLong((long)c);
3725 PyObject *x;
3726
3727 if (w == NULL)
3728 return NULL;
3729 x = PyObject_GetItem(mapping, w);
3730 Py_DECREF(w);
3731 if (x == NULL) {
3732 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3733 /* No mapping found means: mapping is undefined. */
3734 PyErr_Clear();
3735 x = Py_None;
3736 Py_INCREF(x);
3737 return x;
3738 } else
3739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003741 else if (x == Py_None)
3742 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 else if (PyInt_Check(x)) {
3744 long value = PyInt_AS_LONG(x);
3745 if (value < 0 || value > 255) {
3746 PyErr_SetString(PyExc_TypeError,
3747 "character mapping must be in range(256)");
3748 Py_DECREF(x);
3749 return NULL;
3750 }
3751 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 else if (PyString_Check(x))
3754 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003757 PyErr_Format(PyExc_TypeError,
3758 "character mapping must return integer, None or str8, not %.400s",
3759 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 Py_DECREF(x);
3761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 }
3763}
3764
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003765static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003766charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003767{
Walter Dörwald827b0552007-05-12 13:23:53 +00003768 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003769 /* exponentially overallocate to minimize reallocations */
3770 if (requiredsize < 2*outsize)
3771 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003772 if (PyBytes_Resize(outobj, requiredsize)) {
3773 Py_DECREF(outobj);
3774 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003775 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003776 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003777}
3778
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003783 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 space is available. Return a new reference to the object that
3785 was put in the output buffer, or Py_None, if the mapping was undefined
3786 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003787 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003789charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003790 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003792 PyObject *rep;
3793 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003794 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003796 if (mapping->ob_type == &EncodingMapType) {
3797 int res = encoding_map_lookup(c, mapping);
3798 Py_ssize_t requiredsize = *outpos+1;
3799 if (res == -1)
3800 return enc_FAILED;
3801 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003802 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003803 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003804 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003805 outstart[(*outpos)++] = (char)res;
3806 return enc_SUCCESS;
3807 }
3808
3809 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003811 return enc_EXCEPTION;
3812 else if (rep==Py_None) {
3813 Py_DECREF(rep);
3814 return enc_FAILED;
3815 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003818 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003819 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003821 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003823 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3825 }
3826 else {
3827 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3829 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003830 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003831 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003833 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003835 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 memcpy(outstart + *outpos, repchars, repsize);
3837 *outpos += repsize;
3838 }
3839 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003840 Py_DECREF(rep);
3841 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842}
3843
3844/* handle an error in PyUnicode_EncodeCharmap
3845 Return 0 on success, -1 on error */
3846static
3847int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003850 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003851 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852{
3853 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854 Py_ssize_t repsize;
3855 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 Py_UNICODE *uni2;
3857 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003858 Py_ssize_t collstartpos = *inpos;
3859 Py_ssize_t collendpos = *inpos+1;
3860 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 char *encoding = "charmap";
3862 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003863 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 /* find all unencodable characters */
3866 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003867 PyObject *rep;
3868 if (mapping->ob_type == &EncodingMapType) {
3869 int res = encoding_map_lookup(p[collendpos], mapping);
3870 if (res != -1)
3871 break;
3872 ++collendpos;
3873 continue;
3874 }
3875
3876 rep = charmapencode_lookup(p[collendpos], mapping);
3877 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003879 else if (rep!=Py_None) {
3880 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 break;
3882 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003883 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 ++collendpos;
3885 }
3886 /* cache callback name lookup
3887 * (if not done yet, i.e. it's the first error) */
3888 if (*known_errorHandler==-1) {
3889 if ((errors==NULL) || (!strcmp(errors, "strict")))
3890 *known_errorHandler = 1;
3891 else if (!strcmp(errors, "replace"))
3892 *known_errorHandler = 2;
3893 else if (!strcmp(errors, "ignore"))
3894 *known_errorHandler = 3;
3895 else if (!strcmp(errors, "xmlcharrefreplace"))
3896 *known_errorHandler = 4;
3897 else
3898 *known_errorHandler = 0;
3899 }
3900 switch (*known_errorHandler) {
3901 case 1: /* strict */
3902 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3903 return -1;
3904 case 2: /* replace */
3905 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3906 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003907 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 return -1;
3909 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003910 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3912 return -1;
3913 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 }
3915 /* fall through */
3916 case 3: /* ignore */
3917 *inpos = collendpos;
3918 break;
3919 case 4: /* xmlcharrefreplace */
3920 /* generate replacement (temporarily (mis)uses p) */
3921 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3922 char buffer[2+29+1+1];
3923 char *cp;
3924 sprintf(buffer, "&#%d;", (int)p[collpos]);
3925 for (cp = buffer; *cp; ++cp) {
3926 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003927 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003929 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3931 return -1;
3932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 }
3934 }
3935 *inpos = collendpos;
3936 break;
3937 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003938 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 encoding, reason, p, size, exceptionObject,
3940 collstartpos, collendpos, &newpos);
3941 if (repunicode == NULL)
3942 return -1;
3943 /* generate replacement */
3944 repsize = PyUnicode_GET_SIZE(repunicode);
3945 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3946 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003947 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 return -1;
3949 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003950 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3953 return -1;
3954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 }
3956 *inpos = newpos;
3957 Py_DECREF(repunicode);
3958 }
3959 return 0;
3960}
3961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 PyObject *mapping,
3965 const char *errors)
3966{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 /* output object */
3968 PyObject *res = NULL;
3969 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003970 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003972 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 PyObject *errorHandler = NULL;
3974 PyObject *exc = NULL;
3975 /* the following variable is used for caching string comparisons
3976 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3977 * 3=ignore, 4=xmlcharrefreplace */
3978 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979
3980 /* Default to Latin-1 */
3981 if (mapping == NULL)
3982 return PyUnicode_EncodeLatin1(p, size, errors);
3983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 /* allocate enough for a simple encoding without
3985 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00003986 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 if (res == NULL)
3988 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003989 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 while (inpos<size) {
3993 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00003994 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003995 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 if (charmap_encoding_error(p, size, &inpos, mapping,
3999 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004000 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004001 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004002 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 else
4006 /* done with this character => adjust input position */
4007 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004011 if (respos<PyBytes_GET_SIZE(res)) {
4012 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 goto onError;
4014 }
4015 Py_XDECREF(exc);
4016 Py_XDECREF(errorHandler);
4017 return res;
4018
4019 onError:
4020 Py_XDECREF(res);
4021 Py_XDECREF(exc);
4022 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 return NULL;
4024}
4025
4026PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4027 PyObject *mapping)
4028{
4029 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4030 PyErr_BadArgument();
4031 return NULL;
4032 }
4033 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4034 PyUnicode_GET_SIZE(unicode),
4035 mapping,
4036 NULL);
4037}
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039/* create or adjust a UnicodeTranslateError */
4040static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004041 const Py_UNICODE *unicode, Py_ssize_t size,
4042 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 if (*exceptionObject == NULL) {
4046 *exceptionObject = PyUnicodeTranslateError_Create(
4047 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 }
4049 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4051 goto onError;
4052 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4053 goto onError;
4054 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4055 goto onError;
4056 return;
4057 onError:
4058 Py_DECREF(*exceptionObject);
4059 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 }
4061}
4062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063/* raises a UnicodeTranslateError */
4064static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 const Py_UNICODE *unicode, Py_ssize_t size,
4066 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 const char *reason)
4068{
4069 make_translate_exception(exceptionObject,
4070 unicode, size, startpos, endpos, reason);
4071 if (*exceptionObject != NULL)
4072 PyCodec_StrictErrors(*exceptionObject);
4073}
4074
4075/* error handling callback helper:
4076 build arguments, call the callback and check the arguments,
4077 put the result into newpos and return the replacement string, which
4078 has to be freed by the caller */
4079static PyObject *unicode_translate_call_errorhandler(const char *errors,
4080 PyObject **errorHandler,
4081 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004082 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4083 Py_ssize_t startpos, Py_ssize_t endpos,
4084 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004086 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004088 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 PyObject *restuple;
4090 PyObject *resunicode;
4091
4092 if (*errorHandler == NULL) {
4093 *errorHandler = PyCodec_LookupError(errors);
4094 if (*errorHandler == NULL)
4095 return NULL;
4096 }
4097
4098 make_translate_exception(exceptionObject,
4099 unicode, size, startpos, endpos, reason);
4100 if (*exceptionObject == NULL)
4101 return NULL;
4102
4103 restuple = PyObject_CallFunctionObjArgs(
4104 *errorHandler, *exceptionObject, NULL);
4105 if (restuple == NULL)
4106 return NULL;
4107 if (!PyTuple_Check(restuple)) {
4108 PyErr_Format(PyExc_TypeError, &argparse[4]);
4109 Py_DECREF(restuple);
4110 return NULL;
4111 }
4112 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004113 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 Py_DECREF(restuple);
4115 return NULL;
4116 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004117 if (i_newpos<0)
4118 *newpos = size+i_newpos;
4119 else
4120 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004121 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004122 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004123 Py_DECREF(restuple);
4124 return NULL;
4125 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 Py_INCREF(resunicode);
4127 Py_DECREF(restuple);
4128 return resunicode;
4129}
4130
4131/* Lookup the character ch in the mapping and put the result in result,
4132 which must be decrefed by the caller.
4133 Return 0 on success, -1 on error */
4134static
4135int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4136{
4137 PyObject *w = PyInt_FromLong((long)c);
4138 PyObject *x;
4139
4140 if (w == NULL)
4141 return -1;
4142 x = PyObject_GetItem(mapping, w);
4143 Py_DECREF(w);
4144 if (x == NULL) {
4145 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4146 /* No mapping found means: use 1:1 mapping. */
4147 PyErr_Clear();
4148 *result = NULL;
4149 return 0;
4150 } else
4151 return -1;
4152 }
4153 else if (x == Py_None) {
4154 *result = x;
4155 return 0;
4156 }
4157 else if (PyInt_Check(x)) {
4158 long value = PyInt_AS_LONG(x);
4159 long max = PyUnicode_GetMax();
4160 if (value < 0 || value > max) {
4161 PyErr_Format(PyExc_TypeError,
4162 "character mapping must be in range(0x%lx)", max+1);
4163 Py_DECREF(x);
4164 return -1;
4165 }
4166 *result = x;
4167 return 0;
4168 }
4169 else if (PyUnicode_Check(x)) {
4170 *result = x;
4171 return 0;
4172 }
4173 else {
4174 /* wrong return value */
4175 PyErr_SetString(PyExc_TypeError,
4176 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004177 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 return -1;
4179 }
4180}
4181/* ensure that *outobj is at least requiredsize characters long,
4182if not reallocate and adjust various state variables.
4183Return 0 on success, -1 on error */
4184static
Walter Dörwald4894c302003-10-24 14:25:28 +00004185int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004189 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004191 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004193 if (requiredsize < 2 * oldsize)
4194 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004195 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 return -1;
4197 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 }
4199 return 0;
4200}
4201/* lookup the character, put the result in the output string and adjust
4202 various state variables. Return a new reference to the object that
4203 was put in the output buffer in *result, or Py_None, if the mapping was
4204 undefined (in which case no character was written).
4205 The called must decref result.
4206 Return 0 on success, -1 on error. */
4207static
Walter Dörwald4894c302003-10-24 14:25:28 +00004208int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004209 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004210 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211{
Walter Dörwald4894c302003-10-24 14:25:28 +00004212 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 return -1;
4214 if (*res==NULL) {
4215 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004216 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 }
4218 else if (*res==Py_None)
4219 ;
4220 else if (PyInt_Check(*res)) {
4221 /* no overflow check, because we know that the space is enough */
4222 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4223 }
4224 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004225 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 if (repsize==1) {
4227 /* no overflow check, because we know that the space is enough */
4228 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4229 }
4230 else if (repsize!=0) {
4231 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004233 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004234 repsize - 1;
4235 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 return -1;
4237 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4238 *outp += repsize;
4239 }
4240 }
4241 else
4242 return -1;
4243 return 0;
4244}
4245
4246PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004247 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248 PyObject *mapping,
4249 const char *errors)
4250{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 /* output object */
4252 PyObject *res = NULL;
4253 /* pointers to the beginning and end+1 of input */
4254 const Py_UNICODE *startp = p;
4255 const Py_UNICODE *endp = p + size;
4256 /* pointer into the output */
4257 Py_UNICODE *str;
4258 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004259 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 char *reason = "character maps to <undefined>";
4261 PyObject *errorHandler = NULL;
4262 PyObject *exc = NULL;
4263 /* the following variable is used for caching string comparisons
4264 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4265 * 3=ignore, 4=xmlcharrefreplace */
4266 int known_errorHandler = -1;
4267
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 if (mapping == NULL) {
4269 PyErr_BadArgument();
4270 return NULL;
4271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272
4273 /* allocate enough for a simple 1:1 translation without
4274 replacements, if we need more, we'll resize */
4275 res = PyUnicode_FromUnicode(NULL, size);
4276 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 return res;
4280 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 while (p<endp) {
4283 /* try to encode it */
4284 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004285 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 goto onError;
4288 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004289 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 if (x!=Py_None) /* it worked => adjust input pointer */
4291 ++p;
4292 else { /* untranslatable character */
4293 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004294 Py_ssize_t repsize;
4295 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_UNICODE *uni2;
4297 /* startpos for collecting untranslatable chars */
4298 const Py_UNICODE *collstart = p;
4299 const Py_UNICODE *collend = p+1;
4300 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 /* find all untranslatable characters */
4303 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004304 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 goto onError;
4306 Py_XDECREF(x);
4307 if (x!=Py_None)
4308 break;
4309 ++collend;
4310 }
4311 /* cache callback name lookup
4312 * (if not done yet, i.e. it's the first error) */
4313 if (known_errorHandler==-1) {
4314 if ((errors==NULL) || (!strcmp(errors, "strict")))
4315 known_errorHandler = 1;
4316 else if (!strcmp(errors, "replace"))
4317 known_errorHandler = 2;
4318 else if (!strcmp(errors, "ignore"))
4319 known_errorHandler = 3;
4320 else if (!strcmp(errors, "xmlcharrefreplace"))
4321 known_errorHandler = 4;
4322 else
4323 known_errorHandler = 0;
4324 }
4325 switch (known_errorHandler) {
4326 case 1: /* strict */
4327 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4328 goto onError;
4329 case 2: /* replace */
4330 /* No need to check for space, this is a 1:1 replacement */
4331 for (coll = collstart; coll<collend; ++coll)
4332 *str++ = '?';
4333 /* fall through */
4334 case 3: /* ignore */
4335 p = collend;
4336 break;
4337 case 4: /* xmlcharrefreplace */
4338 /* generate replacement (temporarily (mis)uses p) */
4339 for (p = collstart; p < collend; ++p) {
4340 char buffer[2+29+1+1];
4341 char *cp;
4342 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004343 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4345 goto onError;
4346 for (cp = buffer; *cp; ++cp)
4347 *str++ = *cp;
4348 }
4349 p = collend;
4350 break;
4351 default:
4352 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4353 reason, startp, size, &exc,
4354 collstart-startp, collend-startp, &newpos);
4355 if (repunicode == NULL)
4356 goto onError;
4357 /* generate replacement */
4358 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004359 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4361 Py_DECREF(repunicode);
4362 goto onError;
4363 }
4364 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4365 *str++ = *uni2;
4366 p = startp + newpos;
4367 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 }
4369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 /* Resize if we allocated to much */
4372 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004373 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004374 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 }
4377 Py_XDECREF(exc);
4378 Py_XDECREF(errorHandler);
4379 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 onError:
4382 Py_XDECREF(res);
4383 Py_XDECREF(exc);
4384 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 return NULL;
4386}
4387
4388PyObject *PyUnicode_Translate(PyObject *str,
4389 PyObject *mapping,
4390 const char *errors)
4391{
4392 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004393
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 str = PyUnicode_FromObject(str);
4395 if (str == NULL)
4396 goto onError;
4397 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4398 PyUnicode_GET_SIZE(str),
4399 mapping,
4400 errors);
4401 Py_DECREF(str);
4402 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004403
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 onError:
4405 Py_XDECREF(str);
4406 return NULL;
4407}
Tim Petersced69f82003-09-16 20:30:58 +00004408
Guido van Rossum9e896b32000-04-05 20:11:21 +00004409/* --- Decimal Encoder ---------------------------------------------------- */
4410
4411int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004413 char *output,
4414 const char *errors)
4415{
4416 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 PyObject *errorHandler = NULL;
4418 PyObject *exc = NULL;
4419 const char *encoding = "decimal";
4420 const char *reason = "invalid decimal Unicode string";
4421 /* the following variable is used for caching string comparisons
4422 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4423 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004424
4425 if (output == NULL) {
4426 PyErr_BadArgument();
4427 return -1;
4428 }
4429
4430 p = s;
4431 end = s + length;
4432 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004434 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t repsize;
4437 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 Py_UNICODE *uni2;
4439 Py_UNICODE *collstart;
4440 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004441
Guido van Rossum9e896b32000-04-05 20:11:21 +00004442 if (Py_UNICODE_ISSPACE(ch)) {
4443 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004445 continue;
4446 }
4447 decimal = Py_UNICODE_TODECIMAL(ch);
4448 if (decimal >= 0) {
4449 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004451 continue;
4452 }
Guido van Rossumba477042000-04-06 18:18:10 +00004453 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004454 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004456 continue;
4457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 /* All other characters are considered unencodable */
4459 collstart = p;
4460 collend = p+1;
4461 while (collend < end) {
4462 if ((0 < *collend && *collend < 256) ||
4463 !Py_UNICODE_ISSPACE(*collend) ||
4464 Py_UNICODE_TODECIMAL(*collend))
4465 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004466 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 /* cache callback name lookup
4468 * (if not done yet, i.e. it's the first error) */
4469 if (known_errorHandler==-1) {
4470 if ((errors==NULL) || (!strcmp(errors, "strict")))
4471 known_errorHandler = 1;
4472 else if (!strcmp(errors, "replace"))
4473 known_errorHandler = 2;
4474 else if (!strcmp(errors, "ignore"))
4475 known_errorHandler = 3;
4476 else if (!strcmp(errors, "xmlcharrefreplace"))
4477 known_errorHandler = 4;
4478 else
4479 known_errorHandler = 0;
4480 }
4481 switch (known_errorHandler) {
4482 case 1: /* strict */
4483 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4484 goto onError;
4485 case 2: /* replace */
4486 for (p = collstart; p < collend; ++p)
4487 *output++ = '?';
4488 /* fall through */
4489 case 3: /* ignore */
4490 p = collend;
4491 break;
4492 case 4: /* xmlcharrefreplace */
4493 /* generate replacement (temporarily (mis)uses p) */
4494 for (p = collstart; p < collend; ++p)
4495 output += sprintf(output, "&#%d;", (int)*p);
4496 p = collend;
4497 break;
4498 default:
4499 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4500 encoding, reason, s, length, &exc,
4501 collstart-s, collend-s, &newpos);
4502 if (repunicode == NULL)
4503 goto onError;
4504 /* generate replacement */
4505 repsize = PyUnicode_GET_SIZE(repunicode);
4506 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4507 Py_UNICODE ch = *uni2;
4508 if (Py_UNICODE_ISSPACE(ch))
4509 *output++ = ' ';
4510 else {
4511 decimal = Py_UNICODE_TODECIMAL(ch);
4512 if (decimal >= 0)
4513 *output++ = '0' + decimal;
4514 else if (0 < ch && ch < 256)
4515 *output++ = (char)ch;
4516 else {
4517 Py_DECREF(repunicode);
4518 raise_encode_exception(&exc, encoding,
4519 s, length, collstart-s, collend-s, reason);
4520 goto onError;
4521 }
4522 }
4523 }
4524 p = s + newpos;
4525 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004526 }
4527 }
4528 /* 0-terminate the output string */
4529 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_XDECREF(exc);
4531 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004532 return 0;
4533
4534 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 Py_XDECREF(exc);
4536 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004537 return -1;
4538}
4539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540/* --- Helpers ------------------------------------------------------------ */
4541
Thomas Wouters477c8d52006-05-27 19:21:47 +00004542#define STRINGLIB_CHAR Py_UNICODE
4543
4544#define STRINGLIB_LEN PyUnicode_GET_SIZE
4545#define STRINGLIB_NEW PyUnicode_FromUnicode
4546#define STRINGLIB_STR PyUnicode_AS_UNICODE
4547
4548Py_LOCAL_INLINE(int)
4549STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004551 if (str[0] != other[0])
4552 return 1;
4553 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554}
4555
Thomas Wouters477c8d52006-05-27 19:21:47 +00004556#define STRINGLIB_EMPTY unicode_empty
4557
4558#include "stringlib/fastsearch.h"
4559
4560#include "stringlib/count.h"
4561#include "stringlib/find.h"
4562#include "stringlib/partition.h"
4563
/* helper macro to fixup start/end slice values.

   Operates on variables named `start` and `end` in the calling scope and
   on (obj)->length: a negative `start` is taken relative to the length and
   then clamped to 0; `end` is capped at the length, a negative `end` is
   taken relative to the length and then clamped to 0.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct inside an unbraced if/else; invoke it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4576
Martin v. Löwis18e16552006-02-15 17:27:45 +00004577Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004578 PyObject *substr,
4579 Py_ssize_t start,
4580 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004582 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004583 PyUnicodeObject* str_obj;
4584 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004585
Thomas Wouters477c8d52006-05-27 19:21:47 +00004586 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4587 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004589 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4590 if (!sub_obj) {
4591 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 return -1;
4593 }
Tim Petersced69f82003-09-16 20:30:58 +00004594
Thomas Wouters477c8d52006-05-27 19:21:47 +00004595 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004596
Thomas Wouters477c8d52006-05-27 19:21:47 +00004597 result = stringlib_count(
4598 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4599 );
4600
4601 Py_DECREF(sub_obj);
4602 Py_DECREF(str_obj);
4603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 return result;
4605}
4606
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004608 PyObject *sub,
4609 Py_ssize_t start,
4610 Py_ssize_t end,
4611 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004613 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004614
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004616 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004617 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004618 sub = PyUnicode_FromObject(sub);
4619 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004620 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004621 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 }
Tim Petersced69f82003-09-16 20:30:58 +00004623
Thomas Wouters477c8d52006-05-27 19:21:47 +00004624 if (direction > 0)
4625 result = stringlib_find_slice(
4626 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4627 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4628 start, end
4629 );
4630 else
4631 result = stringlib_rfind_slice(
4632 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4633 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4634 start, end
4635 );
4636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004638 Py_DECREF(sub);
4639
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 return result;
4641}
4642
Tim Petersced69f82003-09-16 20:30:58 +00004643static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644int tailmatch(PyUnicodeObject *self,
4645 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 Py_ssize_t start,
4647 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 int direction)
4649{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 if (substring->length == 0)
4651 return 1;
4652
Thomas Wouters477c8d52006-05-27 19:21:47 +00004653 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654
4655 end -= substring->length;
4656 if (end < start)
4657 return 0;
4658
4659 if (direction > 0) {
4660 if (Py_UNICODE_MATCH(self, end, substring))
4661 return 1;
4662 } else {
4663 if (Py_UNICODE_MATCH(self, start, substring))
4664 return 1;
4665 }
4666
4667 return 0;
4668}
4669
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004672 Py_ssize_t start,
4673 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 int direction)
4675{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004677
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 str = PyUnicode_FromObject(str);
4679 if (str == NULL)
4680 return -1;
4681 substr = PyUnicode_FromObject(substr);
4682 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004683 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 return -1;
4685 }
Tim Petersced69f82003-09-16 20:30:58 +00004686
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 result = tailmatch((PyUnicodeObject *)str,
4688 (PyUnicodeObject *)substr,
4689 start, end, direction);
4690 Py_DECREF(str);
4691 Py_DECREF(substr);
4692 return result;
4693}
4694
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695/* Apply fixfct filter to the Unicode object self and return a
4696 reference to the modified object */
4697
Tim Petersced69f82003-09-16 20:30:58 +00004698static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699PyObject *fixup(PyUnicodeObject *self,
4700 int (*fixfct)(PyUnicodeObject *s))
4701{
4702
4703 PyUnicodeObject *u;
4704
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004705 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 if (u == NULL)
4707 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004708
4709 Py_UNICODE_COPY(u->str, self->str, self->length);
4710
Tim Peters7a29bd52001-09-12 03:03:31 +00004711 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 /* fixfct should return TRUE if it modified the buffer. If
4713 FALSE, return a reference to the original buffer instead
4714 (to save space, not time) */
4715 Py_INCREF(self);
4716 Py_DECREF(u);
4717 return (PyObject*) self;
4718 }
4719 return (PyObject*) u;
4720}
4721
Tim Petersced69f82003-09-16 20:30:58 +00004722static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723int fixupper(PyUnicodeObject *self)
4724{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004725 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 Py_UNICODE *s = self->str;
4727 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 while (len-- > 0) {
4730 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004731
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 ch = Py_UNICODE_TOUPPER(*s);
4733 if (ch != *s) {
4734 status = 1;
4735 *s = ch;
4736 }
4737 s++;
4738 }
4739
4740 return status;
4741}
4742
Tim Petersced69f82003-09-16 20:30:58 +00004743static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744int fixlower(PyUnicodeObject *self)
4745{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 Py_UNICODE *s = self->str;
4748 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 while (len-- > 0) {
4751 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004752
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 ch = Py_UNICODE_TOLOWER(*s);
4754 if (ch != *s) {
4755 status = 1;
4756 *s = ch;
4757 }
4758 s++;
4759 }
4760
4761 return status;
4762}
4763
Tim Petersced69f82003-09-16 20:30:58 +00004764static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765int fixswapcase(PyUnicodeObject *self)
4766{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 Py_UNICODE *s = self->str;
4769 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004770
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 while (len-- > 0) {
4772 if (Py_UNICODE_ISUPPER(*s)) {
4773 *s = Py_UNICODE_TOLOWER(*s);
4774 status = 1;
4775 } else if (Py_UNICODE_ISLOWER(*s)) {
4776 *s = Py_UNICODE_TOUPPER(*s);
4777 status = 1;
4778 }
4779 s++;
4780 }
4781
4782 return status;
4783}
4784
Tim Petersced69f82003-09-16 20:30:58 +00004785static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786int fixcapitalize(PyUnicodeObject *self)
4787{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004789 Py_UNICODE *s = self->str;
4790 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004791
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004792 if (len == 0)
4793 return 0;
4794 if (Py_UNICODE_ISLOWER(*s)) {
4795 *s = Py_UNICODE_TOUPPER(*s);
4796 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004798 s++;
4799 while (--len > 0) {
4800 if (Py_UNICODE_ISUPPER(*s)) {
4801 *s = Py_UNICODE_TOLOWER(*s);
4802 status = 1;
4803 }
4804 s++;
4805 }
4806 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807}
4808
4809static
4810int fixtitle(PyUnicodeObject *self)
4811{
4812 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4813 register Py_UNICODE *e;
4814 int previous_is_cased;
4815
4816 /* Shortcut for single character strings */
4817 if (PyUnicode_GET_SIZE(self) == 1) {
4818 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4819 if (*p != ch) {
4820 *p = ch;
4821 return 1;
4822 }
4823 else
4824 return 0;
4825 }
Tim Petersced69f82003-09-16 20:30:58 +00004826
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 e = p + PyUnicode_GET_SIZE(self);
4828 previous_is_cased = 0;
4829 for (; p < e; p++) {
4830 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 if (previous_is_cased)
4833 *p = Py_UNICODE_TOLOWER(ch);
4834 else
4835 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004836
4837 if (Py_UNICODE_ISLOWER(ch) ||
4838 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 Py_UNICODE_ISTITLE(ch))
4840 previous_is_cased = 1;
4841 else
4842 previous_is_cased = 0;
4843 }
4844 return 1;
4845}
4846
Tim Peters8ce9f162004-08-27 01:49:32 +00004847PyObject *
4848PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
Tim Peters8ce9f162004-08-27 01:49:32 +00004850 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004851 const Py_UNICODE blank = ' ';
4852 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004853 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004854 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004855 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4856 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004857 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4858 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004859 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004860 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004861 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
Tim Peters05eba1f2004-08-27 21:32:02 +00004863 fseq = PySequence_Fast(seq, "");
4864 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004865 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004866 }
4867
Tim Peters91879ab2004-08-27 22:35:44 +00004868 /* Grrrr. A codec may be invoked to convert str objects to
4869 * Unicode, and so it's possible to call back into Python code
4870 * during PyUnicode_FromObject(), and so it's possible for a sick
4871 * codec to change the size of fseq (if seq is a list). Therefore
4872 * we have to keep refetching the size -- can't assume seqlen
4873 * is invariant.
4874 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004875 seqlen = PySequence_Fast_GET_SIZE(fseq);
4876 /* If empty sequence, return u"". */
4877 if (seqlen == 0) {
4878 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4879 goto Done;
4880 }
4881 /* If singleton sequence with an exact Unicode, return that. */
4882 if (seqlen == 1) {
4883 item = PySequence_Fast_GET_ITEM(fseq, 0);
4884 if (PyUnicode_CheckExact(item)) {
4885 Py_INCREF(item);
4886 res = (PyUnicodeObject *)item;
4887 goto Done;
4888 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004889 }
4890
Tim Peters05eba1f2004-08-27 21:32:02 +00004891 /* At least two items to join, or one that isn't exact Unicode. */
4892 if (seqlen > 1) {
4893 /* Set up sep and seplen -- they're needed. */
4894 if (separator == NULL) {
4895 sep = &blank;
4896 seplen = 1;
4897 }
4898 else {
4899 internal_separator = PyUnicode_FromObject(separator);
4900 if (internal_separator == NULL)
4901 goto onError;
4902 sep = PyUnicode_AS_UNICODE(internal_separator);
4903 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004904 /* In case PyUnicode_FromObject() mutated seq. */
4905 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004906 }
4907 }
4908
4909 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004910 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004911 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004912 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004913 res_p = PyUnicode_AS_UNICODE(res);
4914 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004915
Tim Peters05eba1f2004-08-27 21:32:02 +00004916 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004917 Py_ssize_t itemlen;
4918 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004919
4920 item = PySequence_Fast_GET_ITEM(fseq, i);
4921 /* Convert item to Unicode. */
4922 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4923 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004924 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004925 " %.80s found",
4926 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004927 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004928 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004929 item = PyUnicode_FromObject(item);
4930 if (item == NULL)
4931 goto onError;
4932 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004933
Tim Peters91879ab2004-08-27 22:35:44 +00004934 /* In case PyUnicode_FromObject() mutated seq. */
4935 seqlen = PySequence_Fast_GET_SIZE(fseq);
4936
Tim Peters8ce9f162004-08-27 01:49:32 +00004937 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004939 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004940 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004941 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004942 if (i < seqlen - 1) {
4943 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004944 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004945 goto Overflow;
4946 }
4947 if (new_res_used > res_alloc) {
4948 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004949 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004950 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004951 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004952 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004953 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004954 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004955 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004957 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004958 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004960
4961 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004962 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004963 res_p += itemlen;
4964 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004965 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004966 res_p += seplen;
4967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004969 res_used = new_res_used;
4970 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004971
Tim Peters05eba1f2004-08-27 21:32:02 +00004972 /* Shrink res to match the used area; this probably can't fail,
4973 * but it's cheap to check.
4974 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004975 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004976 goto onError;
4977
4978 Done:
4979 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004980 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 return (PyObject *)res;
4982
Tim Peters8ce9f162004-08-27 01:49:32 +00004983 Overflow:
4984 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004986 Py_DECREF(item);
4987 /* fall through */
4988
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004990 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004991 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004992 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 return NULL;
4994}
4995
Tim Petersced69f82003-09-16 20:30:58 +00004996static
4997PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004998 Py_ssize_t left,
4999 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 Py_UNICODE fill)
5001{
5002 PyUnicodeObject *u;
5003
5004 if (left < 0)
5005 left = 0;
5006 if (right < 0)
5007 right = 0;
5008
Tim Peters7a29bd52001-09-12 03:03:31 +00005009 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 Py_INCREF(self);
5011 return self;
5012 }
5013
5014 u = _PyUnicode_New(left + self->length + right);
5015 if (u) {
5016 if (left)
5017 Py_UNICODE_FILL(u->str, fill, left);
5018 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5019 if (right)
5020 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5021 }
5022
5023 return u;
5024}
5025
/* Append a new unicode object holding data[left:right] to `list`.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `list` object and an `onError` label, which is jumped to when either
   the object creation or the append fails.  The reference to the new
   object is released again in every case; on success the list holds its
   own.  The body is wrapped in do { } while (0) so the macro acts as a
   single statement and must be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5036
5037static
5038PyObject *split_whitespace(PyUnicodeObject *self,
5039 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005040 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 register Py_ssize_t i;
5043 register Py_ssize_t j;
5044 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 PyObject *str;
5046
5047 for (i = j = 0; i < len; ) {
5048 /* find a token */
5049 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5050 i++;
5051 j = i;
5052 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5053 i++;
5054 if (j < i) {
5055 if (maxcount-- <= 0)
5056 break;
5057 SPLIT_APPEND(self->str, j, i);
5058 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5059 i++;
5060 j = i;
5061 }
5062 }
5063 if (j < len) {
5064 SPLIT_APPEND(self->str, j, len);
5065 }
5066 return list;
5067
5068 onError:
5069 Py_DECREF(list);
5070 return NULL;
5071}
5072
5073PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005074 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005076 register Py_ssize_t i;
5077 register Py_ssize_t j;
5078 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 PyObject *list;
5080 PyObject *str;
5081 Py_UNICODE *data;
5082
5083 string = PyUnicode_FromObject(string);
5084 if (string == NULL)
5085 return NULL;
5086 data = PyUnicode_AS_UNICODE(string);
5087 len = PyUnicode_GET_SIZE(string);
5088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 list = PyList_New(0);
5090 if (!list)
5091 goto onError;
5092
5093 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005095
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005097 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099
5100 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005101 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (i < len) {
5103 if (data[i] == '\r' && i + 1 < len &&
5104 data[i+1] == '\n')
5105 i += 2;
5106 else
5107 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005108 if (keepends)
5109 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 }
Guido van Rossum86662912000-04-11 15:38:46 +00005111 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 j = i;
5113 }
5114 if (j < len) {
5115 SPLIT_APPEND(data, j, len);
5116 }
5117
5118 Py_DECREF(string);
5119 return list;
5120
5121 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005122 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 Py_DECREF(string);
5124 return NULL;
5125}
5126
Tim Petersced69f82003-09-16 20:30:58 +00005127static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128PyObject *split_char(PyUnicodeObject *self,
5129 PyObject *list,
5130 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005131 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005133 register Py_ssize_t i;
5134 register Py_ssize_t j;
5135 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 PyObject *str;
5137
5138 for (i = j = 0; i < len; ) {
5139 if (self->str[i] == ch) {
5140 if (maxcount-- <= 0)
5141 break;
5142 SPLIT_APPEND(self->str, j, i);
5143 i = j = i + 1;
5144 } else
5145 i++;
5146 }
5147 if (j <= len) {
5148 SPLIT_APPEND(self->str, j, len);
5149 }
5150 return list;
5151
5152 onError:
5153 Py_DECREF(list);
5154 return NULL;
5155}
5156
Tim Petersced69f82003-09-16 20:30:58 +00005157static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158PyObject *split_substring(PyUnicodeObject *self,
5159 PyObject *list,
5160 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005161 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005163 register Py_ssize_t i;
5164 register Py_ssize_t j;
5165 Py_ssize_t len = self->length;
5166 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 PyObject *str;
5168
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005169 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 if (Py_UNICODE_MATCH(self, i, substring)) {
5171 if (maxcount-- <= 0)
5172 break;
5173 SPLIT_APPEND(self->str, j, i);
5174 i = j = i + sublen;
5175 } else
5176 i++;
5177 }
5178 if (j <= len) {
5179 SPLIT_APPEND(self->str, j, len);
5180 }
5181 return list;
5182
5183 onError:
5184 Py_DECREF(list);
5185 return NULL;
5186}
5187
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005188static
5189PyObject *rsplit_whitespace(PyUnicodeObject *self,
5190 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005191 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 register Py_ssize_t i;
5194 register Py_ssize_t j;
5195 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005196 PyObject *str;
5197
5198 for (i = j = len - 1; i >= 0; ) {
5199 /* find a token */
5200 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5201 i--;
5202 j = i;
5203 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5204 i--;
5205 if (j > i) {
5206 if (maxcount-- <= 0)
5207 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005208 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005209 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5210 i--;
5211 j = i;
5212 }
5213 }
5214 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005215 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005216 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005217 if (PyList_Reverse(list) < 0)
5218 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005219 return list;
5220
5221 onError:
5222 Py_DECREF(list);
5223 return NULL;
5224}
5225
5226static
5227PyObject *rsplit_char(PyUnicodeObject *self,
5228 PyObject *list,
5229 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 register Py_ssize_t i;
5233 register Py_ssize_t j;
5234 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005235 PyObject *str;
5236
5237 for (i = j = len - 1; i >= 0; ) {
5238 if (self->str[i] == ch) {
5239 if (maxcount-- <= 0)
5240 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005241 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005242 j = i = i - 1;
5243 } else
5244 i--;
5245 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005246 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005247 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005248 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005249 if (PyList_Reverse(list) < 0)
5250 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005251 return list;
5252
5253 onError:
5254 Py_DECREF(list);
5255 return NULL;
5256}
5257
5258static
5259PyObject *rsplit_substring(PyUnicodeObject *self,
5260 PyObject *list,
5261 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005262 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 register Py_ssize_t i;
5265 register Py_ssize_t j;
5266 Py_ssize_t len = self->length;
5267 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005268 PyObject *str;
5269
5270 for (i = len - sublen, j = len; i >= 0; ) {
5271 if (Py_UNICODE_MATCH(self, i, substring)) {
5272 if (maxcount-- <= 0)
5273 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005274 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005275 j = i;
5276 i -= sublen;
5277 } else
5278 i--;
5279 }
5280 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005281 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005282 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005283 if (PyList_Reverse(list) < 0)
5284 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005285 return list;
5286
5287 onError:
5288 Py_DECREF(list);
5289 return NULL;
5290}
5291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292#undef SPLIT_APPEND
5293
5294static
5295PyObject *split(PyUnicodeObject *self,
5296 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298{
5299 PyObject *list;
5300
5301 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005302 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303
5304 list = PyList_New(0);
5305 if (!list)
5306 return NULL;
5307
5308 if (substring == NULL)
5309 return split_whitespace(self,list,maxcount);
5310
5311 else if (substring->length == 1)
5312 return split_char(self,list,substring->str[0],maxcount);
5313
5314 else if (substring->length == 0) {
5315 Py_DECREF(list);
5316 PyErr_SetString(PyExc_ValueError, "empty separator");
5317 return NULL;
5318 }
5319 else
5320 return split_substring(self,list,substring,maxcount);
5321}
5322
Tim Petersced69f82003-09-16 20:30:58 +00005323static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005324PyObject *rsplit(PyUnicodeObject *self,
5325 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005327{
5328 PyObject *list;
5329
5330 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005331 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005332
5333 list = PyList_New(0);
5334 if (!list)
5335 return NULL;
5336
5337 if (substring == NULL)
5338 return rsplit_whitespace(self,list,maxcount);
5339
5340 else if (substring->length == 1)
5341 return rsplit_char(self,list,substring->str[0],maxcount);
5342
5343 else if (substring->length == 0) {
5344 Py_DECREF(list);
5345 PyErr_SetString(PyExc_ValueError, "empty separator");
5346 return NULL;
5347 }
5348 else
5349 return rsplit_substring(self,list,substring,maxcount);
5350}
5351
5352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353PyObject *replace(PyUnicodeObject *self,
5354 PyUnicodeObject *str1,
5355 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357{
5358 PyUnicodeObject *u;
5359
5360 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005361 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
Thomas Wouters477c8d52006-05-27 19:21:47 +00005363 if (str1->length == str2->length) {
5364 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005365 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005366 if (str1->length == 1) {
5367 /* replace characters */
5368 Py_UNICODE u1, u2;
5369 if (!findchar(self->str, self->length, str1->str[0]))
5370 goto nothing;
5371 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5372 if (!u)
5373 return NULL;
5374 Py_UNICODE_COPY(u->str, self->str, self->length);
5375 u1 = str1->str[0];
5376 u2 = str2->str[0];
5377 for (i = 0; i < u->length; i++)
5378 if (u->str[i] == u1) {
5379 if (--maxcount < 0)
5380 break;
5381 u->str[i] = u2;
5382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005384 i = fastsearch(
5385 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005387 if (i < 0)
5388 goto nothing;
5389 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5390 if (!u)
5391 return NULL;
5392 Py_UNICODE_COPY(u->str, self->str, self->length);
5393 while (i <= self->length - str1->length)
5394 if (Py_UNICODE_MATCH(self, i, str1)) {
5395 if (--maxcount < 0)
5396 break;
5397 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5398 i += str1->length;
5399 } else
5400 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005403
5404 Py_ssize_t n, i, j, e;
5405 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 Py_UNICODE *p;
5407
5408 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 if (n > maxcount)
5411 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005412 if (n == 0)
5413 goto nothing;
5414 /* new_size = self->length + n * (str2->length - str1->length)); */
5415 delta = (str2->length - str1->length);
5416 if (delta == 0) {
5417 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 product = n * (str2->length - str1->length);
5420 if ((product / (str2->length - str1->length)) != n) {
5421 PyErr_SetString(PyExc_OverflowError,
5422 "replace string is too long");
5423 return NULL;
5424 }
5425 new_size = self->length + product;
5426 if (new_size < 0) {
5427 PyErr_SetString(PyExc_OverflowError,
5428 "replace string is too long");
5429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 }
5431 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005432 u = _PyUnicode_New(new_size);
5433 if (!u)
5434 return NULL;
5435 i = 0;
5436 p = u->str;
5437 e = self->length - str1->length;
5438 if (str1->length > 0) {
5439 while (n-- > 0) {
5440 /* look for next match */
5441 j = i;
5442 while (j <= e) {
5443 if (Py_UNICODE_MATCH(self, j, str1))
5444 break;
5445 j++;
5446 }
5447 if (j > i) {
5448 if (j > e)
5449 break;
5450 /* copy unchanged part [i:j] */
5451 Py_UNICODE_COPY(p, self->str+i, j-i);
5452 p += j - i;
5453 }
5454 /* copy substitution string */
5455 if (str2->length > 0) {
5456 Py_UNICODE_COPY(p, str2->str, str2->length);
5457 p += str2->length;
5458 }
5459 i = j + str1->length;
5460 }
5461 if (i < self->length)
5462 /* copy tail [i:] */
5463 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5464 } else {
5465 /* interleave */
5466 while (n > 0) {
5467 Py_UNICODE_COPY(p, str2->str, str2->length);
5468 p += str2->length;
5469 if (--n <= 0)
5470 break;
5471 *p++ = self->str[i++];
5472 }
5473 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005477
5478nothing:
5479 /* nothing to replace; return original string (when possible) */
5480 if (PyUnicode_CheckExact(self)) {
5481 Py_INCREF(self);
5482 return (PyObject *) self;
5483 }
5484 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485}
5486
5487/* --- Unicode Object Methods --------------------------------------------- */
5488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005489PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490"S.title() -> unicode\n\
5491\n\
5492Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005493characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
5495static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005496unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 return fixup(self, fixtitle);
5499}
5500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005501PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502"S.capitalize() -> unicode\n\
5503\n\
5504Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005505have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506
5507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005508unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 return fixup(self, fixcapitalize);
5511}
5512
#if 0
/* Disabled: this whole section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word and join the words
   again with PyUnicode_Join() using its default separator.  Returns a
   new reference, or NULL on error. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5550
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005551/* Argument converter. Coerces to a single unicode character */
5552
5553static int
5554convert_uc(PyObject *obj, void *addr)
5555{
5556 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5557 PyObject *uniobj;
5558 Py_UNICODE *unistr;
5559
5560 uniobj = PyUnicode_FromObject(obj);
5561 if (uniobj == NULL) {
5562 PyErr_SetString(PyExc_TypeError,
5563 "The fill character cannot be converted to Unicode");
5564 return 0;
5565 }
5566 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5567 PyErr_SetString(PyExc_TypeError,
5568 "The fill character must be exactly one character long");
5569 Py_DECREF(uniobj);
5570 return 0;
5571 }
5572 unistr = PyUnicode_AS_UNICODE(uniobj);
5573 *fillcharloc = unistr[0];
5574 Py_DECREF(uniobj);
5575 return 1;
5576}
5577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005578PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005579"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005581Return S centered in a Unicode string of length width. Padding is\n\
5582done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
5584static PyObject *
5585unicode_center(PyUnicodeObject *self, PyObject *args)
5586{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 Py_ssize_t marg, left;
5588 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005589 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590
Thomas Woutersde017742006-02-16 19:34:37 +00005591 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 return NULL;
5593
Tim Peters7a29bd52001-09-12 03:03:31 +00005594 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 Py_INCREF(self);
5596 return (PyObject*) self;
5597 }
5598
5599 marg = width - self->length;
5600 left = marg / 2 + (marg & width & 1);
5601
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005602 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603}
5604
Marc-André Lemburge5034372000-08-08 08:04:29 +00005605#if 0
5606
5607/* This code should go into some future Unicode collation support
5608 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005609 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005610
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005611/* speedy UTF-16 code point order comparison */
5612/* gleaned from: */
5613/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5614
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005615static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005616{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005617 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005618 0, 0, 0, 0, 0, 0, 0, 0,
5619 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005620 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005621};
5622
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623static int
5624unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005626 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 Py_UNICODE *s1 = str1->str;
5629 Py_UNICODE *s2 = str2->str;
5630
5631 len1 = str1->length;
5632 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005633
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005635 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005636
5637 c1 = *s1++;
5638 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005639
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005640 if (c1 > (1<<11) * 26)
5641 c1 += utf16Fixup[c1>>11];
5642 if (c2 > (1<<11) * 26)
5643 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005644 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005645
5646 if (c1 != c2)
5647 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005648
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005649 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 }
5651
5652 return (len1 < len2) ? -1 : (len1 != len2);
5653}
5654
Marc-André Lemburge5034372000-08-08 08:04:29 +00005655#else
5656
5657static int
5658unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005661
5662 Py_UNICODE *s1 = str1->str;
5663 Py_UNICODE *s2 = str2->str;
5664
5665 len1 = str1->length;
5666 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005667
Marc-André Lemburge5034372000-08-08 08:04:29 +00005668 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005669 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005670
Fredrik Lundh45714e92001-06-26 16:39:36 +00005671 c1 = *s1++;
5672 c2 = *s2++;
5673
5674 if (c1 != c2)
5675 return (c1 < c2) ? -1 : 1;
5676
Marc-André Lemburge5034372000-08-08 08:04:29 +00005677 len1--; len2--;
5678 }
5679
5680 return (len1 < len2) ? -1 : (len1 != len2);
5681}
5682
5683#endif
5684
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685int PyUnicode_Compare(PyObject *left,
5686 PyObject *right)
5687{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005688 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5689 return unicode_compare((PyUnicodeObject *)left,
5690 (PyUnicodeObject *)right);
5691 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5692 (PyUnicode_Check(left) && PyString_Check(right))) {
5693 if (PyUnicode_Check(left))
5694 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5695 if (PyUnicode_Check(right))
5696 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5697 assert(PyString_Check(left));
5698 assert(PyString_Check(right));
5699 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005701 PyErr_Format(PyExc_TypeError,
5702 "Can't compare %.100s and %.100s",
5703 left->ob_type->tp_name,
5704 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return -1;
5706}
5707
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005708PyObject *PyUnicode_RichCompare(PyObject *left,
5709 PyObject *right,
5710 int op)
5711{
5712 int result;
5713
5714 result = PyUnicode_Compare(left, right);
5715 if (result == -1 && PyErr_Occurred())
5716 goto onError;
5717
5718 /* Convert the return value to a Boolean */
5719 switch (op) {
5720 case Py_EQ:
5721 result = (result == 0);
5722 break;
5723 case Py_NE:
5724 result = (result != 0);
5725 break;
5726 case Py_LE:
5727 result = (result <= 0);
5728 break;
5729 case Py_GE:
5730 result = (result >= 0);
5731 break;
5732 case Py_LT:
5733 result = (result == -1);
5734 break;
5735 case Py_GT:
5736 result = (result == 1);
5737 break;
5738 }
5739 return PyBool_FromLong(result);
5740
5741 onError:
5742
5743 /* Standard case
5744
5745 Type errors mean that PyUnicode_FromObject() could not convert
5746 one of the arguments (usually the right hand side) to Unicode,
5747 ie. we can't handle the comparison request. However, it is
5748 possible that the other object knows a comparison method, which
5749 is why we return Py_NotImplemented to give the other object a
5750 chance.
5751
5752 */
5753 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5754 PyErr_Clear();
5755 Py_INCREF(Py_NotImplemented);
5756 return Py_NotImplemented;
5757 }
5758 if (op != Py_EQ && op != Py_NE)
5759 return NULL;
5760
5761 /* Equality comparison.
5762
5763 This is a special case: we silence any PyExc_UnicodeDecodeError
5764 and instead turn it into a PyErr_UnicodeWarning.
5765
5766 */
5767 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5768 return NULL;
5769 PyErr_Clear();
5770 if (PyErr_Warn(PyExc_UnicodeWarning,
5771 (op == Py_EQ) ?
5772 "Unicode equal comparison "
5773 "failed to convert both arguments to Unicode - "
5774 "interpreting them as being unequal" :
5775 "Unicode unequal comparison "
5776 "failed to convert both arguments to Unicode - "
5777 "interpreting them as being unequal"
5778 ) < 0)
5779 return NULL;
5780 result = (op == Py_NE);
5781 return PyBool_FromLong(result);
5782}
5783
Guido van Rossum403d68b2000-03-13 15:55:09 +00005784int PyUnicode_Contains(PyObject *container,
5785 PyObject *element)
5786{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005787 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005788 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005789
5790 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005791 sub = PyUnicode_FromObject(element);
5792 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005793 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005794 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005795 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005796 }
5797
Thomas Wouters477c8d52006-05-27 19:21:47 +00005798 str = PyUnicode_FromObject(container);
5799 if (!str) {
5800 Py_DECREF(sub);
5801 return -1;
5802 }
5803
5804 result = stringlib_contains_obj(str, sub);
5805
5806 Py_DECREF(str);
5807 Py_DECREF(sub);
5808
Guido van Rossum403d68b2000-03-13 15:55:09 +00005809 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005810}
5811
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812/* Concat to string or Unicode object giving a new Unicode object. */
5813
5814PyObject *PyUnicode_Concat(PyObject *left,
5815 PyObject *right)
5816{
5817 PyUnicodeObject *u = NULL, *v = NULL, *w;
5818
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005819 if (PyBytes_Check(left) || PyBytes_Check(right))
5820 return PyBytes_Concat(left, right);
5821
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 /* Coerce the two arguments */
5823 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5824 if (u == NULL)
5825 goto onError;
5826 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5827 if (v == NULL)
5828 goto onError;
5829
5830 /* Shortcuts */
5831 if (v == unicode_empty) {
5832 Py_DECREF(v);
5833 return (PyObject *)u;
5834 }
5835 if (u == unicode_empty) {
5836 Py_DECREF(u);
5837 return (PyObject *)v;
5838 }
5839
5840 /* Concat the two Unicode strings */
5841 w = _PyUnicode_New(u->length + v->length);
5842 if (w == NULL)
5843 goto onError;
5844 Py_UNICODE_COPY(w->str, u->str, u->length);
5845 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5846
5847 Py_DECREF(u);
5848 Py_DECREF(v);
5849 return (PyObject *)w;
5850
5851onError:
5852 Py_XDECREF(u);
5853 Py_XDECREF(v);
5854 return NULL;
5855}
5856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005857PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858"S.count(sub[, start[, end]]) -> int\n\
5859\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005860Return the number of non-overlapping occurrences of substring sub in\n\
5861Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005862interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
5864static PyObject *
5865unicode_count(PyUnicodeObject *self, PyObject *args)
5866{
5867 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005869 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 PyObject *result;
5871
Guido van Rossumb8872e62000-05-09 14:14:27 +00005872 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5873 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 return NULL;
5875
5876 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 if (substring == NULL)
5879 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005880
Thomas Wouters477c8d52006-05-27 19:21:47 +00005881 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882
Thomas Wouters477c8d52006-05-27 19:21:47 +00005883 result = PyInt_FromSsize_t(
5884 stringlib_count(self->str + start, end - start,
5885 substring->str, substring->length)
5886 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
5888 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 return result;
5891}
5892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005893PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005894"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005896Encodes S using the codec registered for encoding. encoding defaults\n\
5897to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005898handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5900'xmlcharrefreplace' as well as any other name registered with\n\
5901codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
5903static PyObject *
5904unicode_encode(PyUnicodeObject *self, PyObject *args)
5905{
5906 char *encoding = NULL;
5907 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005908 PyObject *v;
5909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5911 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005912 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005913 if (v == NULL)
5914 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005915 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005916 if (PyString_Check(v)) {
5917 /* Old codec, turn it into bytes */
5918 PyObject *b = PyBytes_FromObject(v);
5919 Py_DECREF(v);
5920 return b;
5921 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005922 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005923 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005924 "(type=%.400s)",
5925 v->ob_type->tp_name);
5926 Py_DECREF(v);
5927 return NULL;
5928 }
5929 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005930
5931 onError:
5932 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005933}
5934
5935PyDoc_STRVAR(decode__doc__,
5936"S.decode([encoding[,errors]]) -> string or unicode\n\
5937\n\
5938Decodes S using the codec registered for encoding. encoding defaults\n\
5939to the default encoding. errors may be given to set a different error\n\
5940handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5941a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5942as well as any other name registerd with codecs.register_error that is\n\
5943able to handle UnicodeDecodeErrors.");
5944
5945static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005946unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005947{
5948 char *encoding = NULL;
5949 char *errors = NULL;
5950 PyObject *v;
5951
5952 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5953 return NULL;
5954 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005955 if (v == NULL)
5956 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005957 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5958 PyErr_Format(PyExc_TypeError,
5959 "decoder did not return a string/unicode object "
5960 "(type=%.400s)",
5961 v->ob_type->tp_name);
5962 Py_DECREF(v);
5963 return NULL;
5964 }
5965 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005966
5967 onError:
5968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969}
5970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005971PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972"S.expandtabs([tabsize]) -> unicode\n\
5973\n\
5974Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005975If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977static PyObject*
5978unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5979{
5980 Py_UNICODE *e;
5981 Py_UNICODE *p;
5982 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005983 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 PyUnicodeObject *u;
5985 int tabsize = 8;
5986
5987 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5988 return NULL;
5989
Thomas Wouters7e474022000-07-16 12:04:32 +00005990 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 i = j = 0;
5992 e = self->str + self->length;
5993 for (p = self->str; p < e; p++)
5994 if (*p == '\t') {
5995 if (tabsize > 0)
5996 j += tabsize - (j % tabsize);
5997 }
5998 else {
5999 j++;
6000 if (*p == '\n' || *p == '\r') {
6001 i += j;
6002 j = 0;
6003 }
6004 }
6005
6006 /* Second pass: create output string and fill it */
6007 u = _PyUnicode_New(i + j);
6008 if (!u)
6009 return NULL;
6010
6011 j = 0;
6012 q = u->str;
6013
6014 for (p = self->str; p < e; p++)
6015 if (*p == '\t') {
6016 if (tabsize > 0) {
6017 i = tabsize - (j % tabsize);
6018 j += i;
6019 while (i--)
6020 *q++ = ' ';
6021 }
6022 }
6023 else {
6024 j++;
6025 *q++ = *p;
6026 if (*p == '\n' || *p == '\r')
6027 j = 0;
6028 }
6029
6030 return (PyObject*) u;
6031}
6032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006033PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034"S.find(sub [,start [,end]]) -> int\n\
6035\n\
6036Return the lowest index in S where substring sub is found,\n\
6037such that sub is contained within s[start,end]. Optional\n\
6038arguments start and end are interpreted as in slice notation.\n\
6039\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006040Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
6042static PyObject *
6043unicode_find(PyUnicodeObject *self, PyObject *args)
6044{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006045 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006046 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006047 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006048 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049
Guido van Rossumb8872e62000-05-09 14:14:27 +00006050 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6051 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006053 substring = PyUnicode_FromObject(substring);
6054 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 return NULL;
6056
Thomas Wouters477c8d52006-05-27 19:21:47 +00006057 result = stringlib_find_slice(
6058 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6059 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6060 start, end
6061 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
6063 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006064
6065 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
6068static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006069unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070{
6071 if (index < 0 || index >= self->length) {
6072 PyErr_SetString(PyExc_IndexError, "string index out of range");
6073 return NULL;
6074 }
6075
6076 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6077}
6078
6079static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006080unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006082 /* Since Unicode objects compare equal to their UTF-8 string
6083 counterparts, we hash the UTF-8 string. */
6084 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6085 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086}
6087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006088PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089"S.index(sub [,start [,end]]) -> int\n\
6090\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092
6093static PyObject *
6094unicode_index(PyUnicodeObject *self, PyObject *args)
6095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006097 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006098 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006099 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
Guido van Rossumb8872e62000-05-09 14:14:27 +00006101 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6102 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006104 substring = PyUnicode_FromObject(substring);
6105 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return NULL;
6107
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108 result = stringlib_find_slice(
6109 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6110 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6111 start, end
6112 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
6114 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 if (result < 0) {
6117 PyErr_SetString(PyExc_ValueError, "substring not found");
6118 return NULL;
6119 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006120
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122}
6123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006124PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006125"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006127Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006128at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
6130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006131unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
6133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6134 register const Py_UNICODE *e;
6135 int cased;
6136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 /* Shortcut for single character strings */
6138 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006139 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006141 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006142 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006144
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 e = p + PyUnicode_GET_SIZE(self);
6146 cased = 0;
6147 for (; p < e; p++) {
6148 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006149
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006151 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 else if (!cased && Py_UNICODE_ISLOWER(ch))
6153 cased = 1;
6154 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006155 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156}
6157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006158PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006159"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006161Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006162at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
6164static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006165unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
6167 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6168 register const Py_UNICODE *e;
6169 int cased;
6170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 /* Shortcut for single character strings */
6172 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006173 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006175 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006176 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006178
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 e = p + PyUnicode_GET_SIZE(self);
6180 cased = 0;
6181 for (; p < e; p++) {
6182 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006185 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 else if (!cased && Py_UNICODE_ISUPPER(ch))
6187 cased = 1;
6188 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006189 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190}
6191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006192PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006193"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006195Return True if S is a titlecased string and there is at least one\n\
6196character in S, i.e. upper- and titlecase characters may only\n\
6197follow uncased characters and lowercase characters only cased ones.\n\
6198Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
6200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006201unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202{
6203 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6204 register const Py_UNICODE *e;
6205 int cased, previous_is_cased;
6206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 /* Shortcut for single character strings */
6208 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006209 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6210 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006212 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006213 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006214 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006215
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 e = p + PyUnicode_GET_SIZE(self);
6217 cased = 0;
6218 previous_is_cased = 0;
6219 for (; p < e; p++) {
6220 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006221
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6223 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006224 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 previous_is_cased = 1;
6226 cased = 1;
6227 }
6228 else if (Py_UNICODE_ISLOWER(ch)) {
6229 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006230 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 previous_is_cased = 1;
6232 cased = 1;
6233 }
6234 else
6235 previous_is_cased = 0;
6236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006237 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238}
6239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006240PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006241"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006243Return True if all characters in S are whitespace\n\
6244and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245
6246static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006247unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248{
6249 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6250 register const Py_UNICODE *e;
6251
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 /* Shortcut for single character strings */
6253 if (PyUnicode_GET_SIZE(self) == 1 &&
6254 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006255 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006257 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006258 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006259 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 e = p + PyUnicode_GET_SIZE(self);
6262 for (; p < e; p++) {
6263 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006264 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006266 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267}
6268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006269PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006270"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006271\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006272Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006273and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006274
6275static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006276unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006277{
6278 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6279 register const Py_UNICODE *e;
6280
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006281 /* Shortcut for single character strings */
6282 if (PyUnicode_GET_SIZE(self) == 1 &&
6283 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006284 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006285
6286 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006287 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006288 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006289
6290 e = p + PyUnicode_GET_SIZE(self);
6291 for (; p < e; p++) {
6292 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006293 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006294 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006295 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006296}
6297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006299"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006300\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006301Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006302and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006303
6304static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006305unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006306{
6307 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6308 register const Py_UNICODE *e;
6309
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006310 /* Shortcut for single character strings */
6311 if (PyUnicode_GET_SIZE(self) == 1 &&
6312 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006313 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006314
6315 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006316 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006317 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006318
6319 e = p + PyUnicode_GET_SIZE(self);
6320 for (; p < e; p++) {
6321 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006322 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006323 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006324 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006325}
6326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006327PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006328"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006330Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006331False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006334unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335{
6336 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6337 register const Py_UNICODE *e;
6338
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 /* Shortcut for single character strings */
6340 if (PyUnicode_GET_SIZE(self) == 1 &&
6341 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006342 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006344 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006345 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006346 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006347
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 e = p + PyUnicode_GET_SIZE(self);
6349 for (; p < e; p++) {
6350 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006351 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006353 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354}
6355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006356PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006357"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006359Return True if all characters in S are digits\n\
6360and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
6362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006363unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364{
6365 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6366 register const Py_UNICODE *e;
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 /* Shortcut for single character strings */
6369 if (PyUnicode_GET_SIZE(self) == 1 &&
6370 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006371 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006373 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006374 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 e = p + PyUnicode_GET_SIZE(self);
6378 for (; p < e; p++) {
6379 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006385PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006388Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390
6391static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006392unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6395 register const Py_UNICODE *e;
6396
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 /* Shortcut for single character strings */
6398 if (PyUnicode_GET_SIZE(self) == 1 &&
6399 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006400 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006402 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006403 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 e = p + PyUnicode_GET_SIZE(self);
6407 for (; p < e; p++) {
6408 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006409 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412}
6413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415"S.join(sequence) -> unicode\n\
6416\n\
6417Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006418sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
6420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006421unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006423 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424}
6425
Martin v. Löwis18e16552006-02-15 17:27:45 +00006426static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427unicode_length(PyUnicodeObject *self)
6428{
6429 return self->length;
6430}
6431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006432PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006433"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434\n\
6435Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006436done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
6438static PyObject *
6439unicode_ljust(PyUnicodeObject *self, PyObject *args)
6440{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006441 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006442 Py_UNICODE fillchar = ' ';
6443
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006444 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 return NULL;
6446
Tim Peters7a29bd52001-09-12 03:03:31 +00006447 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 Py_INCREF(self);
6449 return (PyObject*) self;
6450 }
6451
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006452 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453}
6454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006455PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456"S.lower() -> unicode\n\
6457\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006458Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
6460static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006461unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 return fixup(self, fixlower);
6464}
6465
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for mode i: the format string without its leading
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
6474
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006475/* externally visible for str.strip(unicode) */
6476PyObject *
6477_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6478{
6479 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006480 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006481 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006482 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6483 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006484
Thomas Wouters477c8d52006-05-27 19:21:47 +00006485 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6486
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006487 i = 0;
6488 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006489 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6490 i++;
6491 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006492 }
6493
6494 j = len;
6495 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006496 do {
6497 j--;
6498 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6499 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006500 }
6501
6502 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 Py_INCREF(self);
6504 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006505 }
6506 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006507 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006508}
6509
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006512do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006514 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006515 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006516
6517 i = 0;
6518 if (striptype != RIGHTSTRIP) {
6519 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6520 i++;
6521 }
6522 }
6523
6524 j = len;
6525 if (striptype != LEFTSTRIP) {
6526 do {
6527 j--;
6528 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6529 j++;
6530 }
6531
6532 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6533 Py_INCREF(self);
6534 return (PyObject*)self;
6535 }
6536 else
6537 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538}
6539
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006540
6541static PyObject *
6542do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6543{
6544 PyObject *sep = NULL;
6545
6546 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6547 return NULL;
6548
6549 if (sep != NULL && sep != Py_None) {
6550 if (PyUnicode_Check(sep))
6551 return _PyUnicode_XStrip(self, striptype, sep);
6552 else if (PyString_Check(sep)) {
6553 PyObject *res;
6554 sep = PyUnicode_FromObject(sep);
6555 if (sep==NULL)
6556 return NULL;
6557 res = _PyUnicode_XStrip(self, striptype, sep);
6558 Py_DECREF(sep);
6559 return res;
6560 }
6561 else {
6562 PyErr_Format(PyExc_TypeError,
6563 "%s arg must be None, unicode or str",
6564 STRIPNAME(striptype));
6565 return NULL;
6566 }
6567 }
6568
6569 return do_strip(self, striptype);
6570}
6571
6572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006573PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006574"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006575\n\
6576Return a copy of the string S with leading and trailing\n\
6577whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006578If chars is given and not None, remove characters in chars instead.\n\
6579If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006580
6581static PyObject *
6582unicode_strip(PyUnicodeObject *self, PyObject *args)
6583{
6584 if (PyTuple_GET_SIZE(args) == 0)
6585 return do_strip(self, BOTHSTRIP); /* Common case */
6586 else
6587 return do_argstrip(self, BOTHSTRIP, args);
6588}
6589
6590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006591PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006592"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006593\n\
6594Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006595If chars is given and not None, remove characters in chars instead.\n\
6596If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006597
6598static PyObject *
6599unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6600{
6601 if (PyTuple_GET_SIZE(args) == 0)
6602 return do_strip(self, LEFTSTRIP); /* Common case */
6603 else
6604 return do_argstrip(self, LEFTSTRIP, args);
6605}
6606
6607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006608PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006609"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006610\n\
6611Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006612If chars is given and not None, remove characters in chars instead.\n\
6613If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006614
6615static PyObject *
6616unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6617{
6618 if (PyTuple_GET_SIZE(args) == 0)
6619 return do_strip(self, RIGHTSTRIP); /* Common case */
6620 else
6621 return do_argstrip(self, RIGHTSTRIP, args);
6622}
6623
6624
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006626unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
6628 PyUnicodeObject *u;
6629 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006630 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006631 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
6633 if (len < 0)
6634 len = 0;
6635
Tim Peters7a29bd52001-09-12 03:03:31 +00006636 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 /* no repeat, return original string */
6638 Py_INCREF(str);
6639 return (PyObject*) str;
6640 }
Tim Peters8f422462000-09-09 06:13:41 +00006641
6642 /* ensure # of chars needed doesn't overflow int and # of bytes
6643 * needed doesn't overflow size_t
6644 */
6645 nchars = len * str->length;
6646 if (len && nchars / len != str->length) {
6647 PyErr_SetString(PyExc_OverflowError,
6648 "repeated string is too long");
6649 return NULL;
6650 }
6651 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6652 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6653 PyErr_SetString(PyExc_OverflowError,
6654 "repeated string is too long");
6655 return NULL;
6656 }
6657 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 if (!u)
6659 return NULL;
6660
6661 p = u->str;
6662
Thomas Wouters477c8d52006-05-27 19:21:47 +00006663 if (str->length == 1 && len > 0) {
6664 Py_UNICODE_FILL(p, str->str[0], len);
6665 } else {
6666 Py_ssize_t done = 0; /* number of characters copied this far */
6667 if (done < nchars) {
6668 Py_UNICODE_COPY(p, str->str, str->length);
6669 done = str->length;
6670 }
6671 while (done < nchars) {
6672 int n = (done <= nchars-done) ? done : nchars-done;
6673 Py_UNICODE_COPY(p+done, p, n);
6674 done += n;
6675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 }
6677
6678 return (PyObject*) u;
6679}
6680
6681PyObject *PyUnicode_Replace(PyObject *obj,
6682 PyObject *subobj,
6683 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006684 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685{
6686 PyObject *self;
6687 PyObject *str1;
6688 PyObject *str2;
6689 PyObject *result;
6690
6691 self = PyUnicode_FromObject(obj);
6692 if (self == NULL)
6693 return NULL;
6694 str1 = PyUnicode_FromObject(subobj);
6695 if (str1 == NULL) {
6696 Py_DECREF(self);
6697 return NULL;
6698 }
6699 str2 = PyUnicode_FromObject(replobj);
6700 if (str2 == NULL) {
6701 Py_DECREF(self);
6702 Py_DECREF(str1);
6703 return NULL;
6704 }
Tim Petersced69f82003-09-16 20:30:58 +00006705 result = replace((PyUnicodeObject *)self,
6706 (PyUnicodeObject *)str1,
6707 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 maxcount);
6709 Py_DECREF(self);
6710 Py_DECREF(str1);
6711 Py_DECREF(str2);
6712 return result;
6713}
6714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006715PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716"S.replace (old, new[, maxsplit]) -> unicode\n\
6717\n\
6718Return a copy of S with all occurrences of substring\n\
6719old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
6722static PyObject*
6723unicode_replace(PyUnicodeObject *self, PyObject *args)
6724{
6725 PyUnicodeObject *str1;
6726 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006727 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 PyObject *result;
6729
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return NULL;
6732 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6733 if (str1 == NULL)
6734 return NULL;
6735 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006736 if (str2 == NULL) {
6737 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
6741 result = replace(self, str1, str2, maxcount);
6742
6743 Py_DECREF(str1);
6744 Py_DECREF(str2);
6745 return result;
6746}
6747
6748static
6749PyObject *unicode_repr(PyObject *unicode)
6750{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006751 PyObject *repr;
6752 char *p;
6753 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6754 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6755
6756 /* XXX(nnorwitz): rather than over-allocating, it would be
6757 better to choose a different scheme. Perhaps scan the
6758 first N-chars of the string and allocate based on that size.
6759 */
6760 /* Initial allocation is based on the longest-possible unichr
6761 escape.
6762
6763 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6764 unichr, so in this case it's the longest unichr escape. In
6765 narrow (UTF-16) builds this is five chars per source unichr
6766 since there are two unichrs in the surrogate pair, so in narrow
6767 (UTF-16) builds it's not the longest unichr escape.
6768
6769 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6770 so in the narrow (UTF-16) build case it's the longest unichr
6771 escape.
6772 */
6773
6774 repr = PyString_FromStringAndSize(NULL,
6775 2 /* quotes */
6776#ifdef Py_UNICODE_WIDE
6777 + 10*size
6778#else
6779 + 6*size
6780#endif
6781 + 1);
6782 if (repr == NULL)
6783 return NULL;
6784
6785 p = PyString_AS_STRING(repr);
6786
6787 /* Add quote */
6788 *p++ = (findchar(s, size, '\'') &&
6789 !findchar(s, size, '"')) ? '"' : '\'';
6790 while (size-- > 0) {
6791 Py_UNICODE ch = *s++;
6792
6793 /* Escape quotes and backslashes */
6794 if ((ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || (ch == '\\')) {
6795 *p++ = '\\';
6796 *p++ = (char) ch;
6797 continue;
6798 }
6799
6800#ifdef Py_UNICODE_WIDE
6801 /* Map 21-bit characters to '\U00xxxxxx' */
6802 else if (ch >= 0x10000) {
6803 *p++ = '\\';
6804 *p++ = 'U';
6805 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6806 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6807 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6808 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6809 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6810 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6811 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6812 *p++ = hexdigits[ch & 0x0000000F];
6813 continue;
6814 }
6815#else
6816 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6817 else if (ch >= 0xD800 && ch < 0xDC00) {
6818 Py_UNICODE ch2;
6819 Py_UCS4 ucs;
6820
6821 ch2 = *s++;
6822 size--;
6823 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6824 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6825 *p++ = '\\';
6826 *p++ = 'U';
6827 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6828 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6829 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6830 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6831 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6832 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6833 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6834 *p++ = hexdigits[ucs & 0x0000000F];
6835 continue;
6836 }
6837 /* Fall through: isolated surrogates are copied as-is */
6838 s--;
6839 size++;
6840 }
6841#endif
6842
6843 /* Map 16-bit characters to '\uxxxx' */
6844 if (ch >= 256) {
6845 *p++ = '\\';
6846 *p++ = 'u';
6847 *p++ = hexdigits[(ch >> 12) & 0x000F];
6848 *p++ = hexdigits[(ch >> 8) & 0x000F];
6849 *p++ = hexdigits[(ch >> 4) & 0x000F];
6850 *p++ = hexdigits[ch & 0x000F];
6851 }
6852
6853 /* Map special whitespace to '\t', \n', '\r' */
6854 else if (ch == '\t') {
6855 *p++ = '\\';
6856 *p++ = 't';
6857 }
6858 else if (ch == '\n') {
6859 *p++ = '\\';
6860 *p++ = 'n';
6861 }
6862 else if (ch == '\r') {
6863 *p++ = '\\';
6864 *p++ = 'r';
6865 }
6866
6867 /* Map non-printable US ASCII to '\xhh' */
6868 else if (ch < ' ' || ch >= 0x7F) {
6869 *p++ = '\\';
6870 *p++ = 'x';
6871 *p++ = hexdigits[(ch >> 4) & 0x000F];
6872 *p++ = hexdigits[ch & 0x000F];
6873 }
6874
6875 /* Copy everything else as-is */
6876 else
6877 *p++ = (char) ch;
6878 }
6879 /* Add quote */
6880 *p++ = PyString_AS_STRING(repr)[0];
6881
6882 *p = '\0';
6883 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
6884 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885}
6886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888"S.rfind(sub [,start [,end]]) -> int\n\
6889\n\
6890Return the highest index in S where substring sub is found,\n\
6891such that sub is contained within s[start,end]. Optional\n\
6892arguments start and end are interpreted as in slice notation.\n\
6893\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
6896static PyObject *
6897unicode_rfind(PyUnicodeObject *self, PyObject *args)
6898{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006899 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006900 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006901 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006902 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
Guido van Rossumb8872e62000-05-09 14:14:27 +00006904 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6905 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006907 substring = PyUnicode_FromObject(substring);
6908 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 return NULL;
6910
Thomas Wouters477c8d52006-05-27 19:21:47 +00006911 result = stringlib_rfind_slice(
6912 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6913 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6914 start, end
6915 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
6917 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006918
6919 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006922PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923"S.rindex(sub [,start [,end]]) -> int\n\
6924\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006925Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927static PyObject *
6928unicode_rindex(PyUnicodeObject *self, PyObject *args)
6929{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006930 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006932 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006933 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
Guido van Rossumb8872e62000-05-09 14:14:27 +00006935 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6936 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006938 substring = PyUnicode_FromObject(substring);
6939 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 return NULL;
6941
Thomas Wouters477c8d52006-05-27 19:21:47 +00006942 result = stringlib_rfind_slice(
6943 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6944 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6945 start, end
6946 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
6948 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 if (result < 0) {
6951 PyErr_SetString(PyExc_ValueError, "substring not found");
6952 return NULL;
6953 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006954 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955}
6956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006958"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959\n\
6960Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006961done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
6963static PyObject *
6964unicode_rjust(PyUnicodeObject *self, PyObject *args)
6965{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006966 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006967 Py_UNICODE fillchar = ' ';
6968
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006969 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970 return NULL;
6971
Tim Peters7a29bd52001-09-12 03:03:31 +00006972 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 Py_INCREF(self);
6974 return (PyObject*) self;
6975 }
6976
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006977 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982{
6983 /* standard clamping */
6984 if (start < 0)
6985 start = 0;
6986 if (end < 0)
6987 end = 0;
6988 if (end > self->length)
6989 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006990 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 /* full slice, return original string */
6992 Py_INCREF(self);
6993 return (PyObject*) self;
6994 }
6995 if (start > end)
6996 start = end;
6997 /* copy slice */
6998 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6999 end - start);
7000}
7001
7002PyObject *PyUnicode_Split(PyObject *s,
7003 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007004 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005{
7006 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007007
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 s = PyUnicode_FromObject(s);
7009 if (s == NULL)
7010 return NULL;
7011 if (sep != NULL) {
7012 sep = PyUnicode_FromObject(sep);
7013 if (sep == NULL) {
7014 Py_DECREF(s);
7015 return NULL;
7016 }
7017 }
7018
7019 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7020
7021 Py_DECREF(s);
7022 Py_XDECREF(sep);
7023 return result;
7024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027"S.split([sep [,maxsplit]]) -> list of strings\n\
7028\n\
7029Return a list of the words in S, using sep as the\n\
7030delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007031splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007032any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033
7034static PyObject*
7035unicode_split(PyUnicodeObject *self, PyObject *args)
7036{
7037 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007038 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039
Martin v. Löwis18e16552006-02-15 17:27:45 +00007040 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 return NULL;
7042
7043 if (substring == Py_None)
7044 return split(self, NULL, maxcount);
7045 else if (PyUnicode_Check(substring))
7046 return split(self, (PyUnicodeObject *)substring, maxcount);
7047 else
7048 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7049}
7050
Thomas Wouters477c8d52006-05-27 19:21:47 +00007051PyObject *
7052PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7053{
7054 PyObject* str_obj;
7055 PyObject* sep_obj;
7056 PyObject* out;
7057
7058 str_obj = PyUnicode_FromObject(str_in);
7059 if (!str_obj)
7060 return NULL;
7061 sep_obj = PyUnicode_FromObject(sep_in);
7062 if (!sep_obj) {
7063 Py_DECREF(str_obj);
7064 return NULL;
7065 }
7066
7067 out = stringlib_partition(
7068 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7069 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7070 );
7071
7072 Py_DECREF(sep_obj);
7073 Py_DECREF(str_obj);
7074
7075 return out;
7076}
7077
7078
7079PyObject *
7080PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7081{
7082 PyObject* str_obj;
7083 PyObject* sep_obj;
7084 PyObject* out;
7085
7086 str_obj = PyUnicode_FromObject(str_in);
7087 if (!str_obj)
7088 return NULL;
7089 sep_obj = PyUnicode_FromObject(sep_in);
7090 if (!sep_obj) {
7091 Py_DECREF(str_obj);
7092 return NULL;
7093 }
7094
7095 out = stringlib_rpartition(
7096 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7097 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7098 );
7099
7100 Py_DECREF(sep_obj);
7101 Py_DECREF(str_obj);
7102
7103 return out;
7104}
7105
7106PyDoc_STRVAR(partition__doc__,
7107"S.partition(sep) -> (head, sep, tail)\n\
7108\n\
7109Searches for the separator sep in S, and returns the part before it,\n\
7110the separator itself, and the part after it. If the separator is not\n\
7111found, returns S and two empty strings.");
7112
7113static PyObject*
7114unicode_partition(PyUnicodeObject *self, PyObject *separator)
7115{
7116 return PyUnicode_Partition((PyObject *)self, separator);
7117}
7118
7119PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007120"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007121\n\
7122Searches for the separator sep in S, starting at the end of S, and returns\n\
7123the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007124separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007125
7126static PyObject*
7127unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7128{
7129 return PyUnicode_RPartition((PyObject *)self, separator);
7130}
7131
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007132PyObject *PyUnicode_RSplit(PyObject *s,
7133 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007134 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007135{
7136 PyObject *result;
7137
7138 s = PyUnicode_FromObject(s);
7139 if (s == NULL)
7140 return NULL;
7141 if (sep != NULL) {
7142 sep = PyUnicode_FromObject(sep);
7143 if (sep == NULL) {
7144 Py_DECREF(s);
7145 return NULL;
7146 }
7147 }
7148
7149 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7150
7151 Py_DECREF(s);
7152 Py_XDECREF(sep);
7153 return result;
7154}
7155
7156PyDoc_STRVAR(rsplit__doc__,
7157"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7158\n\
7159Return a list of the words in S, using sep as the\n\
7160delimiter string, starting at the end of the string and\n\
7161working to the front. If maxsplit is given, at most maxsplit\n\
7162splits are done. If sep is not specified, any whitespace string\n\
7163is a separator.");
7164
7165static PyObject*
7166unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7167{
7168 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007169 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007170
Martin v. Löwis18e16552006-02-15 17:27:45 +00007171 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007172 return NULL;
7173
7174 if (substring == Py_None)
7175 return rsplit(self, NULL, maxcount);
7176 else if (PyUnicode_Check(substring))
7177 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7178 else
7179 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7180}
7181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007183"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184\n\
7185Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007186Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188
7189static PyObject*
7190unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7191{
Guido van Rossum86662912000-04-11 15:38:46 +00007192 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193
Guido van Rossum86662912000-04-11 15:38:46 +00007194 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 return NULL;
7196
Guido van Rossum86662912000-04-11 15:38:46 +00007197 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198}
7199
7200static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007201PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007203 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
7204 Py_XINCREF(res);
7205 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206}
7207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007208PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209"S.swapcase() -> unicode\n\
7210\n\
7211Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007212and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007215unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 return fixup(self, fixswapcase);
7218}
7219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007220PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221"S.translate(table) -> unicode\n\
7222\n\
7223Return a copy of the string S, where all characters have been mapped\n\
7224through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007225Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7226Unmapped characters are left untouched. Characters mapped to None\n\
7227are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228
7229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007230unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231{
Tim Petersced69f82003-09-16 20:30:58 +00007232 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007234 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 "ignore");
7236}
7237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239"S.upper() -> unicode\n\
7240\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007241Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242
7243static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007244unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 return fixup(self, fixupper);
7247}
7248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007249PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250"S.zfill(width) -> unicode\n\
7251\n\
7252Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007253of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
7255static PyObject *
7256unicode_zfill(PyUnicodeObject *self, PyObject *args)
7257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 PyUnicodeObject *u;
7260
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t width;
7262 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 return NULL;
7264
7265 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007266 if (PyUnicode_CheckExact(self)) {
7267 Py_INCREF(self);
7268 return (PyObject*) self;
7269 }
7270 else
7271 return PyUnicode_FromUnicode(
7272 PyUnicode_AS_UNICODE(self),
7273 PyUnicode_GET_SIZE(self)
7274 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
7276
7277 fill = width - self->length;
7278
7279 u = pad(self, fill, 0, '0');
7280
Walter Dörwald068325e2002-04-15 13:36:47 +00007281 if (u == NULL)
7282 return NULL;
7283
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 if (u->str[fill] == '+' || u->str[fill] == '-') {
7285 /* move sign to beginning of string */
7286 u->str[0] = u->str[fill];
7287 u->str[fill] = '0';
7288 }
7289
7290 return (PyObject*) u;
7291}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292
#if 0
/* Debugging helper, compiled out: reports unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007301PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007302"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007304Return True if S starts with the specified prefix, False otherwise.\n\
7305With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306With optional end, stop comparing S at that position.\n\
7307prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
7309static PyObject *
7310unicode_startswith(PyUnicodeObject *self,
7311 PyObject *args)
7312{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007315 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007316 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007320 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322 if (PyTuple_Check(subobj)) {
7323 Py_ssize_t i;
7324 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7325 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7326 PyTuple_GET_ITEM(subobj, i));
7327 if (substring == NULL)
7328 return NULL;
7329 result = tailmatch(self, substring, start, end, -1);
7330 Py_DECREF(substring);
7331 if (result) {
7332 Py_RETURN_TRUE;
7333 }
7334 }
7335 /* nothing matched */
7336 Py_RETURN_FALSE;
7337 }
7338 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340 return NULL;
7341 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344}
7345
7346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007347PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007348"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007350Return True if S ends with the specified suffix, False otherwise.\n\
7351With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352With optional end, stop comparing S at that position.\n\
7353suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354
7355static PyObject *
7356unicode_endswith(PyUnicodeObject *self,
7357 PyObject *args)
7358{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007361 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007362 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7366 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368 if (PyTuple_Check(subobj)) {
7369 Py_ssize_t i;
7370 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7371 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7372 PyTuple_GET_ITEM(subobj, i));
7373 if (substring == NULL)
7374 return NULL;
7375 result = tailmatch(self, substring, start, end, +1);
7376 Py_DECREF(substring);
7377 if (result) {
7378 Py_RETURN_TRUE;
7379 }
7380 }
7381 Py_RETURN_FALSE;
7382 }
7383 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
7392
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007393
7394static PyObject *
7395unicode_getnewargs(PyUnicodeObject *v)
7396{
7397 return Py_BuildValue("(u#)", v->str, v->length);
7398}
7399
7400
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401static PyMethodDef unicode_methods[] = {
7402
7403 /* Order is according to common usage: often used methods should
7404 appear first, since lookup is done sequentially. */
7405
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007406 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7407 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7408 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007409 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007410 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7411 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7412 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7413 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7414 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7415 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7416 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007417 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007418 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7419 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7420 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007421 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007422 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007423/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7424 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7425 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7426 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007427 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007428 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007429 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007430 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007431 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7432 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7433 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7434 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7435 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7436 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7437 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7438 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7439 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7440 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7441 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7442 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7443 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7444 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007445 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007446#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007447 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448#endif
7449
7450#if 0
7451 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007452 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453#endif
7454
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007455 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 {NULL, NULL}
7457};
7458
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007459static PyObject *
7460unicode_mod(PyObject *v, PyObject *w)
7461{
7462 if (!PyUnicode_Check(v)) {
7463 Py_INCREF(Py_NotImplemented);
7464 return Py_NotImplemented;
7465 }
7466 return PyUnicode_Format(v, w);
7467}
7468
7469static PyNumberMethods unicode_as_number = {
7470 0, /*nb_add*/
7471 0, /*nb_subtract*/
7472 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007473 unicode_mod, /*nb_remainder*/
7474};
7475
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007477 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007478 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7480 (ssizeargfunc) unicode_getitem, /* sq_item */
7481 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 0, /* sq_ass_item */
7483 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007484 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485};
7486
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007487static PyObject*
7488unicode_subscript(PyUnicodeObject* self, PyObject* item)
7489{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007490 if (PyIndex_Check(item)) {
7491 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007492 if (i == -1 && PyErr_Occurred())
7493 return NULL;
7494 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007495 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007496 return unicode_getitem(self, i);
7497 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007498 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007499 Py_UNICODE* source_buf;
7500 Py_UNICODE* result_buf;
7501 PyObject* result;
7502
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007503 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007504 &start, &stop, &step, &slicelength) < 0) {
7505 return NULL;
7506 }
7507
7508 if (slicelength <= 0) {
7509 return PyUnicode_FromUnicode(NULL, 0);
7510 } else {
7511 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007512 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7513 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007514
7515 if (result_buf == NULL)
7516 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007517
7518 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7519 result_buf[i] = source_buf[cur];
7520 }
Tim Petersced69f82003-09-16 20:30:58 +00007521
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007522 result = PyUnicode_FromUnicode(result_buf, slicelength);
7523 PyMem_FREE(result_buf);
7524 return result;
7525 }
7526 } else {
7527 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7528 return NULL;
7529 }
7530}
7531
7532static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007533 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007534 (binaryfunc)unicode_subscript, /* mp_subscript */
7535 (objobjargproc)0, /* mp_ass_subscript */
7536};
7537
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 const void **ptr)
7542{
7543 if (index != 0) {
7544 PyErr_SetString(PyExc_SystemError,
7545 "accessing non-existent unicode segment");
7546 return -1;
7547 }
7548 *ptr = (void *) self->str;
7549 return PyUnicode_GET_DATA_SIZE(self);
7550}
7551
Martin v. Löwis18e16552006-02-15 17:27:45 +00007552static Py_ssize_t
7553unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 const void **ptr)
7555{
7556 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007557 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 return -1;
7559}
7560
7561static int
7562unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
7565 if (lenp)
7566 *lenp = PyUnicode_GET_DATA_SIZE(self);
7567 return 1;
7568}
7569
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007570static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007572 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 const void **ptr)
7574{
7575 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007576
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 if (index != 0) {
7578 PyErr_SetString(PyExc_SystemError,
7579 "accessing non-existent unicode segment");
7580 return -1;
7581 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007582 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 if (str == NULL)
7584 return -1;
7585 *ptr = (void *) PyString_AS_STRING(str);
7586 return PyString_GET_SIZE(str);
7587}
7588
7589/* Helpers for PyUnicode_Format() */
7590
7591static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007592getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007594 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 if (argidx < arglen) {
7596 (*p_argidx)++;
7597 if (arglen < 0)
7598 return args;
7599 else
7600 return PyTuple_GetItem(args, argidx);
7601 }
7602 PyErr_SetString(PyExc_TypeError,
7603 "not enough arguments for format string");
7604 return NULL;
7605}
7606
/* Bit flags carried in the `flags` argument of the format helpers; each
   occupies its own bit.  (F_ALT selects the '#' form in formatfloat; the
   other names presumably mirror the '-', '+', ' ' and '0' conversion
   flags -- confirm against PyUnicode_Format.) */
#define F_LJUST 0x01    /* bit 0 */
#define F_SIGN  0x02    /* bit 1 */
#define F_BLANK 0x04    /* bit 2 */
#define F_ALT   0x08    /* bit 3 */
#define F_ZERO  0x10    /* bit 4 */
7612
Martin v. Löwis18e16552006-02-15 17:27:45 +00007613static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007614strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007616 register Py_ssize_t i;
7617 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 for (i = len - 1; i >= 0; i--)
7619 buffer[i] = (Py_UNICODE) charbuffer[i];
7620
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 return len;
7622}
7623
Neal Norwitzfc76d632006-01-10 06:03:13 +00007624static int
7625doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7626{
Tim Peters15231542006-02-16 01:08:01 +00007627 Py_ssize_t result;
7628
Neal Norwitzfc76d632006-01-10 06:03:13 +00007629 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007630 result = strtounicode(buffer, (char *)buffer);
7631 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007632}
7633
7634static int
7635longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7636{
Tim Peters15231542006-02-16 01:08:01 +00007637 Py_ssize_t result;
7638
Neal Norwitzfc76d632006-01-10 06:03:13 +00007639 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007640 result = strtounicode(buffer, (char *)buffer);
7641 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007642}
7643
Guido van Rossum078151d2002-08-11 04:24:12 +00007644/* XXX To save some code duplication, formatfloat/long/int could have been
7645 shared with stringobject.c, converting from 8-bit to Unicode after the
7646 formatting is done. */
7647
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648static int
7649formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007650 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651 int flags,
7652 int prec,
7653 int type,
7654 PyObject *v)
7655{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007656 /* fmt = '%#.' + `prec` + `type`
7657 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 char fmt[20];
7659 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007660
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661 x = PyFloat_AsDouble(v);
7662 if (x == -1.0 && PyErr_Occurred())
7663 return -1;
7664 if (prec < 0)
7665 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7667 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007668 /* Worst case length calc to ensure no buffer overrun:
7669
7670 'g' formats:
7671 fmt = %#.<prec>g
7672 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7673 for any double rep.)
7674 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7675
7676 'f' formats:
7677 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7678 len = 1 + 50 + 1 + prec = 52 + prec
7679
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007680 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007681 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007682
7683 */
7684 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7685 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007686 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007687 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007688 return -1;
7689 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007690 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7691 (flags&F_ALT) ? "#" : "",
7692 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007693 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694}
7695
Tim Peters38fd5b62000-09-21 05:43:11 +00007696static PyObject*
7697formatlong(PyObject *val, int flags, int prec, int type)
7698{
7699 char *buf;
7700 int i, len;
7701 PyObject *str; /* temporary string object. */
7702 PyUnicodeObject *result;
7703
7704 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7705 if (!str)
7706 return NULL;
7707 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007708 if (!result) {
7709 Py_DECREF(str);
7710 return NULL;
7711 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007712 for (i = 0; i < len; i++)
7713 result->str[i] = buf[i];
7714 result->str[len] = 0;
7715 Py_DECREF(str);
7716 return (PyObject*)result;
7717}
7718
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719static int
7720formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007721 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 int flags,
7723 int prec,
7724 int type,
7725 PyObject *v)
7726{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007727 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007728 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7729 * + 1 + 1
7730 * = 24
7731 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007732 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007733 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 long x;
7735
7736 x = PyInt_AsLong(v);
7737 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007738 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007739 if (x < 0 && type == 'u') {
7740 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007741 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007742 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7743 sign = "-";
7744 else
7745 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007747 prec = 1;
7748
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007749 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7750 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007751 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007752 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007753 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007754 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007755 return -1;
7756 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007757
7758 if ((flags & F_ALT) &&
7759 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007760 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007761 * of issues that cause pain:
7762 * - when 0 is being converted, the C standard leaves off
7763 * the '0x' or '0X', which is inconsistent with other
7764 * %#x/%#X conversions and inconsistent with Python's
7765 * hex() function
7766 * - there are platforms that violate the standard and
7767 * convert 0 with the '0x' or '0X'
7768 * (Metrowerks, Compaq Tru64)
7769 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007770 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007771 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007772 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007773 * We can achieve the desired consistency by inserting our
7774 * own '0x' or '0X' prefix, and substituting %x/%X in place
7775 * of %#x/%#X.
7776 *
7777 * Note that this is the same approach as used in
7778 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007779 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007780 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7781 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007782 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007783 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007784 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7785 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007786 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007787 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007788 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007789 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007790 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007791 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792}
7793
7794static int
7795formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007796 size_t buflen,
7797 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007799 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007800 if (PyUnicode_Check(v)) {
7801 if (PyUnicode_GET_SIZE(v) != 1)
7802 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007806 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007807 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007808 goto onError;
7809 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811
7812 else {
7813 /* Integer input truncated to a character */
7814 long x;
7815 x = PyInt_AsLong(v);
7816 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007817 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007818#ifdef Py_UNICODE_WIDE
7819 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007820 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007821 "%c arg not in range(0x110000) "
7822 "(wide Python build)");
7823 return -1;
7824 }
7825#else
7826 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007827 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007828 "%c arg not in range(0x10000) "
7829 "(narrow Python build)");
7830 return -1;
7831 }
7832#endif
7833 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 }
7835 buf[1] = '\0';
7836 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007837
7838 onError:
7839 PyErr_SetString(PyExc_TypeError,
7840 "%c requires int or char");
7841 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842}
7843
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007844/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7845
7846 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7847 chars are formatted. XXX This is a magic number. Each formatting
7848 routine does bounds checking to ensure no overflow, but a better
7849 solution may be to malloc a buffer of appropriate size for each
7850 format. For now, the current solution is sufficient.
7851*/
7852#define FORMATBUFLEN (size_t)120
7853
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854PyObject *PyUnicode_Format(PyObject *format,
7855 PyObject *args)
7856{
7857 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007858 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 int args_owned = 0;
7860 PyUnicodeObject *result = NULL;
7861 PyObject *dict = NULL;
7862 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007863
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 if (format == NULL || args == NULL) {
7865 PyErr_BadInternalCall();
7866 return NULL;
7867 }
7868 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007869 if (uformat == NULL)
7870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 fmt = PyUnicode_AS_UNICODE(uformat);
7872 fmtcnt = PyUnicode_GET_SIZE(uformat);
7873
7874 reslen = rescnt = fmtcnt + 100;
7875 result = _PyUnicode_New(reslen);
7876 if (result == NULL)
7877 goto onError;
7878 res = PyUnicode_AS_UNICODE(result);
7879
7880 if (PyTuple_Check(args)) {
7881 arglen = PyTuple_Size(args);
7882 argidx = 0;
7883 }
7884 else {
7885 arglen = -1;
7886 argidx = -2;
7887 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007888 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7889 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 dict = args;
7891
7892 while (--fmtcnt >= 0) {
7893 if (*fmt != '%') {
7894 if (--rescnt < 0) {
7895 rescnt = fmtcnt + 100;
7896 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007897 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007898 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7900 --rescnt;
7901 }
7902 *res++ = *fmt++;
7903 }
7904 else {
7905 /* Got a format specifier */
7906 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007907 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 Py_UNICODE c = '\0';
7910 Py_UNICODE fill;
7911 PyObject *v = NULL;
7912 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007913 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007915 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007916 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917
7918 fmt++;
7919 if (*fmt == '(') {
7920 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007921 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922 PyObject *key;
7923 int pcount = 1;
7924
7925 if (dict == NULL) {
7926 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007927 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928 goto onError;
7929 }
7930 ++fmt;
7931 --fmtcnt;
7932 keystart = fmt;
7933 /* Skip over balanced parentheses */
7934 while (pcount > 0 && --fmtcnt >= 0) {
7935 if (*fmt == ')')
7936 --pcount;
7937 else if (*fmt == '(')
7938 ++pcount;
7939 fmt++;
7940 }
7941 keylen = fmt - keystart - 1;
7942 if (fmtcnt < 0 || pcount > 0) {
7943 PyErr_SetString(PyExc_ValueError,
7944 "incomplete format key");
7945 goto onError;
7946 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007947#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007948 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 then looked up since Python uses strings to hold
7950 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007951 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 key = PyUnicode_EncodeUTF8(keystart,
7953 keylen,
7954 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007955#else
7956 key = PyUnicode_FromUnicode(keystart, keylen);
7957#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 if (key == NULL)
7959 goto onError;
7960 if (args_owned) {
7961 Py_DECREF(args);
7962 args_owned = 0;
7963 }
7964 args = PyObject_GetItem(dict, key);
7965 Py_DECREF(key);
7966 if (args == NULL) {
7967 goto onError;
7968 }
7969 args_owned = 1;
7970 arglen = -1;
7971 argidx = -2;
7972 }
7973 while (--fmtcnt >= 0) {
7974 switch (c = *fmt++) {
7975 case '-': flags |= F_LJUST; continue;
7976 case '+': flags |= F_SIGN; continue;
7977 case ' ': flags |= F_BLANK; continue;
7978 case '#': flags |= F_ALT; continue;
7979 case '0': flags |= F_ZERO; continue;
7980 }
7981 break;
7982 }
7983 if (c == '*') {
7984 v = getnextarg(args, arglen, &argidx);
7985 if (v == NULL)
7986 goto onError;
7987 if (!PyInt_Check(v)) {
7988 PyErr_SetString(PyExc_TypeError,
7989 "* wants int");
7990 goto onError;
7991 }
7992 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007993 if (width == -1 && PyErr_Occurred())
7994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 if (width < 0) {
7996 flags |= F_LJUST;
7997 width = -width;
7998 }
7999 if (--fmtcnt >= 0)
8000 c = *fmt++;
8001 }
8002 else if (c >= '0' && c <= '9') {
8003 width = c - '0';
8004 while (--fmtcnt >= 0) {
8005 c = *fmt++;
8006 if (c < '0' || c > '9')
8007 break;
8008 if ((width*10) / 10 != width) {
8009 PyErr_SetString(PyExc_ValueError,
8010 "width too big");
8011 goto onError;
8012 }
8013 width = width*10 + (c - '0');
8014 }
8015 }
8016 if (c == '.') {
8017 prec = 0;
8018 if (--fmtcnt >= 0)
8019 c = *fmt++;
8020 if (c == '*') {
8021 v = getnextarg(args, arglen, &argidx);
8022 if (v == NULL)
8023 goto onError;
8024 if (!PyInt_Check(v)) {
8025 PyErr_SetString(PyExc_TypeError,
8026 "* wants int");
8027 goto onError;
8028 }
8029 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008030 if (prec == -1 && PyErr_Occurred())
8031 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 if (prec < 0)
8033 prec = 0;
8034 if (--fmtcnt >= 0)
8035 c = *fmt++;
8036 }
8037 else if (c >= '0' && c <= '9') {
8038 prec = c - '0';
8039 while (--fmtcnt >= 0) {
8040 c = Py_CHARMASK(*fmt++);
8041 if (c < '0' || c > '9')
8042 break;
8043 if ((prec*10) / 10 != prec) {
8044 PyErr_SetString(PyExc_ValueError,
8045 "prec too big");
8046 goto onError;
8047 }
8048 prec = prec*10 + (c - '0');
8049 }
8050 }
8051 } /* prec */
8052 if (fmtcnt >= 0) {
8053 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 if (--fmtcnt >= 0)
8055 c = *fmt++;
8056 }
8057 }
8058 if (fmtcnt < 0) {
8059 PyErr_SetString(PyExc_ValueError,
8060 "incomplete format");
8061 goto onError;
8062 }
8063 if (c != '%') {
8064 v = getnextarg(args, arglen, &argidx);
8065 if (v == NULL)
8066 goto onError;
8067 }
8068 sign = 0;
8069 fill = ' ';
8070 switch (c) {
8071
8072 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008073 pbuf = formatbuf;
8074 /* presume that buffer length is at least 1 */
8075 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 len = 1;
8077 break;
8078
8079 case 's':
8080 case 'r':
8081 if (PyUnicode_Check(v) && c == 's') {
8082 temp = v;
8083 Py_INCREF(temp);
8084 }
8085 else {
8086 PyObject *unicode;
8087 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008088 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 else
8090 temp = PyObject_Repr(v);
8091 if (temp == NULL)
8092 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008093 if (PyUnicode_Check(temp))
8094 /* nothing to do */;
8095 else if (PyString_Check(temp)) {
8096 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008097 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008099 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008101 Py_DECREF(temp);
8102 temp = unicode;
8103 if (temp == NULL)
8104 goto onError;
8105 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008106 else {
8107 Py_DECREF(temp);
8108 PyErr_SetString(PyExc_TypeError,
8109 "%s argument has non-string str()");
8110 goto onError;
8111 }
8112 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008113 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 len = PyUnicode_GET_SIZE(temp);
8115 if (prec >= 0 && len > prec)
8116 len = prec;
8117 break;
8118
8119 case 'i':
8120 case 'd':
8121 case 'u':
8122 case 'o':
8123 case 'x':
8124 case 'X':
8125 if (c == 'i')
8126 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008127 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008128 temp = formatlong(v, flags, prec, c);
8129 if (!temp)
8130 goto onError;
8131 pbuf = PyUnicode_AS_UNICODE(temp);
8132 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008133 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008135 else {
8136 pbuf = formatbuf;
8137 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8138 flags, prec, c, v);
8139 if (len < 0)
8140 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008141 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008142 }
8143 if (flags & F_ZERO)
8144 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 break;
8146
8147 case 'e':
8148 case 'E':
8149 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008150 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 case 'g':
8152 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008153 if (c == 'F')
8154 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008155 pbuf = formatbuf;
8156 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8157 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 if (len < 0)
8159 goto onError;
8160 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008161 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 fill = '0';
8163 break;
8164
8165 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008166 pbuf = formatbuf;
8167 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 if (len < 0)
8169 goto onError;
8170 break;
8171
8172 default:
8173 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008174 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008175 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008176 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008177 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008178 (Py_ssize_t)(fmt - 1 -
8179 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 goto onError;
8181 }
8182 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008183 if (*pbuf == '-' || *pbuf == '+') {
8184 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 len--;
8186 }
8187 else if (flags & F_SIGN)
8188 sign = '+';
8189 else if (flags & F_BLANK)
8190 sign = ' ';
8191 else
8192 sign = 0;
8193 }
8194 if (width < len)
8195 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008196 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 reslen -= rescnt;
8198 rescnt = width + fmtcnt + 100;
8199 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008200 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008201 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008202 PyErr_NoMemory();
8203 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008204 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008205 if (_PyUnicode_Resize(&result, reslen) < 0) {
8206 Py_XDECREF(temp);
8207 goto onError;
8208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 res = PyUnicode_AS_UNICODE(result)
8210 + reslen - rescnt;
8211 }
8212 if (sign) {
8213 if (fill != ' ')
8214 *res++ = sign;
8215 rescnt--;
8216 if (width > len)
8217 width--;
8218 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008219 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8220 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008221 assert(pbuf[1] == c);
8222 if (fill != ' ') {
8223 *res++ = *pbuf++;
8224 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008225 }
Tim Petersfff53252001-04-12 18:38:48 +00008226 rescnt -= 2;
8227 width -= 2;
8228 if (width < 0)
8229 width = 0;
8230 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 if (width > len && !(flags & F_LJUST)) {
8233 do {
8234 --rescnt;
8235 *res++ = fill;
8236 } while (--width > len);
8237 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008238 if (fill == ' ') {
8239 if (sign)
8240 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008241 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008242 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008243 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008244 *res++ = *pbuf++;
8245 *res++ = *pbuf++;
8246 }
8247 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008248 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 res += len;
8250 rescnt -= len;
8251 while (--width >= len) {
8252 --rescnt;
8253 *res++ = ' ';
8254 }
8255 if (dict && (argidx < arglen) && c != '%') {
8256 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008257 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008258 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 goto onError;
8260 }
8261 Py_XDECREF(temp);
8262 } /* '%' */
8263 } /* until end */
8264 if (argidx < arglen && !dict) {
8265 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008266 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 goto onError;
8268 }
8269
Thomas Woutersa96affe2006-03-12 00:29:36 +00008270 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 if (args_owned) {
8273 Py_DECREF(args);
8274 }
8275 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 return (PyObject *)result;
8277
8278 onError:
8279 Py_XDECREF(result);
8280 Py_DECREF(uformat);
8281 if (args_owned) {
8282 Py_DECREF(args);
8283 }
8284 return NULL;
8285}
8286
/* Buffer-interface slot table for Unicode objects; installed as
   tp_as_buffer in PyUnicode_Type.  Each entry is one of this file's
   unicode_buffer_* functions cast to the matching slot type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8293
Jeremy Hylton938ace62002-07-17 16:30:39 +00008294static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008295unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8296
Tim Peters6d6c1a32001-08-02 04:15:00 +00008297static PyObject *
8298unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8299{
8300 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008301 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008302 char *encoding = NULL;
8303 char *errors = NULL;
8304
Guido van Rossume023fe02001-08-30 03:12:59 +00008305 if (type != &PyUnicode_Type)
8306 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008307 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8308 kwlist, &x, &encoding, &errors))
8309 return NULL;
8310 if (x == NULL)
8311 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008312 if (encoding == NULL && errors == NULL)
8313 return PyObject_Unicode(x);
8314 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008315 return PyUnicode_FromEncodedObject(x, encoding, errors);
8316}
8317
Guido van Rossume023fe02001-08-30 03:12:59 +00008318static PyObject *
8319unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8320{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008321 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008322 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008323
8324 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8325 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8326 if (tmp == NULL)
8327 return NULL;
8328 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008329 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008330 if (pnew == NULL) {
8331 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008332 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008333 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008334 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8335 if (pnew->str == NULL) {
8336 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008337 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008338 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008339 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008340 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008341 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8342 pnew->length = n;
8343 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008344 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008345 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008346}
8347
/* Doc string installed as tp_doc of PyUnicode_Type.
   NOTE(review): the text says "unicode(...)" while tp_name below is
   "str" -- confirm which spelling is intended. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");

/* Forward declaration: referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for Unicode strings.  Slots set to 0 are left unset here;
   the type is finalized with PyType_Ready() in _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8401
8402/* Initialize the Unicode implementation */
8403
Thomas Wouters78890102000-07-22 19:25:51 +00008404void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008406 int i;
8407
Thomas Wouters477c8d52006-05-27 19:21:47 +00008408 /* XXX - move this array to unicodectype.c ? */
8409 Py_UNICODE linebreak[] = {
8410 0x000A, /* LINE FEED */
8411 0x000D, /* CARRIAGE RETURN */
8412 0x001C, /* FILE SEPARATOR */
8413 0x001D, /* GROUP SEPARATOR */
8414 0x001E, /* RECORD SEPARATOR */
8415 0x0085, /* NEXT LINE */
8416 0x2028, /* LINE SEPARATOR */
8417 0x2029, /* PARAGRAPH SEPARATOR */
8418 };
8419
Fred Drakee4315f52000-05-09 19:53:39 +00008420 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008421 unicode_freelist = NULL;
8422 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008424 if (!unicode_empty)
8425 return;
8426
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008427 for (i = 0; i < 256; i++)
8428 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008429 if (PyType_Ready(&PyUnicode_Type) < 0)
8430 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008431
8432 /* initialize the linebreak bloom filter */
8433 bloom_linebreak = make_bloom_mask(
8434 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8435 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008436
8437 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438}
8439
8440/* Finalize the Unicode implementation */
8441
8442void
Thomas Wouters78890102000-07-22 19:25:51 +00008443_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008445 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008446 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008448 Py_XDECREF(unicode_empty);
8449 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008450
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008451 for (i = 0; i < 256; i++) {
8452 if (unicode_latin1[i]) {
8453 Py_DECREF(unicode_latin1[i]);
8454 unicode_latin1[i] = NULL;
8455 }
8456 }
8457
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008458 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 PyUnicodeObject *v = u;
8460 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008461 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008462 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008463 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008464 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008466 unicode_freelist = NULL;
8467 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008469
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008470
8471
8472/********************* Unicode Iterator **************************/
8473
/* State of an iterator over a Unicode object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
8479
/* Destructor for unicodeiterobject: untrack the iterator from the
   garbage collector, drop its reference to the underlying string
   (it_seq may already be NULL), then free the iterator itself. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
8487
/* GC traversal for unicodeiterobject: the only object reference held by
   the iterator is it_seq.  Returns 0 unless Py_VISIT returns early. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
8494
8495static PyObject *
8496unicodeiter_next(unicodeiterobject *it)
8497{
8498 PyUnicodeObject *seq;
8499 PyObject *item;
8500
8501 assert(it != NULL);
8502 seq = it->it_seq;
8503 if (seq == NULL)
8504 return NULL;
8505 assert(PyUnicode_Check(seq));
8506
8507 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008508 item = PyUnicode_FromUnicode(
8509 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008510 if (item != NULL)
8511 ++it->it_index;
8512 return item;
8513 }
8514
8515 Py_DECREF(seq);
8516 it->it_seq = NULL;
8517 return NULL;
8518}
8519
8520static PyObject *
8521unicodeiter_len(unicodeiterobject *it)
8522{
8523 Py_ssize_t len = 0;
8524 if (it->it_seq)
8525 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8526 return PyInt_FromSsize_t(len);
8527}
8528
8529PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8530
8531static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008532 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8533 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008534 {NULL, NULL} /* sentinel */
8535};
8536
8537PyTypeObject PyUnicodeIter_Type = {
8538 PyObject_HEAD_INIT(&PyType_Type)
8539 0, /* ob_size */
8540 "unicodeiterator", /* tp_name */
8541 sizeof(unicodeiterobject), /* tp_basicsize */
8542 0, /* tp_itemsize */
8543 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008544 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008545 0, /* tp_print */
8546 0, /* tp_getattr */
8547 0, /* tp_setattr */
8548 0, /* tp_compare */
8549 0, /* tp_repr */
8550 0, /* tp_as_number */
8551 0, /* tp_as_sequence */
8552 0, /* tp_as_mapping */
8553 0, /* tp_hash */
8554 0, /* tp_call */
8555 0, /* tp_str */
8556 PyObject_GenericGetAttr, /* tp_getattro */
8557 0, /* tp_setattro */
8558 0, /* tp_as_buffer */
8559 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8560 0, /* tp_doc */
8561 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8562 0, /* tp_clear */
8563 0, /* tp_richcompare */
8564 0, /* tp_weaklistoffset */
8565 PyObject_SelfIter, /* tp_iter */
8566 (iternextfunc)unicodeiter_next, /* tp_iternext */
8567 unicodeiter_methods, /* tp_methods */
8568 0,
8569};
8570
8571static PyObject *
8572unicode_iter(PyObject *seq)
8573{
8574 unicodeiterobject *it;
8575
8576 if (!PyUnicode_Check(seq)) {
8577 PyErr_BadInternalCall();
8578 return NULL;
8579 }
8580 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8581 if (it == NULL)
8582 return NULL;
8583 it->it_index = 0;
8584 Py_INCREF(seq);
8585 it->it_seq = (PyUnicodeObject *)seq;
8586 _PyObject_GC_TRACK(it);
8587 return (PyObject *)it;
8588}
8589
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008590#ifdef __cplusplus
8591}
8592#endif
8593
8594
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/