blob: 930d58c803c7f079c7925bd118aa3e456c651581 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127): the entry is 1
   for the whitespace characters named in the comments below and 0 for
   everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127): the entry is 1
   for the line-breaking characters named in the comments below and 0
   for everything else.  The table is only ever read (see
   BLOOM_LINEBREAK), hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Non-zero when chr is a member of set[0..setlen): the cheap BLOOM()
   test rejects most non-members before the exact unicode_member() scan.
   The expansion is fully parenthesized so that it behaves as a single
   operand (e.g. under '!') at the point of use. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000297 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000300 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Andrew Dalkee0df7622006-05-27 11:04:36 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
Neal Norwitze7d8be82008-07-31 17:17:14 +0000315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
318 }
319
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000329 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
332 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000333 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000334 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000337 }
338 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 }
340 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000341 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 if (unicode == NULL)
344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 }
348
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000349 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000350 PyErr_NoMemory();
351 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000352 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000353 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
359 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000360 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000362 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000378 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000380 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
385 }
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
389 }
390 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 }
395 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400}
401
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000402static
403int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404{
405 register PyUnicodeObject *v;
406
407 /* Argument checks */
408 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 PyErr_BadInternalCall();
410 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000412 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000414 PyErr_BadInternalCall();
415 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000421 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 }
432
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
436}
437
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000438int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
439{
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000456 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000457
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
468 }
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
471 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
Tim Petersced69f82003-09-16 20:30:58 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
477
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000480 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481
482 return (PyObject *)unicode;
483}
484
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000485PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
486{
487 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000488
Benjamin Peterson857ce152009-01-31 16:29:18 +0000489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000491 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 return NULL;
493 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000494
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
500
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000505 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000506
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
517 }
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
520 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000521
522 return PyUnicode_DecodeUTF8(u, size, NULL);
523 }
524
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
528
529 return (PyObject *)unicode;
530}
531
532PyObject *PyUnicode_FromString(const char *u)
533{
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
538 }
539
540 return PyUnicode_FromStringAndSize(u, size);
541}
542
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543#ifdef HAVE_WCHAR_H
544
Mark Dickinson6b265f12009-03-18 16:07:26 +0000545#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546# define CONVERT_WCHAR_TO_SURROGATES
547#endif
548
549#ifdef CONVERT_WCHAR_TO_SURROGATES
550
551/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
553
554PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
556{
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
561
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
573 }
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
578
579 /* Copy the wchar_t data into the new object */
580 {
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
589 }
590 else
591 *u++ = *w++;
592 }
593 }
594 return (PyObject *)unicode;
595}
596
597#else
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000600 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601{
602 PyUnicodeObject *unicode;
603
604 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 PyErr_BadInternalCall();
606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 }
608
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614#ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000616#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624#endif
625
626 return (PyObject *)unicode;
627}
628
Mark Dickinson6b265f12009-03-18 16:07:26 +0000629#endif /* CONVERT_WCHAR_TO_SURROGATES */
630
631#undef CONVERT_WCHAR_TO_SURROGATES
632
/* Build a printf-style conversion specification in fmt:

       '%' ['0'] [width] ['.' precision] ['l' | PY_FORMAT_SIZE_T] c

   The '0' flag is only written together with a non-zero width; a zero
   width or precision is omitted.  longflag takes precedence over
   size_tflag.  The result is NUL-terminated.  The caller must supply a
   buffer large enough for the whole specification. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        /* The length modifier is a string literal: read it via const. */
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}
654
/* Append the NUL-terminated byte string `string` to the output, one
   character per output slot.  Relies on two variables in the caller's
   scope: `copy` (const char * scratch cursor) and `s` (output cursor,
   advanced past the appended characters). */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}
656
657PyObject *
658PyUnicode_FromFormatV(const char *format, va_list vargs)
659{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000680
681#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000682 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000683#else
684#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000687 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000688#endif
689#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000697 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
703 if (*f == 's')
704 ++callcount;
705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000706 }
707 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
714 }
715 callresult = callresults;
716 }
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
729 */
730 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
755 {
756 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000757 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000764 break;
765 }
766 case 'U':
767 {
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
772 }
773 case 'V':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
784 }
785 case 'S':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
797 }
798 case 'R':
799 {
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
810 }
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
817 */
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
829 }
830 } else
831 n++;
832 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000833 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
839 }
840 realbuffer = abuffer;
841 }
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000851
Benjamin Peterson857ce152009-01-31 16:29:18 +0000852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000854
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
870 }
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
876 }
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
881 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000882
Benjamin Peterson857ce152009-01-31 16:29:18 +0000883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
918 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000928 break;
929 }
930 case 'U':
931 {
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
937 }
938 case 'V':
939 {
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
948 }
949 break;
950 }
951 case 'S':
952 case 'R':
953 {
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
968 }
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
978 }
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
987 }
988 } else
989 *s++ = *f;
990 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000992 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1005 }
1006 PyObject_Free(callresults);
1007 }
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011}
1012
1013#undef appendstring
1014
1015PyObject *
1016PyUnicode_FromFormat(const char *format, ...)
1017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 PyObject* ret;
1019 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001020
1021#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001025#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001029}
1030
Martin v. Löwis18e16552006-02-15 17:27:45 +00001031Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001032 wchar_t *w,
1033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyErr_BadInternalCall();
1037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039
1040 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044#ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046#else
1047 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 }
1054#endif
1055
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060}
1061
1062#endif
1063
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064PyObject *PyUnicode_FromOrdinal(int ordinal)
1065{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001066 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067
1068#ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074 }
1075#else
1076 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#endif
1083
Hye-Shik Chang40574832004-04-06 07:24:51 +00001084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001086}
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_FromObject(register PyObject *obj)
1089{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001092 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 Py_INCREF(obj);
1094 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 }
1096 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103}
1104
1105PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 const char *encoding,
1107 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001110 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 PyErr_BadInternalCall();
1115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118#if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001123
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001126
1127 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 return PyObject_Unicode(obj);
1135 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136#else
1137 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#endif
1143
1144 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001145 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 }
Christian Heimes3497f942008-05-26 12:29:14 +00001149 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 }
Tim Petersced69f82003-09-16 20:30:58 +00001165
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001173
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 return v;
1175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184{
1185 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186
1187 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001189
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001211 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245 return NULL;
1246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252{
1253 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1261}
1262
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1266{
1267 PyObject *v;
1268
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1272 }
1273
1274 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001283 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284 return NULL;
1285}
1286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
Fred Drakee4315f52000-05-09 19:53:39 +00001297
Tim Petersced69f82003-09-16 20:30:58 +00001298 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001300
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001307#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001319 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001321 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001322 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 Py_DECREF(v);
1324 goto onError;
1325 }
1326 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 return NULL;
1330}
1331
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001333 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001334{
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_AS_UNICODE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return NULL;
1355}
1356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358{
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1362 }
1363 return PyUnicode_GET_SIZE(unicode);
1364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 return -1;
1367}
1368
Thomas Wouters78890102000-07-22 19:25:51 +00001369const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001370{
1371 return unicode_default_encoding;
1372}
1373
1374int PyUnicode_SetDefaultEncoding(const char *encoding)
1375{
1376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001377
Fred Drakee4315f52000-05-09 19:53:39 +00001378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 encoding,
1386 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001387 return 0;
1388
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return -1;
1391}
1392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393/* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001395 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 and adjust various state variables.
1397 return 0 on success, -1 on error
1398*/
1399
1400static
1401int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 int res = -1;
1417
1418 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 }
1423
1424 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001425 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 }
1430 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 }
1438
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 }
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001450 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 }
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_XDECREF(restuple);
1479 return res;
1480}
1481
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482/* --- UTF-7 Codec -------------------------------------------------------- */
1483
Antoine Pitrou653dece2009-05-04 18:32:32 +00001484/* See RFC2152 for details. We encode conservatively and decode liberally. */
1485
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The test uses explicit ASCII ranges rather than isalnum(): isalnum()
 * depends on the current locale (it may accept non-ASCII bytes, which
 * FROM_BASE64 would then silently map to 63) and its behaviour is
 * undefined for values that do not fit in an unsigned char.
 * Note that c is evaluated more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * out of bounds; NUL and anything >= 128 are never encoded directly.
 * directO and directWS are parenthesized so that compound flag
 * expressions (e.g. `a || b`) bind as a single condition.
 * Note that c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001561 Py_ssize_t size,
1562 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565}
1566
Antoine Pitrou653dece2009-05-04 18:32:32 +00001567/* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
1573
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001574PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
1587 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001605 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 e = s + size;
1607
1608 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610
Antoine Pitrou653dece2009-05-04 18:32:32 +00001611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625#ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628#else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631#endif
1632 surrogate = 0;
1633 }
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1638 }
1639 }
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1643 }
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1647 }
1648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001851 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001852 return v;
1853}
1854
Antoine Pitrou653dece2009-05-04 18:32:32 +00001855#undef IS_BASE64
1856#undef FROM_BASE64
1857#undef TO_BASE64
1858#undef DECODE_DIRECT
1859#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860
/* --- UTF-8 Codec -------------------------------------------------------- */
1862
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six; 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1884
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001886 Py_ssize_t size,
1887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888{
Walter Dörwald69652032004-09-07 20:24:22 +00001889 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1890}
1891
1892PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001893 Py_ssize_t size,
1894 const char *errors,
1895 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001896{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001897 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001899 Py_ssize_t startinpos;
1900 Py_ssize_t endinpos;
1901 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 const char *e;
1903 PyUnicodeObject *unicode;
1904 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001905 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001906 PyObject *errorHandler = NULL;
1907 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908
1909 /* Note: size will always be longer than the resulting Unicode
1910 character count */
1911 unicode = _PyUnicode_New(size);
1912 if (!unicode)
1913 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001914 if (size == 0) {
1915 if (consumed)
1916 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
1920 /* Unpack UTF-8 encoded data */
1921 p = unicode->str;
1922 e = s + size;
1923
1924 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001925 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926
1927 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001928 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 s++;
1930 continue;
1931 }
1932
1933 n = utf8_code_length[ch];
1934
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001935 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001936 if (consumed)
1937 break;
1938 else {
1939 errmsg = "unexpected end of data";
1940 startinpos = s-starts;
1941 endinpos = size;
1942 goto utf8Error;
1943 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
1946 switch (n) {
1947
1948 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001949 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001950 startinpos = s-starts;
1951 endinpos = startinpos+1;
1952 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953
1954 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001956 startinpos = s-starts;
1957 endinpos = startinpos+1;
1958 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959
1960 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001961 if ((s[1] & 0xc0) != 0x80) {
1962 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001963 startinpos = s-starts;
1964 endinpos = startinpos+2;
1965 goto utf8Error;
1966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001968 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 startinpos = s-starts;
1970 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001971 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001972 goto utf8Error;
1973 }
1974 else
1975 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 break;
1977
1978 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001979 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001980 (s[2] & 0xc0) != 0x80) {
1981 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001982 startinpos = s-starts;
1983 endinpos = startinpos+3;
1984 goto utf8Error;
1985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001987 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001988 /* Note: UTF-8 encodings of surrogates are considered
1989 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001990
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001991 XXX For wide builds (UCS-4) we should probably try
1992 to recombine the surrogates into a single code
1993 unit.
1994 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001996 startinpos = s-starts;
1997 endinpos = startinpos+3;
1998 goto utf8Error;
1999 }
2000 else
2001 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002002 break;
2003
2004 case 4:
2005 if ((s[1] & 0xc0) != 0x80 ||
2006 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002007 (s[3] & 0xc0) != 0x80) {
2008 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002009 startinpos = s-starts;
2010 endinpos = startinpos+4;
2011 goto utf8Error;
2012 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002013 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002015 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002016 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002017 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002018 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002019 UTF-16 */
2020 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002021 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002022 startinpos = s-starts;
2023 endinpos = startinpos+4;
2024 goto utf8Error;
2025 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002026#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002027 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002028#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002030
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002031 /* translate from 10000..10FFFF to 0..FFFF */
2032 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002033
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002034 /* high surrogate = top 10 bits added to D800 */
2035 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002036
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002037 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002038 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002039#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 break;
2041
2042 default:
2043 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002044 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002045 startinpos = s-starts;
2046 endinpos = startinpos+n;
2047 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
2049 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002051
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002052 utf8Error:
2053 outpos = p-PyUnicode_AS_UNICODE(unicode);
2054 if (unicode_decode_call_errorhandler(
2055 errors, &errorHandler,
2056 "utf8", errmsg,
2057 starts, size, &startinpos, &endinpos, &exc, &s,
2058 &unicode, &outpos, &p))
2059 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 }
Walter Dörwald69652032004-09-07 20:24:22 +00002061 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002062 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
2064 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002065 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 goto onError;
2067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068 Py_XDECREF(errorHandler);
2069 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 return (PyObject *)unicode;
2071
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002072 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002073 Py_XDECREF(errorHandler);
2074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 Py_DECREF(unicode);
2076 return NULL;
2077}
2078
Tim Peters602f7402002-04-27 18:03:26 +00002079/* Allocation strategy: if the string is short, convert into a stack buffer
2080 and allocate exactly as much space needed at the end. Else allocate the
2081 maximum possible needed (4 result bytes per Unicode character), and return
2082 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002083*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002084PyObject *
2085PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002086 Py_ssize_t size,
2087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088{
Tim Peters602f7402002-04-27 18:03:26 +00002089#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002090
Martin v. Löwis18e16552006-02-15 17:27:45 +00002091 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002092 PyObject *v; /* result string object */
2093 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002094 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002095 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002096 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002097
Tim Peters602f7402002-04-27 18:03:26 +00002098 assert(s != NULL);
2099 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100
Tim Peters602f7402002-04-27 18:03:26 +00002101 if (size <= MAX_SHORT_UNICHARS) {
2102 /* Write into the stack buffer; nallocated can't overflow.
2103 * At the end, we'll allocate exactly as much heap space as it
2104 * turns out we need.
2105 */
2106 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2107 v = NULL; /* will allocate after we're done */
2108 p = stackbuf;
2109 }
2110 else {
2111 /* Overallocate on the heap, and give the excess back at the end. */
2112 nallocated = size * 4;
2113 if (nallocated / 4 != size) /* overflow! */
2114 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002115 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002116 if (v == NULL)
2117 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002118 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002119 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002120
Tim Peters602f7402002-04-27 18:03:26 +00002121 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002122 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002123
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002124 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002125 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002129 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002130 *p++ = (char)(0xc0 | (ch >> 6));
2131 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002132 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002133 else {
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Encode UCS2 Unicode ordinals */
2135 if (ch < 0x10000) {
2136 /* Special case: check for high surrogate */
2137 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2138 Py_UCS4 ch2 = s[i];
2139 /* Check for low surrogate and combine the two to
2140 form a UCS4 value */
2141 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002142 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002143 i++;
2144 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002145 }
Tim Peters602f7402002-04-27 18:03:26 +00002146 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002147 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002148 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002149 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2150 *p++ = (char)(0x80 | (ch & 0x3f));
2151 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002152 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002153 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002154 /* Encode UCS4 Unicode ordinals */
2155 *p++ = (char)(0xf0 | (ch >> 18));
2156 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2157 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2158 *p++ = (char)(0x80 | (ch & 0x3f));
2159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002161
Tim Peters602f7402002-04-27 18:03:26 +00002162 if (v == NULL) {
2163 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002164 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002166 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002167 }
2168 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002169 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002170 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002171 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002172 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002175
Tim Peters602f7402002-04-27 18:03:26 +00002176#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177}
2178
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2180{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 if (!PyUnicode_Check(unicode)) {
2182 PyErr_BadArgument();
2183 return NULL;
2184 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002185 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002186 PyUnicode_GET_SIZE(unicode),
2187 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188}
2189
/* --- UTF-32 Codec ------------------------------------------------------- */
2191
2192PyObject *
2193PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002194 Py_ssize_t size,
2195 const char *errors,
2196 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002197{
2198 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2199}
2200
2201PyObject *
2202PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002203 Py_ssize_t size,
2204 const char *errors,
2205 int *byteorder,
2206 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002207{
2208 const char *starts = s;
2209 Py_ssize_t startinpos;
2210 Py_ssize_t endinpos;
2211 Py_ssize_t outpos;
2212 PyUnicodeObject *unicode;
2213 Py_UNICODE *p;
2214#ifndef Py_UNICODE_WIDE
2215 int i, pairs;
2216#else
2217 const int pairs = 0;
2218#endif
2219 const unsigned char *q, *e;
2220 int bo = 0; /* assume native ordering by default */
2221 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002222 /* Offsets from q for retrieving bytes in the right order. */
2223#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2224 int iorder[] = {0, 1, 2, 3};
2225#else
2226 int iorder[] = {3, 2, 1, 0};
2227#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002228 PyObject *errorHandler = NULL;
2229 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002230 /* On narrow builds we split characters outside the BMP into two
2231 codepoints => count how much extra space we need. */
2232#ifndef Py_UNICODE_WIDE
2233 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002234 if (((Py_UCS4 *)s)[i] >= 0x10000)
2235 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002236#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002237
2238 /* This might be one to much, because of a BOM */
2239 unicode = _PyUnicode_New((size+3)/4+pairs);
2240 if (!unicode)
2241 return NULL;
2242 if (size == 0)
2243 return (PyObject *)unicode;
2244
2245 /* Unpack UTF-32 encoded data */
2246 p = unicode->str;
2247 q = (unsigned char *)s;
2248 e = q + size;
2249
2250 if (byteorder)
2251 bo = *byteorder;
2252
2253 /* Check for BOM marks (U+FEFF) in the input and adjust current
2254 byte order setting accordingly. In native mode, the leading BOM
2255 mark is skipped, in all other modes, it is copied to the output
2256 stream as-is (giving a ZWNBSP character). */
2257 if (bo == 0) {
2258 if (size >= 4) {
2259 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002261#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002262 if (bom == 0x0000FEFF) {
2263 q += 4;
2264 bo = -1;
2265 }
2266 else if (bom == 0xFFFE0000) {
2267 q += 4;
2268 bo = 1;
2269 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002270#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002271 if (bom == 0x0000FEFF) {
2272 q += 4;
2273 bo = 1;
2274 }
2275 else if (bom == 0xFFFE0000) {
2276 q += 4;
2277 bo = -1;
2278 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002279#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002280 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002281 }
2282
2283 if (bo == -1) {
2284 /* force LE */
2285 iorder[0] = 0;
2286 iorder[1] = 1;
2287 iorder[2] = 2;
2288 iorder[3] = 3;
2289 }
2290 else if (bo == 1) {
2291 /* force BE */
2292 iorder[0] = 3;
2293 iorder[1] = 2;
2294 iorder[2] = 1;
2295 iorder[3] = 0;
2296 }
2297
2298 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002299 Py_UCS4 ch;
2300 /* remaining bytes at the end? (size should be divisible by 4) */
2301 if (e-q<4) {
2302 if (consumed)
2303 break;
2304 errmsg = "truncated data";
2305 startinpos = ((const char *)q)-starts;
2306 endinpos = ((const char *)e)-starts;
2307 goto utf32Error;
2308 /* The remaining input chars are ignored if the callback
2309 chooses to skip the input */
2310 }
2311 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2312 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002313
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 if (ch >= 0x110000)
2315 {
2316 errmsg = "codepoint not in range(0x110000)";
2317 startinpos = ((const char *)q)-starts;
2318 endinpos = startinpos+4;
2319 goto utf32Error;
2320 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002321#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002322 if (ch >= 0x10000)
2323 {
2324 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2325 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2326 }
2327 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002328#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002329 *p++ = ch;
2330 q += 4;
2331 continue;
2332 utf32Error:
2333 outpos = p-PyUnicode_AS_UNICODE(unicode);
2334 if (unicode_decode_call_errorhandler(
2335 errors, &errorHandler,
2336 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002337 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002338 &unicode, &outpos, &p))
2339 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002340 }
2341
2342 if (byteorder)
2343 *byteorder = bo;
2344
2345 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002346 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002347
2348 /* Adjust length */
2349 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2350 goto onError;
2351
2352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
2354 return (PyObject *)unicode;
2355
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002356 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002357 Py_DECREF(unicode);
2358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
2360 return NULL;
2361}
2362
2363PyObject *
2364PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002365 Py_ssize_t size,
2366 const char *errors,
2367 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002368{
2369 PyObject *v;
2370 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002371 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002373 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002374#else
2375 const int pairs = 0;
2376#endif
2377 /* Offsets from p for storing byte pairs in the right order. */
2378#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2379 int iorder[] = {0, 1, 2, 3};
2380#else
2381 int iorder[] = {3, 2, 1, 0};
2382#endif
2383
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002384#define STORECHAR(CH) \
2385 do { \
2386 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2387 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2388 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2389 p[iorder[0]] = (CH) & 0xff; \
2390 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002391 } while(0)
2392
2393 /* In narrow builds we can output surrogate pairs as one codepoint,
2394 so we need less space. */
2395#ifndef Py_UNICODE_WIDE
2396 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002397 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2398 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2399 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002400#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002401 nsize = (size - pairs + (byteorder == 0));
2402 bytesize = nsize * 4;
2403 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002404 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002405 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002406 if (v == NULL)
2407 return NULL;
2408
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002409 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002411 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002412 if (size == 0)
2413 return v;
2414
2415 if (byteorder == -1) {
2416 /* force LE */
2417 iorder[0] = 0;
2418 iorder[1] = 1;
2419 iorder[2] = 2;
2420 iorder[3] = 3;
2421 }
2422 else if (byteorder == 1) {
2423 /* force BE */
2424 iorder[0] = 3;
2425 iorder[1] = 2;
2426 iorder[2] = 1;
2427 iorder[3] = 0;
2428 }
2429
2430 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002431 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002432#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002433 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2434 Py_UCS4 ch2 = *s;
2435 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2436 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2437 s++;
2438 size--;
2439 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002440 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002441#endif
2442 STORECHAR(ch);
2443 }
2444 return v;
2445#undef STORECHAR
2446}
2447
2448PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2449{
2450 if (!PyUnicode_Check(unicode)) {
2451 PyErr_BadArgument();
2452 return NULL;
2453 }
2454 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002455 PyUnicode_GET_SIZE(unicode),
2456 NULL,
2457 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002458}
2459
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460/* --- UTF-16 Codec ------------------------------------------------------- */
2461
Tim Peters772747b2001-08-09 22:21:55 +00002462PyObject *
2463PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002464 Py_ssize_t size,
2465 const char *errors,
2466 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467{
Walter Dörwald69652032004-09-07 20:24:22 +00002468 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2469}
2470
2471PyObject *
2472PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002473 Py_ssize_t size,
2474 const char *errors,
2475 int *byteorder,
2476 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002479 Py_ssize_t startinpos;
2480 Py_ssize_t endinpos;
2481 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 PyUnicodeObject *unicode;
2483 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002484 const unsigned char *q, *e;
2485 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002486 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002487 /* Offsets from q for retrieving byte pairs in the right order. */
2488#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2489 int ihi = 1, ilo = 0;
2490#else
2491 int ihi = 0, ilo = 1;
2492#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493 PyObject *errorHandler = NULL;
2494 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495
2496 /* Note: size will always be longer than the resulting Unicode
2497 character count */
2498 unicode = _PyUnicode_New(size);
2499 if (!unicode)
2500 return NULL;
2501 if (size == 0)
2502 return (PyObject *)unicode;
2503
2504 /* Unpack UTF-16 encoded data */
2505 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002506 q = (unsigned char *)s;
2507 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
2509 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002510 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002512 /* Check for BOM marks (U+FEFF) in the input and adjust current
2513 byte order setting accordingly. In native mode, the leading BOM
2514 mark is skipped, in all other modes, it is copied to the output
2515 stream as-is (giving a ZWNBSP character). */
2516 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002517 if (size >= 2) {
2518 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002519#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002520 if (bom == 0xFEFF) {
2521 q += 2;
2522 bo = -1;
2523 }
2524 else if (bom == 0xFFFE) {
2525 q += 2;
2526 bo = 1;
2527 }
Tim Petersced69f82003-09-16 20:30:58 +00002528#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002529 if (bom == 0xFEFF) {
2530 q += 2;
2531 bo = 1;
2532 }
2533 else if (bom == 0xFFFE) {
2534 q += 2;
2535 bo = -1;
2536 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002537#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002538 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540
Tim Peters772747b2001-08-09 22:21:55 +00002541 if (bo == -1) {
2542 /* force LE */
2543 ihi = 1;
2544 ilo = 0;
2545 }
2546 else if (bo == 1) {
2547 /* force BE */
2548 ihi = 0;
2549 ilo = 1;
2550 }
2551
2552 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002553 Py_UNICODE ch;
2554 /* remaining bytes at the end? (size should be even) */
2555 if (e-q<2) {
2556 if (consumed)
2557 break;
2558 errmsg = "truncated data";
2559 startinpos = ((const char *)q)-starts;
2560 endinpos = ((const char *)e)-starts;
2561 goto utf16Error;
2562 /* The remaining input chars are ignored if the callback
2563 chooses to skip the input */
2564 }
2565 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566
Benjamin Peterson857ce152009-01-31 16:29:18 +00002567 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002568
2569 if (ch < 0xD800 || ch > 0xDFFF) {
2570 *p++ = ch;
2571 continue;
2572 }
2573
2574 /* UTF-16 code pair: */
2575 if (q >= e) {
2576 errmsg = "unexpected end of data";
2577 startinpos = (((const char *)q)-2)-starts;
2578 endinpos = ((const char *)e)-starts;
2579 goto utf16Error;
2580 }
2581 if (0xD800 <= ch && ch <= 0xDBFF) {
2582 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2583 q += 2;
2584 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002585#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 *p++ = ch;
2587 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002591 continue;
2592 }
2593 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002594 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002595 startinpos = (((const char *)q)-4)-starts;
2596 endinpos = startinpos+2;
2597 goto utf16Error;
2598 }
2599
Benjamin Peterson857ce152009-01-31 16:29:18 +00002600 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002601 errmsg = "illegal encoding";
2602 startinpos = (((const char *)q)-2)-starts;
2603 endinpos = startinpos+2;
2604 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002605
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002606 utf16Error:
2607 outpos = p-PyUnicode_AS_UNICODE(unicode);
2608 if (unicode_decode_call_errorhandler(
2609 errors, &errorHandler,
2610 "utf16", errmsg,
2611 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2612 &unicode, &outpos, &p))
2613 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 }
2615
2616 if (byteorder)
2617 *byteorder = bo;
2618
Walter Dörwald69652032004-09-07 20:24:22 +00002619 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002620 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002621
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002623 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 goto onError;
2625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 Py_XDECREF(errorHandler);
2627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 return (PyObject *)unicode;
2629
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002630 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002632 Py_XDECREF(errorHandler);
2633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 return NULL;
2635}
2636
Tim Peters772747b2001-08-09 22:21:55 +00002637PyObject *
2638PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002639 Py_ssize_t size,
2640 const char *errors,
2641 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642{
2643 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002644 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002645 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002646#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002647 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002648#else
2649 const int pairs = 0;
2650#endif
Tim Peters772747b2001-08-09 22:21:55 +00002651 /* Offsets from p for storing byte pairs in the right order. */
2652#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2653 int ihi = 1, ilo = 0;
2654#else
2655 int ihi = 0, ilo = 1;
2656#endif
2657
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002658#define STORECHAR(CH) \
2659 do { \
2660 p[ihi] = ((CH) >> 8) & 0xff; \
2661 p[ilo] = (CH) & 0xff; \
2662 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002663 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002667 if (s[i] >= 0x10000)
2668 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002669#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002670 /* 2 * (size + pairs + (byteorder == 0)) */
2671 if (size > PY_SSIZE_T_MAX ||
2672 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002673 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002674 nsize = size + pairs + (byteorder == 0);
2675 bytesize = nsize * 2;
2676 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002677 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002678 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 if (v == NULL)
2680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002682 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002684 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002685 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002686 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002687
2688 if (byteorder == -1) {
2689 /* force LE */
2690 ihi = 1;
2691 ilo = 0;
2692 }
2693 else if (byteorder == 1) {
2694 /* force BE */
2695 ihi = 0;
2696 ilo = 1;
2697 }
2698
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002699 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002700 Py_UNICODE ch = *s++;
2701 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002702#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002703 if (ch >= 0x10000) {
2704 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2705 ch = 0xD800 | ((ch-0x10000) >> 10);
2706 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002707#endif
Tim Peters772747b2001-08-09 22:21:55 +00002708 STORECHAR(ch);
2709 if (ch2)
2710 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002713#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714}
2715
2716PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2717{
2718 if (!PyUnicode_Check(unicode)) {
2719 PyErr_BadArgument();
2720 return NULL;
2721 }
2722 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002723 PyUnicode_GET_SIZE(unicode),
2724 NULL,
2725 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726}
2727
2728/* --- Unicode Escape Codec ----------------------------------------------- */
2729
/* Cached pointer to the character-name C API table used for \N{name}
   escapes.  It starts out NULL; PyUnicode_DecodeUnicodeEscape() fills it in
   through PyCapsule_Import() the first time it meets a \N escape and reuses
   the stored pointer afterwards. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002730static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002731
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002733 Py_ssize_t size,
2734 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002737 Py_ssize_t startinpos;
2738 Py_ssize_t endinpos;
2739 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744 char* message;
2745 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 PyObject *errorHandler = NULL;
2747 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002748
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 /* Escaped strings will always be longer than the resulting
2750 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 length after conversion to the true value.
2752 (but if the error callback returns a long replacement string
2753 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 v = _PyUnicode_New(size);
2755 if (v == NULL)
2756 goto onError;
2757 if (size == 0)
2758 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 while (s < end) {
2764 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002765 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767
2768 /* Non-escape characters are interpreted as Unicode ordinals */
2769 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002770 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 continue;
2772 }
2773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 /* \ - Escapes */
2776 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002777 c = *s++;
2778 if (s > end)
2779 c = '\0'; /* Invalid after \ */
2780 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002782 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 case '\n': break;
2784 case '\\': *p++ = '\\'; break;
2785 case '\'': *p++ = '\''; break;
2786 case '\"': *p++ = '\"'; break;
2787 case 'b': *p++ = '\b'; break;
2788 case 'f': *p++ = '\014'; break; /* FF */
2789 case 't': *p++ = '\t'; break;
2790 case 'n': *p++ = '\n'; break;
2791 case 'r': *p++ = '\r'; break;
2792 case 'v': *p++ = '\013'; break; /* VT */
2793 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002795 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 case '0': case '1': case '2': case '3':
2797 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002798 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002799 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002801 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002804 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 break;
2806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002807 /* hex escapes */
2808 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002810 digits = 2;
2811 message = "truncated \\xXX escape";
2812 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002814 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816 digits = 4;
2817 message = "truncated \\uXXXX escape";
2818 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002820 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002821 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 digits = 8;
2823 message = "truncated \\UXXXXXXXX escape";
2824 hexescape:
2825 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 outpos = p-PyUnicode_AS_UNICODE(v);
2827 if (s+digits>end) {
2828 endinpos = size;
2829 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002830 errors, &errorHandler,
2831 "unicodeescape", "end of string in escape sequence",
2832 starts, size, &startinpos, &endinpos, &exc, &s,
2833 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 goto onError;
2835 goto nextByte;
2836 }
2837 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002838 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002839 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 endinpos = (s+i+1)-starts;
2841 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002842 errors, &errorHandler,
2843 "unicodeescape", message,
2844 starts, size, &startinpos, &endinpos, &exc, &s,
2845 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002846 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002848 }
2849 chr = (chr<<4) & ~0xF;
2850 if (c >= '0' && c <= '9')
2851 chr += c - '0';
2852 else if (c >= 'a' && c <= 'f')
2853 chr += 10 + c - 'a';
2854 else
2855 chr += 10 + c - 'A';
2856 }
2857 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002858 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 /* _decoding_error will have already written into the
2860 target buffer. */
2861 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002862 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002863 /* when we get here, chr is a 32-bit unicode character */
2864 if (chr <= 0xffff)
2865 /* UCS-2 character */
2866 *p++ = (Py_UNICODE) chr;
2867 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002868 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002869 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002870#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002871 *p++ = chr;
2872#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002873 chr -= 0x10000L;
2874 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002875 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002876#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002877 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 endinpos = s-starts;
2879 outpos = p-PyUnicode_AS_UNICODE(v);
2880 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002881 errors, &errorHandler,
2882 "unicodeescape", "illegal Unicode character",
2883 starts, size, &startinpos, &endinpos, &exc, &s,
2884 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002885 goto onError;
2886 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 break;
2888
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002889 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 case 'N':
2891 message = "malformed \\N character escape";
2892 if (ucnhash_CAPI == NULL) {
2893 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002894 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002895 if (ucnhash_CAPI == NULL)
2896 goto ucnhashError;
2897 }
2898 if (*s == '{') {
2899 const char *start = s+1;
2900 /* look for the closing brace */
2901 while (*s != '}' && s < end)
2902 s++;
2903 if (s > start && s < end && *s == '}') {
2904 /* found a name. look it up in the unicode database */
2905 message = "unknown Unicode character name";
2906 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002907 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002908 goto store;
2909 }
2910 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 endinpos = s-starts;
2912 outpos = p-PyUnicode_AS_UNICODE(v);
2913 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002914 errors, &errorHandler,
2915 "unicodeescape", message,
2916 starts, size, &startinpos, &endinpos, &exc, &s,
2917 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002919 break;
2920
2921 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002922 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 message = "\\ at end of string";
2924 s--;
2925 endinpos = s-starts;
2926 outpos = p-PyUnicode_AS_UNICODE(v);
2927 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002928 errors, &errorHandler,
2929 "unicodeescape", message,
2930 starts, size, &startinpos, &endinpos, &exc, &s,
2931 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002932 goto onError;
2933 }
2934 else {
2935 *p++ = '\\';
2936 *p++ = (unsigned char)s[-1];
2937 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002938 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002940 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002943 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002944 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002945 Py_XDECREF(errorHandler);
2946 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002948
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002949 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002950 PyErr_SetString(
2951 PyExc_UnicodeError,
2952 "\\N escapes not supported (can't load unicodedata module)"
2953 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002954 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 Py_XDECREF(errorHandler);
2956 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002957 return NULL;
2958
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002959 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 Py_XDECREF(errorHandler);
2962 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 return NULL;
2964}
2965
2966/* Return a Unicode-Escape string version of the Unicode object.
2967
2968 If quotes is true, the string is enclosed in u"" or u'' quotes as
2969 appropriate.
2970
2971*/
2972
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002973Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002974 Py_ssize_t size,
2975 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002976{
2977 /* like wcschr, but doesn't stop at NULL characters */
2978
2979 while (size-- > 0) {
2980 if (*s == ch)
2981 return s;
2982 s++;
2983 }
2984
2985 return NULL;
2986}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002987
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988static
2989PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002990 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 int quotes)
2992{
2993 PyObject *repr;
2994 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002996 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002997#ifdef Py_UNICODE_WIDE
2998 const Py_ssize_t expandsize = 10;
2999#else
3000 const Py_ssize_t expandsize = 6;
3001#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002
Neal Norwitz17753ec2006-08-21 22:21:19 +00003003 /* XXX(nnorwitz): rather than over-allocating, it would be
3004 better to choose a different scheme. Perhaps scan the
3005 first N-chars of the string and allocate based on that size.
3006 */
3007 /* Initial allocation is based on the longest-possible unichr
3008 escape.
3009
3010 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3011 unichr, so in this case it's the longest unichr escape. In
3012 narrow (UTF-16) builds this is five chars per source unichr
3013 since there are two unichrs in the surrogate pair, so in narrow
3014 (UTF-16) builds it's not the longest unichr escape.
3015
3016 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3017 so in the narrow (UTF-16) build case it's the longest unichr
3018 escape.
3019 */
3020
Neal Norwitze7d8be82008-07-31 17:17:14 +00003021 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003022 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003023
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003024 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003025 2
3026 + expandsize*size
3027 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 if (repr == NULL)
3029 return NULL;
3030
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003031 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
3033 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003035 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 !findchar(s, size, '"')) ? '"' : '\'';
3037 }
3038 while (size-- > 0) {
3039 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003040
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003041 /* Escape quotes and backslashes */
3042 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003043 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 *p++ = '\\';
3045 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003046 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003047 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003048
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003049#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003050 /* Map 21-bit characters to '\U00xxxxxx' */
3051 else if (ch >= 0x10000) {
3052 *p++ = '\\';
3053 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003054 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3058 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3059 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3060 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003061 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003062 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003063 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003064#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003065 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3066 else if (ch >= 0xD800 && ch < 0xDC00) {
3067 Py_UNICODE ch2;
3068 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003069
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003070 ch2 = *s++;
3071 size--;
3072 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3073 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3074 *p++ = '\\';
3075 *p++ = 'U';
3076 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3080 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3081 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3082 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3083 *p++ = hexdigit[ucs & 0x0000000F];
3084 continue;
3085 }
3086 /* Fall through: isolated surrogates are copied as-is */
3087 s--;
3088 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003089 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003090#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003091
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003093 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 *p++ = '\\';
3095 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003096 *p++ = hexdigit[(ch >> 12) & 0x000F];
3097 *p++ = hexdigit[(ch >> 8) & 0x000F];
3098 *p++ = hexdigit[(ch >> 4) & 0x000F];
3099 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003101
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003102 /* Map special whitespace to '\t', \n', '\r' */
3103 else if (ch == '\t') {
3104 *p++ = '\\';
3105 *p++ = 't';
3106 }
3107 else if (ch == '\n') {
3108 *p++ = '\\';
3109 *p++ = 'n';
3110 }
3111 else if (ch == '\r') {
3112 *p++ = '\\';
3113 *p++ = 'r';
3114 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003115
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003116 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003117 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003119 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003120 *p++ = hexdigit[(ch >> 4) & 0x000F];
3121 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003122 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 /* Copy everything else as-is */
3125 else
3126 *p++ = (char) ch;
3127 }
3128 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003129 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130
3131 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003132 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 return repr;
3134}
3135
3136PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003137 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138{
3139 return unicodeescape_string(s, size, 0);
3140}
3141
3142PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3143{
3144 if (!PyUnicode_Check(unicode)) {
3145 PyErr_BadArgument();
3146 return NULL;
3147 }
3148 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003149 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150}
3151
3152/* --- Raw Unicode Escape Codec ------------------------------------------- */
3153
3154PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003155 Py_ssize_t size,
3156 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003159 Py_ssize_t startinpos;
3160 Py_ssize_t endinpos;
3161 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 const char *end;
3165 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 PyObject *errorHandler = NULL;
3167 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003168
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 /* Escaped strings will always be longer than the resulting
3170 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003171 length after conversion to the true value. (But decoding error
3172 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 v = _PyUnicode_New(size);
3174 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003175 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003177 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003178 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 end = s + size;
3180 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003181 unsigned char c;
3182 Py_UCS4 x;
3183 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003184 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003186 /* Non-escape characters are interpreted as Unicode ordinals */
3187 if (*s != '\\') {
3188 *p++ = (unsigned char)*s++;
3189 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003190 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003191 startinpos = s-starts;
3192
3193 /* \u-escapes are only interpreted iff the number of leading
3194 backslashes if odd */
3195 bs = s;
3196 for (;s < end;) {
3197 if (*s != '\\')
3198 break;
3199 *p++ = (unsigned char)*s++;
3200 }
3201 if (((s - bs) & 1) == 0 ||
3202 s >= end ||
3203 (*s != 'u' && *s != 'U')) {
3204 continue;
3205 }
3206 p--;
3207 count = *s=='u' ? 4 : 8;
3208 s++;
3209
3210 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3211 outpos = p-PyUnicode_AS_UNICODE(v);
3212 for (x = 0, i = 0; i < count; ++i, ++s) {
3213 c = (unsigned char)*s;
3214 if (!isxdigit(c)) {
3215 endinpos = s-starts;
3216 if (unicode_decode_call_errorhandler(
3217 errors, &errorHandler,
3218 "rawunicodeescape", "truncated \\uXXXX",
3219 starts, size, &startinpos, &endinpos, &exc, &s,
3220 &v, &outpos, &p))
3221 goto onError;
3222 goto nextByte;
3223 }
3224 x = (x<<4) & ~0xF;
3225 if (c >= '0' && c <= '9')
3226 x += c - '0';
3227 else if (c >= 'a' && c <= 'f')
3228 x += 10 + c - 'a';
3229 else
3230 x += 10 + c - 'A';
3231 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003232 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 /* UCS-2 character */
3234 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 /* UCS-4 character. Either store directly, or as
3237 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003238#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003240#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003241 x -= 0x10000L;
3242 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3243 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003244#endif
3245 } else {
3246 endinpos = s-starts;
3247 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003248 if (unicode_decode_call_errorhandler(
3249 errors, &errorHandler,
3250 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003251 starts, size, &startinpos, &endinpos, &exc, &s,
3252 &v, &outpos, &p))
3253 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003254 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003255 nextByte:
3256 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003258 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003259 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003264 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 Py_XDECREF(errorHandler);
3267 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 return NULL;
3269}
3270
3271PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003272 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273{
3274 PyObject *repr;
3275 char *p;
3276 char *q;
3277
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003278 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003279#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003280 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003281#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003282 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003283#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003284
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003286 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003287
Neal Norwitze7d8be82008-07-31 17:17:14 +00003288 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 if (repr == NULL)
3290 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003291 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003292 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003294 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 while (size-- > 0) {
3296 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003297#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003298 /* Map 32-bit characters to '\Uxxxxxxxx' */
3299 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300 *p++ = '\\';
3301 *p++ = 'U';
3302 *p++ = hexdigit[(ch >> 28) & 0xf];
3303 *p++ = hexdigit[(ch >> 24) & 0xf];
3304 *p++ = hexdigit[(ch >> 20) & 0xf];
3305 *p++ = hexdigit[(ch >> 16) & 0xf];
3306 *p++ = hexdigit[(ch >> 12) & 0xf];
3307 *p++ = hexdigit[(ch >> 8) & 0xf];
3308 *p++ = hexdigit[(ch >> 4) & 0xf];
3309 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003310 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003311 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003312#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003313 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3314 if (ch >= 0xD800 && ch < 0xDC00) {
3315 Py_UNICODE ch2;
3316 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003317
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003318 ch2 = *s++;
3319 size--;
3320 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3321 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3322 *p++ = '\\';
3323 *p++ = 'U';
3324 *p++ = hexdigit[(ucs >> 28) & 0xf];
3325 *p++ = hexdigit[(ucs >> 24) & 0xf];
3326 *p++ = hexdigit[(ucs >> 20) & 0xf];
3327 *p++ = hexdigit[(ucs >> 16) & 0xf];
3328 *p++ = hexdigit[(ucs >> 12) & 0xf];
3329 *p++ = hexdigit[(ucs >> 8) & 0xf];
3330 *p++ = hexdigit[(ucs >> 4) & 0xf];
3331 *p++ = hexdigit[ucs & 0xf];
3332 continue;
3333 }
3334 /* Fall through: isolated surrogates are copied as-is */
3335 s--;
3336 size++;
3337 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003338#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003339 /* Map 16-bit characters to '\uxxxx' */
3340 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 *p++ = '\\';
3342 *p++ = 'u';
3343 *p++ = hexdigit[(ch >> 12) & 0xf];
3344 *p++ = hexdigit[(ch >> 8) & 0xf];
3345 *p++ = hexdigit[(ch >> 4) & 0xf];
3346 *p++ = hexdigit[ch & 15];
3347 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003348 /* Copy everything else as-is */
3349 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 *p++ = (char) ch;
3351 }
3352 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003353 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 return repr;
3355}
3356
3357PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3358{
3359 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360 PyErr_BadArgument();
3361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 }
3363 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365}
3366
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003367/* --- Unicode Internal Codec ------------------------------------------- */
3368
3369PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003370 Py_ssize_t size,
3371 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003372{
3373 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003374 Py_ssize_t startinpos;
3375 Py_ssize_t endinpos;
3376 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003377 PyUnicodeObject *v;
3378 Py_UNICODE *p;
3379 const char *end;
3380 const char *reason;
3381 PyObject *errorHandler = NULL;
3382 PyObject *exc = NULL;
3383
Neal Norwitzd43069c2006-01-08 01:12:10 +00003384#ifdef Py_UNICODE_WIDE
3385 Py_UNICODE unimax = PyUnicode_GetMax();
3386#endif
3387
Armin Rigo7ccbca92006-10-04 12:17:45 +00003388 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3390 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003392 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003393 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003394 p = PyUnicode_AS_UNICODE(v);
3395 end = s + size;
3396
3397 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003398 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003399 /* We have to sanity check the raw data, otherwise doom looms for
3400 some malformed UCS-4 data. */
3401 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003402#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003405 end-s < Py_UNICODE_SIZE
3406 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003407 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003408 startinpos = s - starts;
3409 if (end-s < Py_UNICODE_SIZE) {
3410 endinpos = end-starts;
3411 reason = "truncated input";
3412 }
3413 else {
3414 endinpos = s - starts + Py_UNICODE_SIZE;
3415 reason = "illegal code point (> 0x10FFFF)";
3416 }
3417 outpos = p - PyUnicode_AS_UNICODE(v);
3418 if (unicode_decode_call_errorhandler(
3419 errors, &errorHandler,
3420 "unicode_internal", reason,
3421 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003422 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003423 goto onError;
3424 }
3425 }
3426 else {
3427 p++;
3428 s += Py_UNICODE_SIZE;
3429 }
3430 }
3431
Martin v. Löwis412fb672006-04-13 06:34:32 +00003432 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003433 goto onError;
3434 Py_XDECREF(errorHandler);
3435 Py_XDECREF(exc);
3436 return (PyObject *)v;
3437
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003438 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003439 Py_XDECREF(v);
3440 Py_XDECREF(errorHandler);
3441 Py_XDECREF(exc);
3442 return NULL;
3443}
3444
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445/* --- Latin-1 Codec ------------------------------------------------------ */
3446
3447PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003448 Py_ssize_t size,
3449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450{
3451 PyUnicodeObject *v;
3452 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003453
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003455 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003456 Py_UNICODE r = *(unsigned char*)s;
3457 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003458 }
3459
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 v = _PyUnicode_New(size);
3461 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 p = PyUnicode_AS_UNICODE(v);
3466 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003467 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003469
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003470 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 Py_XDECREF(v);
3472 return NULL;
3473}
3474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475/* create or adjust a UnicodeEncodeError */
3476static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003477 const char *encoding,
3478 const Py_UNICODE *unicode, Py_ssize_t size,
3479 Py_ssize_t startpos, Py_ssize_t endpos,
3480 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003483 *exceptionObject = PyUnicodeEncodeError_Create(
3484 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 }
3486 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003487 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3488 goto onError;
3489 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3490 goto onError;
3491 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3492 goto onError;
3493 return;
3494 onError:
3495 Py_DECREF(*exceptionObject);
3496 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 }
3498}
3499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500/* raises a UnicodeEncodeError */
3501static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003502 const char *encoding,
3503 const Py_UNICODE *unicode, Py_ssize_t size,
3504 Py_ssize_t startpos, Py_ssize_t endpos,
3505 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506{
3507 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003510 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511}
3512
3513/* error handling callback helper:
3514 build arguments, call the callback and check the arguments,
3515 put the result into newpos and return the replacement string, which
3516 has to be freed by the caller */
3517static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003518 PyObject **errorHandler,
3519 const char *encoding, const char *reason,
3520 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3521 Py_ssize_t startpos, Py_ssize_t endpos,
3522 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525
3526 PyObject *restuple;
3527 PyObject *resunicode;
3528
3529 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003532 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 }
3534
3535 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539
3540 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003541 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003545 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003546 Py_DECREF(restuple);
3547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 }
3549 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 &resunicode, newpos)) {
3551 Py_DECREF(restuple);
3552 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 }
3554 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003555 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003556 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003557 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3558 Py_DECREF(restuple);
3559 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003560 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 Py_INCREF(resunicode);
3562 Py_DECREF(restuple);
3563 return resunicode;
3564}
3565
3566static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003567 Py_ssize_t size,
3568 const char *errors,
3569 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570{
3571 /* output object */
3572 PyObject *res;
3573 /* pointers to the beginning and end+1 of input */
3574 const Py_UNICODE *startp = p;
3575 const Py_UNICODE *endp = p + size;
3576 /* pointer to the beginning of the unencodable characters */
3577 /* const Py_UNICODE *badp = NULL; */
3578 /* pointer into the output */
3579 char *str;
3580 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003581 Py_ssize_t respos = 0;
3582 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003583 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3584 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 PyObject *errorHandler = NULL;
3586 PyObject *exc = NULL;
3587 /* the following variable is used for caching string comparisons
3588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3589 int known_errorHandler = -1;
3590
3591 /* allocate enough for a simple encoding without
3592 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003593 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 if (res == NULL)
3595 goto onError;
3596 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003597 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003598 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 ressize = size;
3600
3601 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003602 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003604 /* can we encode this? */
3605 if (c<limit) {
3606 /* no overflow check, because we know that the space is enough */
3607 *str++ = (char)c;
3608 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003609 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003610 else {
3611 Py_ssize_t unicodepos = p-startp;
3612 Py_ssize_t requiredsize;
3613 PyObject *repunicode;
3614 Py_ssize_t repsize;
3615 Py_ssize_t newpos;
3616 Py_ssize_t respos;
3617 Py_UNICODE *uni2;
3618 /* startpos for collecting unencodable chars */
3619 const Py_UNICODE *collstart = p;
3620 const Py_UNICODE *collend = p;
3621 /* find all unecodable characters */
3622 while ((collend < endp) && ((*collend)>=limit))
3623 ++collend;
3624 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3625 if (known_errorHandler==-1) {
3626 if ((errors==NULL) || (!strcmp(errors, "strict")))
3627 known_errorHandler = 1;
3628 else if (!strcmp(errors, "replace"))
3629 known_errorHandler = 2;
3630 else if (!strcmp(errors, "ignore"))
3631 known_errorHandler = 3;
3632 else if (!strcmp(errors, "xmlcharrefreplace"))
3633 known_errorHandler = 4;
3634 else
3635 known_errorHandler = 0;
3636 }
3637 switch (known_errorHandler) {
3638 case 1: /* strict */
3639 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3640 goto onError;
3641 case 2: /* replace */
3642 while (collstart++<collend)
3643 *str++ = '?'; /* fall through */
3644 case 3: /* ignore */
3645 p = collend;
3646 break;
3647 case 4: /* xmlcharrefreplace */
3648 respos = str-PyString_AS_STRING(res);
3649 /* determine replacement size (temporarily (mis)uses p) */
3650 for (p = collstart, repsize = 0; p < collend; ++p) {
3651 if (*p<10)
3652 repsize += 2+1+1;
3653 else if (*p<100)
3654 repsize += 2+2+1;
3655 else if (*p<1000)
3656 repsize += 2+3+1;
3657 else if (*p<10000)
3658 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003659#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 else
3661 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003662#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003663 else if (*p<100000)
3664 repsize += 2+5+1;
3665 else if (*p<1000000)
3666 repsize += 2+6+1;
3667 else
3668 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003669#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003670 }
3671 requiredsize = respos+repsize+(endp-collend);
3672 if (requiredsize > ressize) {
3673 if (requiredsize<2*ressize)
3674 requiredsize = 2*ressize;
3675 if (_PyString_Resize(&res, requiredsize))
3676 goto onError;
3677 str = PyString_AS_STRING(res) + respos;
3678 ressize = requiredsize;
3679 }
3680 /* generate replacement (temporarily (mis)uses p) */
3681 for (p = collstart; p < collend; ++p) {
3682 str += sprintf(str, "&#%d;", (int)*p);
3683 }
3684 p = collend;
3685 break;
3686 default:
3687 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3688 encoding, reason, startp, size, &exc,
3689 collstart-startp, collend-startp, &newpos);
3690 if (repunicode == NULL)
3691 goto onError;
3692 /* need more space? (at least enough for what we have+the
3693 replacement+the rest of the string, so we won't have to
3694 check space for encodable characters) */
3695 respos = str-PyString_AS_STRING(res);
3696 repsize = PyUnicode_GET_SIZE(repunicode);
3697 requiredsize = respos+repsize+(endp-collend);
3698 if (requiredsize > ressize) {
3699 if (requiredsize<2*ressize)
3700 requiredsize = 2*ressize;
3701 if (_PyString_Resize(&res, requiredsize)) {
3702 Py_DECREF(repunicode);
3703 goto onError;
3704 }
3705 str = PyString_AS_STRING(res) + respos;
3706 ressize = requiredsize;
3707 }
3708 /* check if there is anything unencodable in the replacement
3709 and copy it to the output */
3710 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3711 c = *uni2;
3712 if (c >= limit) {
3713 raise_encode_exception(&exc, encoding, startp, size,
3714 unicodepos, unicodepos+1, reason);
3715 Py_DECREF(repunicode);
3716 goto onError;
3717 }
3718 *str = (char)c;
3719 }
3720 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003721 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003722 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003723 }
3724 }
    /* Resize if we allocated too much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003726 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 if (respos<ressize)
        /* If this fails, res will be NULL */
3729 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 Py_XDECREF(errorHandler);
3731 Py_XDECREF(exc);
3732 return res;
3733
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003734 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 Py_XDECREF(res);
3736 Py_XDECREF(errorHandler);
3737 Py_XDECREF(exc);
3738 return NULL;
3739}
3740
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003742 Py_ssize_t size,
3743 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746}
3747
3748PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3749{
3750 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003751 PyErr_BadArgument();
3752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 }
3754 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003755 PyUnicode_GET_SIZE(unicode),
3756 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757}
3758
3759/* --- 7-bit ASCII Codec -------------------------------------------------- */
3760
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003762 Py_ssize_t size,
3763 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 PyUnicodeObject *v;
3767 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003768 Py_ssize_t startinpos;
3769 Py_ssize_t endinpos;
3770 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 const char *e;
3772 PyObject *errorHandler = NULL;
3773 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003776 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003777 Py_UNICODE r = *(unsigned char*)s;
3778 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003779 }
Tim Petersced69f82003-09-16 20:30:58 +00003780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 v = _PyUnicode_New(size);
3782 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003783 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003785 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 e = s + size;
3788 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 register unsigned char c = (unsigned char)*s;
3790 if (c < 128) {
3791 *p++ = c;
3792 ++s;
3793 }
3794 else {
3795 startinpos = s-starts;
3796 endinpos = startinpos + 1;
3797 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3798 if (unicode_decode_call_errorhandler(
3799 errors, &errorHandler,
3800 "ascii", "ordinal not in range(128)",
3801 starts, size, &startinpos, &endinpos, &exc, &s,
3802 &v, &outpos, &p))
3803 goto onError;
3804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003806 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003807 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3808 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003812
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003813 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 Py_XDECREF(errorHandler);
3816 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 return NULL;
3818}
3819
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003821 Py_ssize_t size,
3822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825}
3826
3827PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3828{
3829 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 PyErr_BadArgument();
3831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 }
3833 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 PyUnicode_GET_SIZE(unicode),
3835 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836}
3837
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003838#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003839
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003840/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003841
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003842#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003843#define NEED_RETRY
3844#endif
3845
3846/* XXX This code is limited to "true" double-byte encodings, as
3847 a) it assumes an incomplete character consists of a single byte, and
3848 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003849 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003850
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  A byte it
   accepts is confirmed by looking at the character CharPrev() reports
   just before it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);

    /* CharPrev() handed back the same position
       (presumably: there is no previous character -- TODO confirm). */
    if (before == here)
        return 1;

    /* The previous character does not start with a lead byte. */
    if (!IsDBCSLeadByte(*before))
        return 1;

    /* The previous character is a complete two-byte character. */
    return (here - before == 2);
}
3861
3862/*
3863 * Decode MBCS string into unicode object. If 'final' is set, converts
3864 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3865 */
3866static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003867 const char *s, /* MBCS string */
3868 int size, /* sizeof MBCS string */
3869 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003870{
3871 Py_UNICODE *p;
3872 Py_ssize_t n = 0;
3873 int usize = 0;
3874
3875 assert(size >= 0);
3876
3877 /* Skip trailing lead-byte unless 'final' is set */
3878 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003879 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003880
3881 /* First get the size of the result */
3882 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3884 if (usize == 0) {
3885 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3886 return -1;
3887 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003888 }
3889
3890 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003891 /* Create unicode object */
3892 *v = _PyUnicode_New(usize);
3893 if (*v == NULL)
3894 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895 }
3896 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003897 /* Extend unicode object */
3898 n = PyUnicode_GET_SIZE(*v);
3899 if (_PyUnicode_Resize(v, n + usize) < 0)
3900 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003901 }
3902
3903 /* Do the conversion */
3904 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003905 p = PyUnicode_AS_UNICODE(*v) + n;
3906 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3907 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3908 return -1;
3909 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003910 }
3911
3912 return size;
3913}
3914
3915PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003916 Py_ssize_t size,
3917 const char *errors,
3918 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003919{
3920 PyUnicodeObject *v = NULL;
3921 int done;
3922
3923 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003924 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003925
3926#ifdef NEED_RETRY
3927 retry:
3928 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930 else
3931#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003932 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933
3934 if (done < 0) {
3935 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937 }
3938
3939 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941
3942#ifdef NEED_RETRY
3943 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003944 s += done;
3945 size -= done;
3946 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003947 }
3948#endif
3949
3950 return (PyObject *)v;
3951}
3952
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003954 Py_ssize_t size,
3955 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003956{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003957 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3958}
3959
3960/*
3961 * Convert unicode into string object (MBCS).
3962 * Returns 0 if succeed, -1 otherwise.
3963 */
3964static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003965 const Py_UNICODE *p, /* unicode */
3966 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003967{
3968 int mbcssize = 0;
3969 Py_ssize_t n = 0;
3970
3971 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003972
3973 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003974 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003975 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3976 if (mbcssize == 0) {
3977 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3978 return -1;
3979 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003980 }
3981
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003983 /* Create string object */
3984 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3985 if (*repr == NULL)
3986 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003987 }
3988 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 /* Extend string object */
3990 n = PyString_Size(*repr);
3991 if (_PyString_Resize(repr, n + mbcssize) < 0)
3992 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003993 }
3994
3995 /* Do the conversion */
3996 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003997 char *s = PyString_AS_STRING(*repr) + n;
3998 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3999 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4000 return -1;
4001 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004002 }
4003
4004 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004005}
4006
4007PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004008 Py_ssize_t size,
4009 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004010{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004011 PyObject *repr = NULL;
4012 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004013
Martin v. Löwisd8251432006-06-14 05:21:04 +00004014#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004016 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018 else
4019#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004021
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004023 Py_XDECREF(repr);
4024 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004025 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026
4027#ifdef NEED_RETRY
4028 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004029 p += INT_MAX;
4030 size -= INT_MAX;
4031 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004032 }
4033#endif
4034
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004035 return repr;
4036}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004037
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004038PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4039{
4040 if (!PyUnicode_Check(unicode)) {
4041 PyErr_BadArgument();
4042 return NULL;
4043 }
4044 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004045 PyUnicode_GET_SIZE(unicode),
4046 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004047}
4048
Martin v. Löwisd8251432006-06-14 05:21:04 +00004049#undef NEED_RETRY
4050
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004051#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053/* --- Character Mapping Codec -------------------------------------------- */
4054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004056 Py_ssize_t size,
4057 PyObject *mapping,
4058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004061 Py_ssize_t startinpos;
4062 Py_ssize_t endinpos;
4063 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 PyUnicodeObject *v;
4066 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 PyObject *errorHandler = NULL;
4069 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004070 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004072
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 /* Default to Latin-1 */
4074 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
4077 v = _PyUnicode_New(size);
4078 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004081 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004084 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 mapstring = PyUnicode_AS_UNICODE(mapping);
4086 maplen = PyUnicode_GET_SIZE(mapping);
4087 while (s < e) {
4088 unsigned char ch = *s;
4089 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 if (ch < maplen)
4092 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004094 if (x == 0xfffe) {
4095 /* undefined mapping */
4096 outpos = p-PyUnicode_AS_UNICODE(v);
4097 startinpos = s-starts;
4098 endinpos = startinpos+1;
4099 if (unicode_decode_call_errorhandler(
4100 errors, &errorHandler,
4101 "charmap", "character maps to <undefined>",
4102 starts, size, &startinpos, &endinpos, &exc, &s,
4103 &v, &outpos, &p)) {
4104 goto onError;
4105 }
4106 continue;
4107 }
4108 *p++ = x;
4109 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004110 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004111 }
4112 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004113 while (s < e) {
4114 unsigned char ch = *s;
4115 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004116
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4118 w = PyInt_FromLong((long)ch);
4119 if (w == NULL)
4120 goto onError;
4121 x = PyObject_GetItem(mapping, w);
4122 Py_DECREF(w);
4123 if (x == NULL) {
4124 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4125 /* No mapping found means: mapping is undefined. */
4126 PyErr_Clear();
4127 x = Py_None;
4128 Py_INCREF(x);
4129 } else
4130 goto onError;
4131 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004132
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004133 /* Apply mapping */
4134 if (PyInt_Check(x)) {
4135 long value = PyInt_AS_LONG(x);
4136 if (value < 0 || value > 65535) {
4137 PyErr_SetString(PyExc_TypeError,
4138 "character mapping must be in range(65536)");
4139 Py_DECREF(x);
4140 goto onError;
4141 }
4142 *p++ = (Py_UNICODE)value;
4143 }
4144 else if (x == Py_None) {
4145 /* undefined mapping */
4146 outpos = p-PyUnicode_AS_UNICODE(v);
4147 startinpos = s-starts;
4148 endinpos = startinpos+1;
4149 if (unicode_decode_call_errorhandler(
4150 errors, &errorHandler,
4151 "charmap", "character maps to <undefined>",
4152 starts, size, &startinpos, &endinpos, &exc, &s,
4153 &v, &outpos, &p)) {
4154 Py_DECREF(x);
4155 goto onError;
4156 }
4157 Py_DECREF(x);
4158 continue;
4159 }
4160 else if (PyUnicode_Check(x)) {
4161 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004162
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004163 if (targetsize == 1)
4164 /* 1-1 mapping */
4165 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004166
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004167 else if (targetsize > 1) {
4168 /* 1-n mapping */
4169 if (targetsize > extrachars) {
4170 /* resize first */
4171 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4172 Py_ssize_t needed = (targetsize - extrachars) + \
4173 (targetsize << 2);
4174 extrachars += needed;
4175 /* XXX overflow detection missing */
4176 if (_PyUnicode_Resize(&v,
4177 PyUnicode_GET_SIZE(v) + needed) < 0) {
4178 Py_DECREF(x);
4179 goto onError;
4180 }
4181 p = PyUnicode_AS_UNICODE(v) + oldpos;
4182 }
4183 Py_UNICODE_COPY(p,
4184 PyUnicode_AS_UNICODE(x),
4185 targetsize);
4186 p += targetsize;
4187 extrachars -= targetsize;
4188 }
4189 /* 1-0 mapping: skip the character */
4190 }
4191 else {
4192 /* wrong return value */
4193 PyErr_SetString(PyExc_TypeError,
4194 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004195 Py_DECREF(x);
4196 goto onError;
4197 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004198 Py_DECREF(x);
4199 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 }
4202 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004203 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4204 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 Py_XDECREF(errorHandler);
4206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004208
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004209 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 Py_XDECREF(errorHandler);
4211 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 Py_XDECREF(v);
4213 return NULL;
4214}
4215
Martin v. Löwis3f767792006-06-04 19:36:28 +00004216/* Charmap encoding: the lookup table */
4217
4218struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004219 PyObject_HEAD
4220 unsigned char level1[32];
4221 int count2, count3;
4222 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004223};
4224
4225static PyObject*
4226encoding_map_size(PyObject *obj, PyObject* args)
4227{
4228 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004229 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004230 128*map->count3);
4231}
4232
4233static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004234 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004235 PyDoc_STR("Return the size (in bytes) of this object") },
4236 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004237};
4238
4239static void
4240encoding_map_dealloc(PyObject* o)
4241{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004242 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004243}
4244
4245static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004246 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004247 "EncodingMap", /*tp_name*/
4248 sizeof(struct encoding_map), /*tp_basicsize*/
4249 0, /*tp_itemsize*/
4250 /* methods */
4251 encoding_map_dealloc, /*tp_dealloc*/
4252 0, /*tp_print*/
4253 0, /*tp_getattr*/
4254 0, /*tp_setattr*/
4255 0, /*tp_compare*/
4256 0, /*tp_repr*/
4257 0, /*tp_as_number*/
4258 0, /*tp_as_sequence*/
4259 0, /*tp_as_mapping*/
4260 0, /*tp_hash*/
4261 0, /*tp_call*/
4262 0, /*tp_str*/
4263 0, /*tp_getattro*/
4264 0, /*tp_setattro*/
4265 0, /*tp_as_buffer*/
4266 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4267 0, /*tp_doc*/
4268 0, /*tp_traverse*/
4269 0, /*tp_clear*/
4270 0, /*tp_richcompare*/
4271 0, /*tp_weaklistoffset*/
4272 0, /*tp_iter*/
4273 0, /*tp_iternext*/
4274 encoding_map_methods, /*tp_methods*/
4275 0, /*tp_members*/
4276 0, /*tp_getset*/
4277 0, /*tp_base*/
4278 0, /*tp_dict*/
4279 0, /*tp_descr_get*/
4280 0, /*tp_descr_set*/
4281 0, /*tp_dictoffset*/
4282 0, /*tp_init*/
4283 0, /*tp_alloc*/
4284 0, /*tp_new*/
4285 0, /*tp_free*/
4286 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004287};
4288
4289PyObject*
4290PyUnicode_BuildEncodingMap(PyObject* string)
4291{
4292 Py_UNICODE *decode;
4293 PyObject *result;
4294 struct encoding_map *mresult;
4295 int i;
4296 int need_dict = 0;
4297 unsigned char level1[32];
4298 unsigned char level2[512];
4299 unsigned char *mlevel1, *mlevel2, *mlevel3;
4300 int count2 = 0, count3 = 0;
4301
4302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4303 PyErr_BadArgument();
4304 return NULL;
4305 }
4306 decode = PyUnicode_AS_UNICODE(string);
4307 memset(level1, 0xFF, sizeof level1);
4308 memset(level2, 0xFF, sizeof level2);
4309
4310 /* If there isn't a one-to-one mapping of NULL to \0,
4311 or if there are non-BMP characters, we need to use
4312 a mapping dictionary. */
4313 if (decode[0] != 0)
4314 need_dict = 1;
4315 for (i = 1; i < 256; i++) {
4316 int l1, l2;
4317 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004318#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004320#endif
4321 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004322 need_dict = 1;
4323 break;
4324 }
4325 if (decode[i] == 0xFFFE)
4326 /* unmapped character */
4327 continue;
4328 l1 = decode[i] >> 11;
4329 l2 = decode[i] >> 7;
4330 if (level1[l1] == 0xFF)
4331 level1[l1] = count2++;
4332 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004333 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004334 }
4335
4336 if (count2 >= 0xFF || count3 >= 0xFF)
4337 need_dict = 1;
4338
4339 if (need_dict) {
4340 PyObject *result = PyDict_New();
4341 PyObject *key, *value;
4342 if (!result)
4343 return NULL;
4344 for (i = 0; i < 256; i++) {
4345 key = value = NULL;
4346 key = PyInt_FromLong(decode[i]);
4347 value = PyInt_FromLong(i);
4348 if (!key || !value)
4349 goto failed1;
4350 if (PyDict_SetItem(result, key, value) == -1)
4351 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004352 Py_DECREF(key);
4353 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004354 }
4355 return result;
4356 failed1:
4357 Py_XDECREF(key);
4358 Py_XDECREF(value);
4359 Py_DECREF(result);
4360 return NULL;
4361 }
4362
4363 /* Create a three-level trie */
4364 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4365 16*count2 + 128*count3 - 1);
4366 if (!result)
4367 return PyErr_NoMemory();
4368 PyObject_Init(result, &EncodingMapType);
4369 mresult = (struct encoding_map*)result;
4370 mresult->count2 = count2;
4371 mresult->count3 = count3;
4372 mlevel1 = mresult->level1;
4373 mlevel2 = mresult->level23;
4374 mlevel3 = mresult->level23 + 16*count2;
4375 memcpy(mlevel1, level1, 32);
4376 memset(mlevel2, 0xFF, 16*count2);
4377 memset(mlevel3, 0, 128*count3);
4378 count3 = 0;
4379 for (i = 1; i < 256; i++) {
4380 int o1, o2, o3, i2, i3;
4381 if (decode[i] == 0xFFFE)
4382 /* unmapped character */
4383 continue;
4384 o1 = decode[i]>>11;
4385 o2 = (decode[i]>>7) & 0xF;
4386 i2 = 16*mlevel1[o1] + o2;
4387 if (mlevel2[i2] == 0xFF)
4388 mlevel2[i2] = count3++;
4389 o3 = decode[i] & 0x7F;
4390 i3 = 128*mlevel2[i2] + o3;
4391 mlevel3[i3] = i;
4392 }
4393 return result;
4394}
4395
4396static int
4397encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4398{
4399 struct encoding_map *map = (struct encoding_map*)mapping;
4400 int l1 = c>>11;
4401 int l2 = (c>>7) & 0xF;
4402 int l3 = c & 0x7F;
4403 int i;
4404
4405#ifdef Py_UNICODE_WIDE
4406 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004407 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 }
4409#endif
4410 if (c == 0)
4411 return 0;
4412 /* level 1*/
4413 i = map->level1[l1];
4414 if (i == 0xFF) {
4415 return -1;
4416 }
4417 /* level 2*/
4418 i = map->level23[16*i+l2];
4419 if (i == 0xFF) {
4420 return -1;
4421 }
4422 /* level 3 */
4423 i = map->level23[16*map->count2 + 128*i + l3];
4424 if (i == 0) {
4425 return -1;
4426 }
4427 return i;
4428}
4429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430/* Lookup the character ch in the mapping. If the character
4431 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004432 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject *w = PyInt_FromLong((long)c);
4436 PyObject *x;
4437
4438 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004439 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 x = PyObject_GetItem(mapping, w);
4441 Py_DECREF(w);
4442 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004443 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4444 /* No mapping found means: mapping is undefined. */
4445 PyErr_Clear();
4446 x = Py_None;
4447 Py_INCREF(x);
4448 return x;
4449 } else
4450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004452 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004453 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004455 long value = PyInt_AS_LONG(x);
4456 if (value < 0 || value > 255) {
4457 PyErr_SetString(PyExc_TypeError,
4458 "character mapping must be in range(256)");
4459 Py_DECREF(x);
4460 return NULL;
4461 }
4462 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004464 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004465 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004467 /* wrong return value */
4468 PyErr_SetString(PyExc_TypeError,
4469 "character mapping must return integer, None or str");
4470 Py_DECREF(x);
4471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 }
4473}
4474
Martin v. Löwis3f767792006-06-04 19:36:28 +00004475static int
4476charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4477{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4479 /* exponentially overallocate to minimize reallocations */
4480 if (requiredsize < 2*outsize)
4481 requiredsize = 2*outsize;
4482 if (_PyString_Resize(outobj, requiredsize)) {
4483 return 0;
4484 }
4485 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004486}
4487
/* Outcome of writing one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
/* Look up the character, put the result in the output string and adjust
   various state variables. Reallocate the output string if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written), or enc_EXCEPTION if an error (e.g. a failed
   reallocation) occurred. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004498charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004499 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004501 PyObject *rep;
4502 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504
Christian Heimese93237d2007-12-19 02:37:44 +00004505 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004506 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004507 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004508 if (res == -1)
4509 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004510 if (outsize<requiredsize)
4511 if (!charmapencode_resize(outobj, outpos, requiredsize))
4512 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004513 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004514 outstart[(*outpos)++] = (char)res;
4515 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004516 }
4517
4518 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004520 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004521 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 Py_DECREF(rep);
4523 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004524 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004525 if (PyInt_Check(rep)) {
4526 Py_ssize_t requiredsize = *outpos+1;
4527 if (outsize<requiredsize)
4528 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4529 Py_DECREF(rep);
4530 return enc_EXCEPTION;
4531 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004532 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004534 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004535 else {
4536 const char *repchars = PyString_AS_STRING(rep);
4537 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4538 Py_ssize_t requiredsize = *outpos+repsize;
4539 if (outsize<requiredsize)
4540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4541 Py_DECREF(rep);
4542 return enc_EXCEPTION;
4543 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004544 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004545 memcpy(outstart + *outpos, repchars, repsize);
4546 *outpos += repsize;
4547 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 }
Georg Brandl9f167602006-06-04 21:46:16 +00004549 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004550 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551}
4552
4553/* handle an error in PyUnicode_EncodeCharmap
4554 Return 0 on success, -1 on error */
4555static
4556int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004559 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004560 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561{
4562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t repsize;
4564 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 Py_UNICODE *uni2;
4566 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t collstartpos = *inpos;
4568 Py_ssize_t collendpos = *inpos+1;
4569 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 char *encoding = "charmap";
4571 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004572 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 /* find all unencodable characters */
4575 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004576 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004577 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004578 int res = encoding_map_lookup(p[collendpos], mapping);
4579 if (res != -1)
4580 break;
4581 ++collendpos;
4582 continue;
4583 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004584
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004585 rep = charmapencode_lookup(p[collendpos], mapping);
4586 if (rep==NULL)
4587 return -1;
4588 else if (rep!=Py_None) {
4589 Py_DECREF(rep);
4590 break;
4591 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004592 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004593 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 }
4595 /* cache callback name lookup
4596 * (if not done yet, i.e. it's the first error) */
4597 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004598 if ((errors==NULL) || (!strcmp(errors, "strict")))
4599 *known_errorHandler = 1;
4600 else if (!strcmp(errors, "replace"))
4601 *known_errorHandler = 2;
4602 else if (!strcmp(errors, "ignore"))
4603 *known_errorHandler = 3;
4604 else if (!strcmp(errors, "xmlcharrefreplace"))
4605 *known_errorHandler = 4;
4606 else
4607 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
4609 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004610 case 1: /* strict */
4611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4612 return -1;
4613 case 2: /* replace */
4614 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004615 x = charmapencode_output('?', mapping, res, respos);
4616 if (x==enc_EXCEPTION) {
4617 return -1;
4618 }
4619 else if (x==enc_FAILED) {
4620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4621 return -1;
4622 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004623 }
4624 /* fall through */
4625 case 3: /* ignore */
4626 *inpos = collendpos;
4627 break;
4628 case 4: /* xmlcharrefreplace */
4629 /* generate replacement (temporarily (mis)uses p) */
4630 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004631 char buffer[2+29+1+1];
4632 char *cp;
4633 sprintf(buffer, "&#%d;", (int)p[collpos]);
4634 for (cp = buffer; *cp; ++cp) {
4635 x = charmapencode_output(*cp, mapping, res, respos);
4636 if (x==enc_EXCEPTION)
4637 return -1;
4638 else if (x==enc_FAILED) {
4639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4640 return -1;
4641 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004642 }
4643 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004644 *inpos = collendpos;
4645 break;
4646 default:
4647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004648 encoding, reason, p, size, exceptionObject,
4649 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004650 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004651 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004652 /* generate replacement */
4653 repsize = PyUnicode_GET_SIZE(repunicode);
4654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004655 x = charmapencode_output(*uni2, mapping, res, respos);
4656 if (x==enc_EXCEPTION) {
4657 return -1;
4658 }
4659 else if (x==enc_FAILED) {
4660 Py_DECREF(repunicode);
4661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4662 return -1;
4663 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004664 }
4665 *inpos = newpos;
4666 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 }
4668 return 0;
4669}
4670
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004672 Py_ssize_t size,
4673 PyObject *mapping,
4674 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 /* output object */
4677 PyObject *res = NULL;
4678 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 PyObject *errorHandler = NULL;
4683 PyObject *exc = NULL;
4684 /* the following variable is used for caching string comparisons
4685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4686 * 3=ignore, 4=xmlcharrefreplace */
4687 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
4689 /* Default to Latin-1 */
4690 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004691 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 /* allocate enough for a simple encoding without
4694 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004695 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 if (res == NULL)
4697 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004698 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004702 /* try to encode it */
4703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4704 if (x==enc_EXCEPTION) /* error */
4705 goto onError;
4706 if (x==enc_FAILED) { /* unencodable character */
4707 if (charmap_encoding_error(p, size, &inpos, mapping,
4708 &exc,
4709 &known_errorHandler, &errorHandler, errors,
4710 &res, &respos)) {
4711 goto onError;
4712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004713 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004714 else
4715 /* done with this character => adjust input position */
4716 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004720 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004721 if (_PyString_Resize(&res, respos))
4722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 }
4724 Py_XDECREF(exc);
4725 Py_XDECREF(errorHandler);
4726 return res;
4727
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 Py_XDECREF(res);
4730 Py_XDECREF(exc);
4731 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 return NULL;
4733}
4734
4735PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004736 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737{
4738 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004739 PyErr_BadArgument();
4740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 }
4742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 PyUnicode_GET_SIZE(unicode),
4744 mapping,
4745 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746}
4747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748/* create or adjust a UnicodeTranslateError */
4749static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004750 const Py_UNICODE *unicode, Py_ssize_t size,
4751 Py_ssize_t startpos, Py_ssize_t endpos,
4752 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004755 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004756 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 }
4758 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4760 goto onError;
4761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4762 goto onError;
4763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4764 goto onError;
4765 return;
4766 onError:
4767 Py_DECREF(*exceptionObject);
4768 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 }
4770}
4771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772/* raises a UnicodeTranslateError */
4773static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004774 const Py_UNICODE *unicode, Py_ssize_t size,
4775 Py_ssize_t startpos, Py_ssize_t endpos,
4776 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777{
4778 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004779 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004781 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782}
4783
4784/* error handling callback helper:
4785 build arguments, call the callback and check the arguments,
4786 put the result into newpos and return the replacement string, which
4787 has to be freed by the caller */
4788static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004789 PyObject **errorHandler,
4790 const char *reason,
4791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4792 Py_ssize_t startpos, Py_ssize_t endpos,
4793 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796
Martin v. Löwis412fb672006-04-13 06:34:32 +00004797 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 PyObject *restuple;
4799 PyObject *resunicode;
4800
4801 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004802 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004804 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 }
4806
4807 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811
4812 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004817 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004818 Py_DECREF(restuple);
4819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 }
4821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 &resunicode, &i_newpos)) {
4823 Py_DECREF(restuple);
4824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004826 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004828 else
4829 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004830 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4832 Py_DECREF(restuple);
4833 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 Py_INCREF(resunicode);
4836 Py_DECREF(restuple);
4837 return resunicode;
4838}
4839
4840/* Lookup the character ch in the mapping and put the result in result,
4841 which must be decrefed by the caller.
4842 Return 0 on success, -1 on error */
4843static
4844int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4845{
4846 PyObject *w = PyInt_FromLong((long)c);
4847 PyObject *x;
4848
4849 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 x = PyObject_GetItem(mapping, w);
4852 Py_DECREF(w);
4853 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4855 /* No mapping found means: use 1:1 mapping. */
4856 PyErr_Clear();
4857 *result = NULL;
4858 return 0;
4859 } else
4860 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 }
4862 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004863 *result = x;
4864 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 }
4866 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004867 long value = PyInt_AS_LONG(x);
4868 long max = PyUnicode_GetMax();
4869 if (value < 0 || value > max) {
4870 PyErr_Format(PyExc_TypeError,
4871 "character mapping must be in range(0x%lx)", max+1);
4872 Py_DECREF(x);
4873 return -1;
4874 }
4875 *result = x;
4876 return 0;
4877 }
4878 else if (PyUnicode_Check(x)) {
4879 *result = x;
4880 return 0;
4881 }
4882 else {
4883 /* wrong return value */
4884 PyErr_SetString(PyExc_TypeError,
4885 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004886 Py_DECREF(x);
4887 return -1;
4888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889}
4890/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 if not reallocate and adjust various state variables.
4892 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893static
Walter Dörwald4894c302003-10-24 14:25:28 +00004894int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004898 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 /* remember old output position */
4900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4901 /* exponentially overallocate to minimize reallocations */
4902 if (requiredsize < 2 * oldsize)
4903 requiredsize = 2 * oldsize;
4904 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4905 return -1;
4906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 }
4908 return 0;
4909}
4910/* lookup the character, put the result in the output string and adjust
4911 various state variables. Return a new reference to the object that
4912 was put in the output buffer in *result, or Py_None, if the mapping was
4913 undefined (in which case no character was written).
4914 The called must decref result.
4915 Return 0 on success, -1 on error. */
4916static
Walter Dörwald4894c302003-10-24 14:25:28 +00004917int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4919 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920{
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004924 /* not found => default to 1:1 mapping */
4925 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 }
4927 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004930 /* no overflow check, because we know that the space is enough */
4931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 }
4933 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4935 if (repsize==1) {
4936 /* no overflow check, because we know that the space is enough */
4937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4938 }
4939 else if (repsize!=0) {
4940 /* more than one character */
4941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4942 (insize - (curinp-startinp)) +
4943 repsize - 1;
4944 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4945 return -1;
4946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4947 *outp += repsize;
4948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 }
4950 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004951 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 return 0;
4953}
4954
4955PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004956 Py_ssize_t size,
4957 PyObject *mapping,
4958 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 /* output object */
4961 PyObject *res = NULL;
4962 /* pointers to the beginning and end+1 of input */
4963 const Py_UNICODE *startp = p;
4964 const Py_UNICODE *endp = p + size;
4965 /* pointer into the output */
4966 Py_UNICODE *str;
4967 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 char *reason = "character maps to <undefined>";
4970 PyObject *errorHandler = NULL;
4971 PyObject *exc = NULL;
4972 /* the following variable is used for caching string comparisons
4973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4974 * 3=ignore, 4=xmlcharrefreplace */
4975 int known_errorHandler = -1;
4976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004978 PyErr_BadArgument();
4979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981
4982 /* allocate enough for a simple 1:1 translation without
4983 replacements, if we need more, we'll resize */
4984 res = PyUnicode_FromUnicode(NULL, size);
4985 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 /* try to encode it */
4993 PyObject *x = NULL;
4994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4995 Py_XDECREF(x);
4996 goto onError;
4997 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004998 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004999 if (x!=Py_None) /* it worked => adjust input pointer */
5000 ++p;
5001 else { /* untranslatable character */
5002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5003 Py_ssize_t repsize;
5004 Py_ssize_t newpos;
5005 Py_UNICODE *uni2;
5006 /* startpos for collecting untranslatable chars */
5007 const Py_UNICODE *collstart = p;
5008 const Py_UNICODE *collend = p+1;
5009 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 /* find all untranslatable characters */
5012 while (collend < endp) {
5013 if (charmaptranslate_lookup(*collend, mapping, &x))
5014 goto onError;
5015 Py_XDECREF(x);
5016 if (x!=Py_None)
5017 break;
5018 ++collend;
5019 }
5020 /* cache callback name lookup
5021 * (if not done yet, i.e. it's the first error) */
5022 if (known_errorHandler==-1) {
5023 if ((errors==NULL) || (!strcmp(errors, "strict")))
5024 known_errorHandler = 1;
5025 else if (!strcmp(errors, "replace"))
5026 known_errorHandler = 2;
5027 else if (!strcmp(errors, "ignore"))
5028 known_errorHandler = 3;
5029 else if (!strcmp(errors, "xmlcharrefreplace"))
5030 known_errorHandler = 4;
5031 else
5032 known_errorHandler = 0;
5033 }
5034 switch (known_errorHandler) {
5035 case 1: /* strict */
5036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005037 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005038 case 2: /* replace */
5039 /* No need to check for space, this is a 1:1 replacement */
5040 for (coll = collstart; coll<collend; ++coll)
5041 *str++ = '?';
5042 /* fall through */
5043 case 3: /* ignore */
5044 p = collend;
5045 break;
5046 case 4: /* xmlcharrefreplace */
5047 /* generate replacement (temporarily (mis)uses p) */
5048 for (p = collstart; p < collend; ++p) {
5049 char buffer[2+29+1+1];
5050 char *cp;
5051 sprintf(buffer, "&#%d;", (int)*p);
5052 if (charmaptranslate_makespace(&res, &str,
5053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5054 goto onError;
5055 for (cp = buffer; *cp; ++cp)
5056 *str++ = *cp;
5057 }
5058 p = collend;
5059 break;
5060 default:
5061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5062 reason, startp, size, &exc,
5063 collstart-startp, collend-startp, &newpos);
5064 if (repunicode == NULL)
5065 goto onError;
5066 /* generate replacement */
5067 repsize = PyUnicode_GET_SIZE(repunicode);
5068 if (charmaptranslate_makespace(&res, &str,
5069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5070 Py_DECREF(repunicode);
5071 goto onError;
5072 }
5073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5074 *str++ = *uni2;
5075 p = startp + newpos;
5076 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005077 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005078 }
5079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 /* Resize if we allocated to much */
5081 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005082 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005083 if (PyUnicode_Resize(&res, respos) < 0)
5084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005085 }
5086 Py_XDECREF(exc);
5087 Py_XDECREF(errorHandler);
5088 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005090 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 Py_XDECREF(res);
5092 Py_XDECREF(exc);
5093 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 return NULL;
5095}
5096
5097PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005098 PyObject *mapping,
5099 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100{
5101 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 str = PyUnicode_FromObject(str);
5104 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005105 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005107 PyUnicode_GET_SIZE(str),
5108 mapping,
5109 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 Py_DECREF(str);
5111 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005112
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005113 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 Py_XDECREF(str);
5115 return NULL;
5116}
Tim Petersced69f82003-09-16 20:30:58 +00005117
Guido van Rossum9e896b32000-04-05 20:11:21 +00005118/* --- Decimal Encoder ---------------------------------------------------- */
5119
5120int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005121 Py_ssize_t length,
5122 char *output,
5123 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005124{
5125 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 PyObject *errorHandler = NULL;
5127 PyObject *exc = NULL;
5128 const char *encoding = "decimal";
5129 const char *reason = "invalid decimal Unicode string";
5130 /* the following variable is used for caching string comparisons
5131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5132 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005133
5134 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005135 PyErr_BadArgument();
5136 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005137 }
5138
5139 p = s;
5140 end = s + length;
5141 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005142 register Py_UNICODE ch = *p;
5143 int decimal;
5144 PyObject *repunicode;
5145 Py_ssize_t repsize;
5146 Py_ssize_t newpos;
5147 Py_UNICODE *uni2;
5148 Py_UNICODE *collstart;
5149 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005150
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005151 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005152 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 ++p;
5154 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005155 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005156 decimal = Py_UNICODE_TODECIMAL(ch);
5157 if (decimal >= 0) {
5158 *output++ = '0' + decimal;
5159 ++p;
5160 continue;
5161 }
5162 if (0 < ch && ch < 256) {
5163 *output++ = (char)ch;
5164 ++p;
5165 continue;
5166 }
5167 /* All other characters are considered unencodable */
5168 collstart = p;
5169 collend = p+1;
5170 while (collend < end) {
5171 if ((0 < *collend && *collend < 256) ||
5172 !Py_UNICODE_ISSPACE(*collend) ||
5173 Py_UNICODE_TODECIMAL(*collend))
5174 break;
5175 }
5176 /* cache callback name lookup
5177 * (if not done yet, i.e. it's the first error) */
5178 if (known_errorHandler==-1) {
5179 if ((errors==NULL) || (!strcmp(errors, "strict")))
5180 known_errorHandler = 1;
5181 else if (!strcmp(errors, "replace"))
5182 known_errorHandler = 2;
5183 else if (!strcmp(errors, "ignore"))
5184 known_errorHandler = 3;
5185 else if (!strcmp(errors, "xmlcharrefreplace"))
5186 known_errorHandler = 4;
5187 else
5188 known_errorHandler = 0;
5189 }
5190 switch (known_errorHandler) {
5191 case 1: /* strict */
5192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5193 goto onError;
5194 case 2: /* replace */
5195 for (p = collstart; p < collend; ++p)
5196 *output++ = '?';
5197 /* fall through */
5198 case 3: /* ignore */
5199 p = collend;
5200 break;
5201 case 4: /* xmlcharrefreplace */
5202 /* generate replacement (temporarily (mis)uses p) */
5203 for (p = collstart; p < collend; ++p)
5204 output += sprintf(output, "&#%d;", (int)*p);
5205 p = collend;
5206 break;
5207 default:
5208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5209 encoding, reason, s, length, &exc,
5210 collstart-s, collend-s, &newpos);
5211 if (repunicode == NULL)
5212 goto onError;
5213 /* generate replacement */
5214 repsize = PyUnicode_GET_SIZE(repunicode);
5215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5216 Py_UNICODE ch = *uni2;
5217 if (Py_UNICODE_ISSPACE(ch))
5218 *output++ = ' ';
5219 else {
5220 decimal = Py_UNICODE_TODECIMAL(ch);
5221 if (decimal >= 0)
5222 *output++ = '0' + decimal;
5223 else if (0 < ch && ch < 256)
5224 *output++ = (char)ch;
5225 else {
5226 Py_DECREF(repunicode);
5227 raise_encode_exception(&exc, encoding,
5228 s, length, collstart-s, collend-s, reason);
5229 goto onError;
5230 }
5231 }
5232 }
5233 p = s + newpos;
5234 Py_DECREF(repunicode);
5235 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005236 }
5237 /* 0-terminate the output string */
5238 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 Py_XDECREF(exc);
5240 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005241 return 0;
5242
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005243 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 Py_XDECREF(exc);
5245 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005246 return -1;
5247}
5248
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249/* --- Helpers ------------------------------------------------------------ */
5250
Eric Smitha9f7d622008-02-17 19:46:49 +00005251#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005252#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005253
5254#include "stringlib/count.h"
5255#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005256#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005257#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005258
/* Helper macro to fix up start/end slice values against a length `len`:
     - an `end` greater than `len` is clipped to `len`;
     - a negative `end` or `start` has `len` added to it, and is set to 0
       if it is still negative afterwards.
   `start` and `end` must be modifiable lvalues; they are updated in place.
   Each argument is evaluated more than once, so none may have side effects.
   The expansion is wrapped in do { ... } while (0) so that the macro behaves
   as one statement (e.g. as the unbraced body of an if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005273
Martin v. Löwis18e16552006-02-15 17:27:45 +00005274Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005275 PyObject *substr,
5276 Py_ssize_t start,
5277 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005279 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005280 PyUnicodeObject* str_obj;
5281 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005282
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005283 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5284 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005285 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005286 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5287 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005288 Py_DECREF(str_obj);
5289 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 }
Tim Petersced69f82003-09-16 20:30:58 +00005291
Antoine Pitrou64672132010-01-13 07:55:48 +00005292 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005293 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005294 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5295 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005296 );
5297
5298 Py_DECREF(sub_obj);
5299 Py_DECREF(str_obj);
5300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 return result;
5302}
5303
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005305 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005306 Py_ssize_t start,
5307 Py_ssize_t end,
5308 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005311
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005312 str = PyUnicode_FromObject(str);
5313 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005314 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005315 sub = PyUnicode_FromObject(sub);
5316 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005317 Py_DECREF(str);
5318 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 }
Tim Petersced69f82003-09-16 20:30:58 +00005320
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005321 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005322 result = stringlib_find_slice(
5323 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5324 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5325 start, end
5326 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005327 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005328 result = stringlib_rfind_slice(
5329 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5330 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5331 start, end
5332 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005333
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005334 Py_DECREF(str);
5335 Py_DECREF(sub);
5336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 return result;
5338}
5339
Tim Petersced69f82003-09-16 20:30:58 +00005340static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005342 PyUnicodeObject *substring,
5343 Py_ssize_t start,
5344 Py_ssize_t end,
5345 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 if (substring->length == 0)
5348 return 1;
5349
Antoine Pitrou64672132010-01-13 07:55:48 +00005350 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 end -= substring->length;
5352 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005353 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354
5355 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005356 if (Py_UNICODE_MATCH(self, end, substring))
5357 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 } else {
5359 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005360 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 }
5362
5363 return 0;
5364}
5365
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005367 PyObject *substr,
5368 Py_ssize_t start,
5369 Py_ssize_t end,
5370 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 str = PyUnicode_FromObject(str);
5375 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 substr = PyUnicode_FromObject(substr);
5378 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005379 Py_DECREF(str);
5380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 }
Tim Petersced69f82003-09-16 20:30:58 +00005382
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 (PyUnicodeObject *)substr,
5385 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 Py_DECREF(str);
5387 Py_DECREF(substr);
5388 return result;
5389}
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391/* Apply fixfct filter to the Unicode object self and return a
5392 reference to the modified object */
5393
Tim Petersced69f82003-09-16 20:30:58 +00005394static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005396 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397{
5398
5399 PyUnicodeObject *u;
5400
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005401 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005403 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005404
5405 Py_UNICODE_COPY(u->str, self->str, self->length);
5406
Tim Peters7a29bd52001-09-12 03:03:31 +00005407 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005408 /* fixfct should return TRUE if it modified the buffer. If
5409 FALSE, return a reference to the original buffer instead
5410 (to save space, not time) */
5411 Py_INCREF(self);
5412 Py_DECREF(u);
5413 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 }
5415 return (PyObject*) u;
5416}
5417
Tim Petersced69f82003-09-16 20:30:58 +00005418static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419int fixupper(PyUnicodeObject *self)
5420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005421 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 Py_UNICODE *s = self->str;
5423 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005426 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005427
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005428 ch = Py_UNICODE_TOUPPER(*s);
5429 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005431 *s = ch;
5432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 s++;
5434 }
5435
5436 return status;
5437}
5438
Tim Petersced69f82003-09-16 20:30:58 +00005439static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440int fixlower(PyUnicodeObject *self)
5441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005442 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 Py_UNICODE *s = self->str;
5444 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005447 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005448
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 ch = Py_UNICODE_TOLOWER(*s);
5450 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005452 *s = ch;
5453 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 s++;
5455 }
5456
5457 return status;
5458}
5459
Tim Petersced69f82003-09-16 20:30:58 +00005460static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461int fixswapcase(PyUnicodeObject *self)
5462{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005463 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 Py_UNICODE *s = self->str;
5465 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 while (len-- > 0) {
5468 if (Py_UNICODE_ISUPPER(*s)) {
5469 *s = Py_UNICODE_TOLOWER(*s);
5470 status = 1;
5471 } else if (Py_UNICODE_ISLOWER(*s)) {
5472 *s = Py_UNICODE_TOUPPER(*s);
5473 status = 1;
5474 }
5475 s++;
5476 }
5477
5478 return status;
5479}
5480
Tim Petersced69f82003-09-16 20:30:58 +00005481static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482int fixcapitalize(PyUnicodeObject *self)
5483{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005484 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005485 Py_UNICODE *s = self->str;
5486 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005487
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005488 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005489 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005490 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005491 *s = Py_UNICODE_TOUPPER(*s);
5492 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005494 s++;
5495 while (--len > 0) {
5496 if (Py_UNICODE_ISUPPER(*s)) {
5497 *s = Py_UNICODE_TOLOWER(*s);
5498 status = 1;
5499 }
5500 s++;
5501 }
5502 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503}
5504
5505static
5506int fixtitle(PyUnicodeObject *self)
5507{
5508 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5509 register Py_UNICODE *e;
5510 int previous_is_cased;
5511
5512 /* Shortcut for single character strings */
5513 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005514 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5515 if (*p != ch) {
5516 *p = ch;
5517 return 1;
5518 }
5519 else
5520 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 }
Tim Petersced69f82003-09-16 20:30:58 +00005522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 e = p + PyUnicode_GET_SIZE(self);
5524 previous_is_cased = 0;
5525 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005526 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005527
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005528 if (previous_is_cased)
5529 *p = Py_UNICODE_TOLOWER(ch);
5530 else
5531 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005532
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005533 if (Py_UNICODE_ISLOWER(ch) ||
5534 Py_UNICODE_ISUPPER(ch) ||
5535 Py_UNICODE_ISTITLE(ch))
5536 previous_is_cased = 1;
5537 else
5538 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
5540 return 1;
5541}
5542
Tim Peters8ce9f162004-08-27 01:49:32 +00005543PyObject *
5544PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005547 const Py_UNICODE blank = ' ';
5548 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005549 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005551 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5552 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005553 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5554 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005555 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005557 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
Tim Peters05eba1f2004-08-27 21:32:02 +00005559 fseq = PySequence_Fast(seq, "");
5560 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005561 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005562 }
5563
Tim Peters91879ab2004-08-27 22:35:44 +00005564 /* Grrrr. A codec may be invoked to convert str objects to
5565 * Unicode, and so it's possible to call back into Python code
5566 * during PyUnicode_FromObject(), and so it's possible for a sick
5567 * codec to change the size of fseq (if seq is a list). Therefore
5568 * we have to keep refetching the size -- can't assume seqlen
5569 * is invariant.
5570 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005571 seqlen = PySequence_Fast_GET_SIZE(fseq);
5572 /* If empty sequence, return u"". */
5573 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005574 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5575 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 }
5577 /* If singleton sequence with an exact Unicode, return that. */
5578 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005579 item = PySequence_Fast_GET_ITEM(fseq, 0);
5580 if (PyUnicode_CheckExact(item)) {
5581 Py_INCREF(item);
5582 res = (PyUnicodeObject *)item;
5583 goto Done;
5584 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005585 }
5586
Tim Peters05eba1f2004-08-27 21:32:02 +00005587 /* At least two items to join, or one that isn't exact Unicode. */
5588 if (seqlen > 1) {
5589 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005590 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005591 sep = &blank;
5592 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005593 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005594 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005595 internal_separator = PyUnicode_FromObject(separator);
5596 if (internal_separator == NULL)
5597 goto onError;
5598 sep = PyUnicode_AS_UNICODE(internal_separator);
5599 seplen = PyUnicode_GET_SIZE(internal_separator);
5600 /* In case PyUnicode_FromObject() mutated seq. */
5601 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005602 }
5603 }
5604
5605 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005606 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005607 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005608 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 res_p = PyUnicode_AS_UNICODE(res);
5610 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005611
Tim Peters05eba1f2004-08-27 21:32:02 +00005612 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 Py_ssize_t itemlen;
5614 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005615
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005616 item = PySequence_Fast_GET_ITEM(fseq, i);
5617 /* Convert item to Unicode. */
5618 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5619 PyErr_Format(PyExc_TypeError,
5620 "sequence item %zd: expected string or Unicode,"
5621 " %.80s found",
5622 i, Py_TYPE(item)->tp_name);
5623 goto onError;
5624 }
5625 item = PyUnicode_FromObject(item);
5626 if (item == NULL)
5627 goto onError;
5628 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005629
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005630 /* In case PyUnicode_FromObject() mutated seq. */
5631 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005632
Tim Peters8ce9f162004-08-27 01:49:32 +00005633 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 itemlen = PyUnicode_GET_SIZE(item);
5635 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005636 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005637 goto Overflow;
5638 if (i < seqlen - 1) {
5639 new_res_used += seplen;
5640 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005641 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005642 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005643 if (new_res_used > res_alloc) {
5644 /* double allocated size until it's big enough */
5645 do {
5646 res_alloc += res_alloc;
5647 if (res_alloc <= 0)
5648 goto Overflow;
5649 } while (new_res_used > res_alloc);
5650 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5651 Py_DECREF(item);
5652 goto onError;
5653 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005654 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005655 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005656
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005657 /* Copy item, and maybe the separator. */
5658 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5659 res_p += itemlen;
5660 if (i < seqlen - 1) {
5661 Py_UNICODE_COPY(res_p, sep, seplen);
5662 res_p += seplen;
5663 }
5664 Py_DECREF(item);
5665 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005667
Tim Peters05eba1f2004-08-27 21:32:02 +00005668 /* Shrink res to match the used area; this probably can't fail,
5669 * but it's cheap to check.
5670 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005671 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005672 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005673
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005674 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005675 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 return (PyObject *)res;
5678
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005681 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005682 Py_DECREF(item);
5683 /* fall through */
5684
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005685 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005687 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005688 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 return NULL;
5690}
5691
Tim Petersced69f82003-09-16 20:30:58 +00005692static
5693PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005694 Py_ssize_t left,
5695 Py_ssize_t right,
5696 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697{
5698 PyUnicodeObject *u;
5699
5700 if (left < 0)
5701 left = 0;
5702 if (right < 0)
5703 right = 0;
5704
Tim Peters7a29bd52001-09-12 03:03:31 +00005705 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 Py_INCREF(self);
5707 return self;
5708 }
5709
Neal Norwitze7d8be82008-07-31 17:17:14 +00005710 if (left > PY_SSIZE_T_MAX - self->length ||
5711 right > PY_SSIZE_T_MAX - (left + self->length)) {
5712 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5713 return NULL;
5714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 u = _PyUnicode_New(left + self->length + right);
5716 if (u) {
5717 if (left)
5718 Py_UNICODE_FILL(u->str, fill, left);
5719 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5720 if (right)
5721 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5722 }
5723
5724 return u;
5725}
5726
Antoine Pitrou64672132010-01-13 07:55:48 +00005727PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
5731 string = PyUnicode_FromObject(string);
5732 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
Antoine Pitrou64672132010-01-13 07:55:48 +00005735 list = stringlib_splitlines(
5736 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5737 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
5739 Py_DECREF(string);
5740 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741}
5742
Tim Petersced69f82003-09-16 20:30:58 +00005743static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005745 PyUnicodeObject *substring,
5746 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005749 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005752 return stringlib_split_whitespace(
5753 (PyObject*) self, self->str, self->length, maxcount
5754 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Antoine Pitrou64672132010-01-13 07:55:48 +00005756 return stringlib_split(
5757 (PyObject*) self, self->str, self->length,
5758 substring->str, substring->length,
5759 maxcount
5760 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761}
5762
Tim Petersced69f82003-09-16 20:30:58 +00005763static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005764PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005765 PyUnicodeObject *substring,
5766 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005769 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005770
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005772 return stringlib_rsplit_whitespace(
5773 (PyObject*) self, self->str, self->length, maxcount
5774 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775
Antoine Pitrou64672132010-01-13 07:55:48 +00005776 return stringlib_rsplit(
5777 (PyObject*) self, self->str, self->length,
5778 substring->str, substring->length,
5779 maxcount
5780 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005781}
5782
5783static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005785 PyUnicodeObject *str1,
5786 PyUnicodeObject *str2,
5787 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788{
5789 PyUnicodeObject *u;
5790
5791 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005792 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005793 else if (maxcount == 0 || self->length == 0)
5794 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Fredrik Lundh347ee272006-05-24 16:35:18 +00005796 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005797 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005798 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005799 if (str1->length == 0)
5800 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005801 if (str1->length == 1) {
5802 /* replace characters */
5803 Py_UNICODE u1, u2;
5804 if (!findchar(self->str, self->length, str1->str[0]))
5805 goto nothing;
5806 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5807 if (!u)
5808 return NULL;
5809 Py_UNICODE_COPY(u->str, self->str, self->length);
5810 u1 = str1->str[0];
5811 u2 = str2->str[0];
5812 for (i = 0; i < u->length; i++)
5813 if (u->str[i] == u1) {
5814 if (--maxcount < 0)
5815 break;
5816 u->str[i] = u2;
5817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005819 i = stringlib_find(
5820 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005822 if (i < 0)
5823 goto nothing;
5824 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5825 if (!u)
5826 return NULL;
5827 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005828
5829 /* change everything in-place, starting with this one */
5830 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5831 i += str1->length;
5832
5833 while ( --maxcount > 0) {
5834 i = stringlib_find(self->str+i, self->length-i,
5835 str1->str, str1->length,
5836 i);
5837 if (i == -1)
5838 break;
5839 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5840 i += str1->length;
5841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005844
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005845 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005846 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 Py_UNICODE *p;
5848
5849 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005850 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5851 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005852 if (n == 0)
5853 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005854 /* new_size = self->length + n * (str2->length - str1->length)); */
5855 delta = (str2->length - str1->length);
5856 if (delta == 0) {
5857 new_size = self->length;
5858 } else {
5859 product = n * (str2->length - str1->length);
5860 if ((product / (str2->length - str1->length)) != n) {
5861 PyErr_SetString(PyExc_OverflowError,
5862 "replace string is too long");
5863 return NULL;
5864 }
5865 new_size = self->length + product;
5866 if (new_size < 0) {
5867 PyErr_SetString(PyExc_OverflowError,
5868 "replace string is too long");
5869 return NULL;
5870 }
5871 }
5872 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005873 if (!u)
5874 return NULL;
5875 i = 0;
5876 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005877 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005878 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005879 while (n-- > 0) {
5880 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005881 j = stringlib_find(self->str+i, self->length-i,
5882 str1->str, str1->length,
5883 i);
5884 if (j == -1)
5885 break;
5886 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005887 /* copy unchanged part [i:j] */
5888 Py_UNICODE_COPY(p, self->str+i, j-i);
5889 p += j - i;
5890 }
5891 /* copy substitution string */
5892 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005893 Py_UNICODE_COPY(p, str2->str, str2->length);
5894 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005895 }
5896 i = j + str1->length;
5897 }
5898 if (i < self->length)
5899 /* copy tail [i:] */
5900 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005901 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005902 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005903 while (n > 0) {
5904 Py_UNICODE_COPY(p, str2->str, str2->length);
5905 p += str2->length;
5906 if (--n <= 0)
5907 break;
5908 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005910 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 }
5912 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005914
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005915 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005916 /* nothing to replace; return original string (when possible) */
5917 if (PyUnicode_CheckExact(self)) {
5918 Py_INCREF(self);
5919 return (PyObject *) self;
5920 }
5921 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922}
5923
5924/* --- Unicode Object Methods --------------------------------------------- */
5925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005926PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005927 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928\n\
5929Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
5932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005933unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 return fixup(self, fixtitle);
5936}
5937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005938PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005939 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940\n\
5941Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
5944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005945unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 return fixup(self, fixcapitalize);
5948}
5949
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of words,
   replace each list item by its fixcapitalize'd version and join the list
   again.  Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Break the string up into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized counterpart */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;      /* joined is still NULL here */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Glue the words back together */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5987
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005988/* Argument converter. Coerces to a single unicode character */
5989
5990static int
5991convert_uc(PyObject *obj, void *addr)
5992{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005993 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5994 PyObject *uniobj;
5995 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005996
Benjamin Peterson857ce152009-01-31 16:29:18 +00005997 uniobj = PyUnicode_FromObject(obj);
5998 if (uniobj == NULL) {
5999 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006000 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006001 return 0;
6002 }
6003 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6004 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006005 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006006 Py_DECREF(uniobj);
6007 return 0;
6008 }
6009 unistr = PyUnicode_AS_UNICODE(uniobj);
6010 *fillcharloc = unistr[0];
6011 Py_DECREF(uniobj);
6012 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006013}
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006016 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006018Return S centered in a Unicode string of length width. Padding is\n\
6019done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject *
6022unicode_center(PyUnicodeObject *self, PyObject *args)
6023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 Py_ssize_t marg, left;
6025 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006026 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
Thomas Woutersde017742006-02-16 19:34:37 +00006028 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return NULL;
6030
Tim Peters7a29bd52001-09-12 03:03:31 +00006031 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 Py_INCREF(self);
6033 return (PyObject*) self;
6034 }
6035
6036 marg = width - self->length;
6037 left = marg / 2 + (marg & width & 1);
6038
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006039 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040}
6041
Marc-André Lemburge5034372000-08-08 08:04:29 +00006042#if 0
6043
6044/* This code should go into some future Unicode collation support
6045 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006046 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006047
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006048/* speedy UTF-16 code point order comparison */
6049/* gleaned from: */
6050/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6051
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006052static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006053{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006054 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006055 0, 0, 0, 0, 0, 0, 0, 0,
6056 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006057 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006058};
6059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060static int
6061unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6062{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006063 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006064
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 Py_UNICODE *s1 = str1->str;
6066 Py_UNICODE *s2 = str2->str;
6067
6068 len1 = str1->length;
6069 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006072 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006073
6074 c1 = *s1++;
6075 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006076
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006077 if (c1 > (1<<11) * 26)
6078 c1 += utf16Fixup[c1>>11];
6079 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006080 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006081 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006082
6083 if (c1 != c2)
6084 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006085
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006086 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 }
6088
6089 return (len1 < len2) ? -1 : (len1 != len2);
6090}
6091
Marc-André Lemburge5034372000-08-08 08:04:29 +00006092#else
6093
6094static int
6095unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6096{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006097 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006098
6099 Py_UNICODE *s1 = str1->str;
6100 Py_UNICODE *s2 = str2->str;
6101
6102 len1 = str1->length;
6103 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006104
Marc-André Lemburge5034372000-08-08 08:04:29 +00006105 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006106 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006107
Fredrik Lundh45714e92001-06-26 16:39:36 +00006108 c1 = *s1++;
6109 c2 = *s2++;
6110
6111 if (c1 != c2)
6112 return (c1 < c2) ? -1 : 1;
6113
Marc-André Lemburge5034372000-08-08 08:04:29 +00006114 len1--; len2--;
6115 }
6116
6117 return (len1 < len2) ? -1 : (len1 != len2);
6118}
6119
6120#endif
6121
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006123 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124{
6125 PyUnicodeObject *u = NULL, *v = NULL;
6126 int result;
6127
6128 /* Coerce the two arguments */
6129 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6130 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6133 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006134 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Thomas Wouters7e474022000-07-16 12:04:32 +00006136 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006138 Py_DECREF(u);
6139 Py_DECREF(v);
6140 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
6142
6143 result = unicode_compare(u, v);
6144
6145 Py_DECREF(u);
6146 Py_DECREF(v);
6147 return result;
6148
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006149 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 Py_XDECREF(u);
6151 Py_XDECREF(v);
6152 return -1;
6153}
6154
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006155PyObject *PyUnicode_RichCompare(PyObject *left,
6156 PyObject *right,
6157 int op)
6158{
6159 int result;
6160
6161 result = PyUnicode_Compare(left, right);
6162 if (result == -1 && PyErr_Occurred())
6163 goto onError;
6164
6165 /* Convert the return value to a Boolean */
6166 switch (op) {
6167 case Py_EQ:
6168 result = (result == 0);
6169 break;
6170 case Py_NE:
6171 result = (result != 0);
6172 break;
6173 case Py_LE:
6174 result = (result <= 0);
6175 break;
6176 case Py_GE:
6177 result = (result >= 0);
6178 break;
6179 case Py_LT:
6180 result = (result == -1);
6181 break;
6182 case Py_GT:
6183 result = (result == 1);
6184 break;
6185 }
6186 return PyBool_FromLong(result);
6187
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006188 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006189
6190 /* Standard case
6191
6192 Type errors mean that PyUnicode_FromObject() could not convert
6193 one of the arguments (usually the right hand side) to Unicode,
6194 ie. we can't handle the comparison request. However, it is
6195 possible that the other object knows a comparison method, which
6196 is why we return Py_NotImplemented to give the other object a
6197 chance.
6198
6199 */
6200 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6201 PyErr_Clear();
6202 Py_INCREF(Py_NotImplemented);
6203 return Py_NotImplemented;
6204 }
6205 if (op != Py_EQ && op != Py_NE)
6206 return NULL;
6207
6208 /* Equality comparison.
6209
6210 This is a special case: we silence any PyExc_UnicodeDecodeError
6211 and instead turn it into a PyErr_UnicodeWarning.
6212
6213 */
6214 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6215 return NULL;
6216 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006217 if (PyErr_Warn(PyExc_UnicodeWarning,
6218 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006219 "Unicode equal comparison "
6220 "failed to convert both arguments to Unicode - "
6221 "interpreting them as being unequal" :
6222 "Unicode unequal comparison "
6223 "failed to convert both arguments to Unicode - "
6224 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006225 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006226 return NULL;
6227 result = (op == Py_NE);
6228 return PyBool_FromLong(result);
6229}
6230
Guido van Rossum403d68b2000-03-13 15:55:09 +00006231int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006232 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006233{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006234 PyObject *str, *sub;
6235 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006236
6237 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006238 sub = PyUnicode_FromObject(element);
6239 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006240 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006241 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006242
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006243 str = PyUnicode_FromObject(container);
6244 if (!str) {
6245 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006246 return -1;
6247 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006248
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006249 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006250
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006251 Py_DECREF(str);
6252 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006253
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006254 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006255}
6256
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257/* Concat to string or Unicode object giving a new Unicode object. */
6258
6259PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006260 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261{
6262 PyUnicodeObject *u = NULL, *v = NULL, *w;
6263
6264 /* Coerce the two arguments */
6265 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6266 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6269 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272 /* Shortcuts */
6273 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006274 Py_DECREF(v);
6275 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 }
6277 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006278 Py_DECREF(u);
6279 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 }
6281
6282 /* Concat the two Unicode strings */
6283 w = _PyUnicode_New(u->length + v->length);
6284 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006285 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 Py_UNICODE_COPY(w->str, u->str, u->length);
6287 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6288
6289 Py_DECREF(u);
6290 Py_DECREF(v);
6291 return (PyObject *)w;
6292
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006293 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_XDECREF(u);
6295 Py_XDECREF(v);
6296 return NULL;
6297}
6298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006299PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006300 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006302Return the number of non-overlapping occurrences of substring sub in\n\
6303Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006304interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305
6306static PyObject *
6307unicode_count(PyUnicodeObject *self, PyObject *args)
6308{
6309 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006310 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006311 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 PyObject *result;
6313
Guido van Rossumb8872e62000-05-09 14:14:27 +00006314 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006315 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 return NULL;
6317
6318 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006319 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006321 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006322
Antoine Pitrou64672132010-01-13 07:55:48 +00006323 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006324 result = PyInt_FromSsize_t(
6325 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006326 substring->str, substring->length,
6327 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006328 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329
6330 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006331
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 return result;
6333}
6334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006335PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006336 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006338Encodes S using the codec registered for encoding. encoding defaults\n\
6339to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006340handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6342'xmlcharrefreplace' as well as any other name registered with\n\
6343codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344
6345static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006346unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006348 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 char *encoding = NULL;
6350 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006351 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006352
Benjamin Peterson332d7212009-09-18 21:14:55 +00006353 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6354 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006356 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006357 if (v == NULL)
6358 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006359 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006360 PyErr_Format(PyExc_TypeError,
6361 "encoder did not return a string/unicode object "
6362 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006363 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006364 Py_DECREF(v);
6365 return NULL;
6366 }
6367 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006368
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006369 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006370 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006371}
6372
6373PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006374 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006375\n\
6376Decodes S using the codec registered for encoding. encoding defaults\n\
6377to the default encoding. errors may be given to set a different error\n\
6378handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6379a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6380as well as any other name registerd with codecs.register_error that is\n\
6381able to handle UnicodeDecodeErrors.");
6382
6383static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006384unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006385{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006386 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006387 char *encoding = NULL;
6388 char *errors = NULL;
6389 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006390
Benjamin Peterson332d7212009-09-18 21:14:55 +00006391 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6392 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006393 return NULL;
6394 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006395 if (v == NULL)
6396 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006397 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006398 PyErr_Format(PyExc_TypeError,
6399 "decoder did not return a string/unicode object "
6400 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006401 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006402 Py_DECREF(v);
6403 return NULL;
6404 }
6405 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006406
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006407 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409}
6410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006411PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006412 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413\n\
6414Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006415If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
6417static PyObject*
6418unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6419{
6420 Py_UNICODE *e;
6421 Py_UNICODE *p;
6422 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006423 Py_UNICODE *qe;
6424 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 PyUnicodeObject *u;
6426 int tabsize = 8;
6427
6428 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430
Thomas Wouters7e474022000-07-16 12:04:32 +00006431 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006432 i = 0; /* chars up to and including most recent \n or \r */
6433 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6434 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 for (p = self->str; p < e; p++)
6436 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006437 if (tabsize > 0) {
6438 incr = tabsize - (j % tabsize); /* cannot overflow */
6439 if (j > PY_SSIZE_T_MAX - incr)
6440 goto overflow1;
6441 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006442 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006445 if (j > PY_SSIZE_T_MAX - 1)
6446 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 j++;
6448 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006449 if (i > PY_SSIZE_T_MAX - j)
6450 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006452 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 }
6454 }
6455
Guido van Rossum5bdff602008-03-11 21:18:06 +00006456 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006457 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 /* Second pass: create output string and fill it */
6460 u = _PyUnicode_New(i + j);
6461 if (!u)
6462 return NULL;
6463
Guido van Rossum5bdff602008-03-11 21:18:06 +00006464 j = 0; /* same as in first pass */
6465 q = u->str; /* next output char */
6466 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
6468 for (p = self->str; p < e; p++)
6469 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006470 if (tabsize > 0) {
6471 i = tabsize - (j % tabsize);
6472 j += i;
6473 while (i--) {
6474 if (q >= qe)
6475 goto overflow2;
6476 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006477 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006478 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006479 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006480 else {
6481 if (q >= qe)
6482 goto overflow2;
6483 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006484 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 if (*p == '\n' || *p == '\r')
6486 j = 0;
6487 }
6488
6489 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006490
6491 overflow2:
6492 Py_DECREF(u);
6493 overflow1:
6494 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006499 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500\n\
6501Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006502such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503arguments start and end are interpreted as in slice notation.\n\
6504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507static PyObject *
6508unicode_find(PyUnicodeObject *self, PyObject *args)
6509{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006510 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006511 Py_ssize_t start;
6512 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006513 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514
Facundo Batista57d56692007-11-16 18:04:14 +00006515 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006518 result = stringlib_find_slice(
6519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6521 start, end
6522 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006525
6526 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527}
6528
6529static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006530unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
6532 if (index < 0 || index >= self->length) {
6533 PyErr_SetString(PyExc_IndexError, "string index out of range");
6534 return NULL;
6535 }
6536
6537 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6538}
6539
6540static long
6541unicode_hash(PyUnicodeObject *self)
6542{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006543 /* Since Unicode objects compare equal to their ASCII string
6544 counterparts, they should use the individual character values
6545 as basis for their hash value. This is needed to assure that
6546 strings and Unicode objects behave in the same way as
6547 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548
Martin v. Löwis18e16552006-02-15 17:27:45 +00006549 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006550 register Py_UNICODE *p;
6551 register long x;
6552
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006554 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006555 len = PyUnicode_GET_SIZE(self);
6556 p = PyUnicode_AS_UNICODE(self);
6557 x = *p << 7;
6558 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006559 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006560 x ^= PyUnicode_GET_SIZE(self);
6561 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006562 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006563 self->hash = x;
6564 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565}
6566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006568 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006570Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
6572static PyObject *
6573unicode_index(PyUnicodeObject *self, PyObject *args)
6574{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006575 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006576 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006577 Py_ssize_t start;
6578 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Facundo Batista57d56692007-11-16 18:04:14 +00006580 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006583 result = stringlib_find_slice(
6584 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6585 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6586 start, end
6587 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006590
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 if (result < 0) {
6592 PyErr_SetString(PyExc_ValueError, "substring not found");
6593 return NULL;
6594 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006595
Martin v. Löwis18e16552006-02-15 17:27:45 +00006596 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597}
6598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006599PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006600 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006602Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006603at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
6605static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006606unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607{
6608 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6609 register const Py_UNICODE *e;
6610 int cased;
6611
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 /* Shortcut for single character strings */
6613 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006614 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006616 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006617 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006618 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 e = p + PyUnicode_GET_SIZE(self);
6621 cased = 0;
6622 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006623 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006624
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006625 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6626 return PyBool_FromLong(0);
6627 else if (!cased && Py_UNICODE_ISLOWER(ch))
6628 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006630 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006633PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006634 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006636Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
6639static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006640unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
6642 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6643 register const Py_UNICODE *e;
6644 int cased;
6645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 /* Shortcut for single character strings */
6647 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006648 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006650 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006651 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006652 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006653
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 e = p + PyUnicode_GET_SIZE(self);
6655 cased = 0;
6656 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006657 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006658
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006659 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6660 return PyBool_FromLong(0);
6661 else if (!cased && Py_UNICODE_ISUPPER(ch))
6662 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006664 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006668 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006670Return True if S is a titlecased string and there is at least one\n\
6671character in S, i.e. upper- and titlecase characters may only\n\
6672follow uncased characters and lowercase characters only cased ones.\n\
6673Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006676unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677{
6678 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6679 register const Py_UNICODE *e;
6680 int cased, previous_is_cased;
6681
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 /* Shortcut for single character strings */
6683 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006684 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6685 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006687 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006688 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006689 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 e = p + PyUnicode_GET_SIZE(self);
6692 cased = 0;
6693 previous_is_cased = 0;
6694 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006695 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006696
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006697 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6698 if (previous_is_cased)
6699 return PyBool_FromLong(0);
6700 previous_is_cased = 1;
6701 cased = 1;
6702 }
6703 else if (Py_UNICODE_ISLOWER(ch)) {
6704 if (!previous_is_cased)
6705 return PyBool_FromLong(0);
6706 previous_is_cased = 1;
6707 cased = 1;
6708 }
6709 else
6710 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713}
6714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006715PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006716 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006718Return True if all characters in S are whitespace\n\
6719and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
6721static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006722unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723{
6724 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6725 register const Py_UNICODE *e;
6726
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 /* Shortcut for single character strings */
6728 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006729 Py_UNICODE_ISSPACE(*p))
6730 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006732 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006733 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006734 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006735
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 e = p + PyUnicode_GET_SIZE(self);
6737 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006738 if (!Py_UNICODE_ISSPACE(*p))
6739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006741 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742}
6743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006744PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006745 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006747Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006748and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006749
6750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006751unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006752{
6753 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6754 register const Py_UNICODE *e;
6755
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006756 /* Shortcut for single character strings */
6757 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006758 Py_UNICODE_ISALPHA(*p))
6759 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006760
6761 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006762 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006763 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006764
6765 e = p + PyUnicode_GET_SIZE(self);
6766 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006767 if (!Py_UNICODE_ISALPHA(*p))
6768 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006769 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006770 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006771}
6772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006773PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006774 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006775\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006776Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778
6779static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006780unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006781{
6782 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6783 register const Py_UNICODE *e;
6784
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785 /* Shortcut for single character strings */
6786 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006787 Py_UNICODE_ISALNUM(*p))
6788 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789
6790 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006791 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006792 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006793
6794 e = p + PyUnicode_GET_SIZE(self);
6795 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006796 if (!Py_UNICODE_ISALNUM(*p))
6797 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006800}
6801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006802PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006803 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006805Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807
6808static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006809unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810{
6811 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6812 register const Py_UNICODE *e;
6813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 /* Shortcut for single character strings */
6815 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006816 Py_UNICODE_ISDECIMAL(*p))
6817 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006819 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006820 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006821 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006822
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 e = p + PyUnicode_GET_SIZE(self);
6824 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006825 if (!Py_UNICODE_ISDECIMAL(*p))
6826 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006828 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829}
6830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006832 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006834Return True if all characters in S are digits\n\
6835and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
6837static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006838unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
6840 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6841 register const Py_UNICODE *e;
6842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 /* Shortcut for single character strings */
6844 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006845 Py_UNICODE_ISDIGIT(*p))
6846 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006849 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006850 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 e = p + PyUnicode_GET_SIZE(self);
6853 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006854 if (!Py_UNICODE_ISDIGIT(*p))
6855 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858}
6859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006860PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006861 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006864False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865
6866static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006867unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
6869 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6870 register const Py_UNICODE *e;
6871
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 /* Shortcut for single character strings */
6873 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006874 Py_UNICODE_ISNUMERIC(*p))
6875 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006878 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006879 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 e = p + PyUnicode_GET_SIZE(self);
6882 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006883 if (!Py_UNICODE_ISNUMERIC(*p))
6884 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887}
6888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006889PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006890 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891\n\
6892Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006893iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894
6895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006896unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006898 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899}
6900
Martin v. Löwis18e16552006-02-15 17:27:45 +00006901static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902unicode_length(PyUnicodeObject *self)
6903{
6904 return self->length;
6905}
6906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006907PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006908 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006910Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006911done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912
6913static PyObject *
6914unicode_ljust(PyUnicodeObject *self, PyObject *args)
6915{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006916 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006917 Py_UNICODE fillchar = ' ';
6918
Martin v. Löwis412fb672006-04-13 06:34:32 +00006919 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 return NULL;
6921
Tim Peters7a29bd52001-09-12 03:03:31 +00006922 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 Py_INCREF(self);
6924 return (PyObject*) self;
6925 }
6926
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006927 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006930PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006931 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006933Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006936unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 return fixup(self, fixlower);
6939}
6940
/* Which end(s) of the string a strip operation works on.  The values are
   used as indexes into stripformat[] below, so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip type */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for strip type i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6949
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006950/* externally visible for str.strip(unicode) */
6951PyObject *
6952_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6953{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006954 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6955 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6956 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6957 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6958 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006959
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006960 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006961
Benjamin Peterson857ce152009-01-31 16:29:18 +00006962 i = 0;
6963 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006964 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6965 i++;
6966 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006967 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006968
Benjamin Peterson857ce152009-01-31 16:29:18 +00006969 j = len;
6970 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006971 do {
6972 j--;
6973 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6974 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006975 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006976
Benjamin Peterson857ce152009-01-31 16:29:18 +00006977 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006978 Py_INCREF(self);
6979 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006980 }
6981 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006982 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006983}
6984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
6986static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006987do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006989 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6990 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006991
Benjamin Peterson857ce152009-01-31 16:29:18 +00006992 i = 0;
6993 if (striptype != RIGHTSTRIP) {
6994 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6995 i++;
6996 }
6997 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006998
Benjamin Peterson857ce152009-01-31 16:29:18 +00006999 j = len;
7000 if (striptype != LEFTSTRIP) {
7001 do {
7002 j--;
7003 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7004 j++;
7005 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007006
Benjamin Peterson857ce152009-01-31 16:29:18 +00007007 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7008 Py_INCREF(self);
7009 return (PyObject*)self;
7010 }
7011 else
7012 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013}
7014
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007015
7016static PyObject *
7017do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7018{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007019 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020
Benjamin Peterson857ce152009-01-31 16:29:18 +00007021 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7022 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023
Benjamin Peterson857ce152009-01-31 16:29:18 +00007024 if (sep != NULL && sep != Py_None) {
7025 if (PyUnicode_Check(sep))
7026 return _PyUnicode_XStrip(self, striptype, sep);
7027 else if (PyString_Check(sep)) {
7028 PyObject *res;
7029 sep = PyUnicode_FromObject(sep);
7030 if (sep==NULL)
7031 return NULL;
7032 res = _PyUnicode_XStrip(self, striptype, sep);
7033 Py_DECREF(sep);
7034 return res;
7035 }
7036 else {
7037 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007038 "%s arg must be None, unicode or str",
7039 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007040 return NULL;
7041 }
7042 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007043
Benjamin Peterson857ce152009-01-31 16:29:18 +00007044 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007045}
7046
7047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007048PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007049 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050\n\
7051Return a copy of the string S with leading and trailing\n\
7052whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007053If chars is given and not None, remove characters in chars instead.\n\
7054If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007055
7056static PyObject *
7057unicode_strip(PyUnicodeObject *self, PyObject *args)
7058{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007059 if (PyTuple_GET_SIZE(args) == 0)
7060 return do_strip(self, BOTHSTRIP); /* Common case */
7061 else
7062 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063}
7064
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007067 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068\n\
7069Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007070If chars is given and not None, remove characters in chars instead.\n\
7071If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007072
7073static PyObject *
7074unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7075{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007076 if (PyTuple_GET_SIZE(args) == 0)
7077 return do_strip(self, LEFTSTRIP); /* Common case */
7078 else
7079 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080}
7081
7082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007083PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007084 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085\n\
7086Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007087If chars is given and not None, remove characters in chars instead.\n\
7088If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089
7090static PyObject *
7091unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7092{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007093 if (PyTuple_GET_SIZE(args) == 0)
7094 return do_strip(self, RIGHTSTRIP); /* Common case */
7095 else
7096 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097}
7098
7099
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007101unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
7103 PyUnicodeObject *u;
7104 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007105 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007106 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107
7108 if (len < 0)
7109 len = 0;
7110
Tim Peters7a29bd52001-09-12 03:03:31 +00007111 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 /* no repeat, return original string */
7113 Py_INCREF(str);
7114 return (PyObject*) str;
7115 }
Tim Peters8f422462000-09-09 06:13:41 +00007116
7117 /* ensure # of chars needed doesn't overflow int and # of bytes
7118 * needed doesn't overflow size_t
7119 */
7120 nchars = len * str->length;
7121 if (len && nchars / len != str->length) {
7122 PyErr_SetString(PyExc_OverflowError,
7123 "repeated string is too long");
7124 return NULL;
7125 }
7126 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7127 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7128 PyErr_SetString(PyExc_OverflowError,
7129 "repeated string is too long");
7130 return NULL;
7131 }
7132 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 if (!u)
7134 return NULL;
7135
7136 p = u->str;
7137
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007138 if (str->length == 1 && len > 0) {
7139 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007140 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007141 Py_ssize_t done = 0; /* number of characters copied this far */
7142 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007143 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007144 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007145 }
7146 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007147 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007148 Py_UNICODE_COPY(p+done, p, n);
7149 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007150 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
7153 return (PyObject*) u;
7154}
7155
7156PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007157 PyObject *subobj,
7158 PyObject *replobj,
7159 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
7161 PyObject *self;
7162 PyObject *str1;
7163 PyObject *str2;
7164 PyObject *result;
7165
7166 self = PyUnicode_FromObject(obj);
7167 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 str1 = PyUnicode_FromObject(subobj);
7170 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 Py_DECREF(self);
7172 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 }
7174 str2 = PyUnicode_FromObject(replobj);
7175 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007176 Py_DECREF(self);
7177 Py_DECREF(str1);
7178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 }
Tim Petersced69f82003-09-16 20:30:58 +00007180 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007181 (PyUnicodeObject *)str1,
7182 (PyUnicodeObject *)str2,
7183 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 Py_DECREF(self);
7185 Py_DECREF(str1);
7186 Py_DECREF(str2);
7187 return result;
7188}
7189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007191 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192\n\
7193Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007194old replaced by new. If the optional argument count is\n\
7195given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197static PyObject*
7198unicode_replace(PyUnicodeObject *self, PyObject *args)
7199{
7200 PyUnicodeObject *str1;
7201 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007202 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 PyObject *result;
7204
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 return NULL;
7207 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7208 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007211 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007212 Py_DECREF(str1);
7213 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216 result = replace(self, str1, str2, maxcount);
7217
7218 Py_DECREF(str1);
7219 Py_DECREF(str2);
7220 return result;
7221}
7222
7223static
7224PyObject *unicode_repr(PyObject *unicode)
7225{
7226 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007227 PyUnicode_GET_SIZE(unicode),
7228 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229}
7230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007231PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007232 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233\n\
7234Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007235such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236arguments start and end are interpreted as in slice notation.\n\
7237\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239
7240static PyObject *
7241unicode_rfind(PyUnicodeObject *self, PyObject *args)
7242{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007243 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007244 Py_ssize_t start;
7245 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007246 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247
Facundo Batista57d56692007-11-16 18:04:14 +00007248 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007251 result = stringlib_rfind_slice(
7252 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7253 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7254 start, end
7255 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
7257 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007258
7259 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260}
7261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007262PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007263 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007265Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266
7267static PyObject *
7268unicode_rindex(PyUnicodeObject *self, PyObject *args)
7269{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007270 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007271 Py_ssize_t start;
7272 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007273 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
Facundo Batista57d56692007-11-16 18:04:14 +00007275 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007278 result = stringlib_rfind_slice(
7279 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7280 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7281 start, end
7282 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283
7284 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007285
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 if (result < 0) {
7287 PyErr_SetString(PyExc_ValueError, "substring not found");
7288 return NULL;
7289 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291}
7292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007293PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007294 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007296Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007297done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299static PyObject *
7300unicode_rjust(PyUnicodeObject *self, PyObject *args)
7301{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007302 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007303 Py_UNICODE fillchar = ' ';
7304
Martin v. Löwis412fb672006-04-13 06:34:32 +00007305 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return NULL;
7307
Tim Peters7a29bd52001-09-12 03:03:31 +00007308 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 Py_INCREF(self);
7310 return (PyObject*) self;
7311 }
7312
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007313 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314}
7315
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007317unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318{
7319 /* standard clamping */
7320 if (start < 0)
7321 start = 0;
7322 if (end < 0)
7323 end = 0;
7324 if (end > self->length)
7325 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007326 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 /* full slice, return original string */
7328 Py_INCREF(self);
7329 return (PyObject*) self;
7330 }
7331 if (start > end)
7332 start = end;
7333 /* copy slice */
7334 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007335 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336}
7337
7338PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007339 PyObject *sep,
7340 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341{
7342 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007343
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 s = PyUnicode_FromObject(s);
7345 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007346 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007347 if (sep != NULL) {
7348 sep = PyUnicode_FromObject(sep);
7349 if (sep == NULL) {
7350 Py_DECREF(s);
7351 return NULL;
7352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 }
7354
7355 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7356
7357 Py_DECREF(s);
7358 Py_XDECREF(sep);
7359 return result;
7360}
7361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007363 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364\n\
7365Return a list of the words in S, using sep as the\n\
7366delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007367splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007368whitespace string is a separator and empty strings are\n\
7369removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
7371static PyObject*
7372unicode_split(PyUnicodeObject *self, PyObject *args)
7373{
7374 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007375 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
Martin v. Löwis18e16552006-02-15 17:27:45 +00007377 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 return NULL;
7379
7380 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007381 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007383 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007385 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386}
7387
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007388PyObject *
7389PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7390{
7391 PyObject* str_obj;
7392 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007393 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007394
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007395 str_obj = PyUnicode_FromObject(str_in);
7396 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007397 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007398 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007399 if (!sep_obj) {
7400 Py_DECREF(str_obj);
7401 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007402 }
7403
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007404 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007405 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7406 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7407 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007408
Fredrik Lundhb9479482006-05-26 17:22:38 +00007409 Py_DECREF(sep_obj);
7410 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007411
7412 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007413}
7414
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007415
7416PyObject *
7417PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7418{
7419 PyObject* str_obj;
7420 PyObject* sep_obj;
7421 PyObject* out;
7422
7423 str_obj = PyUnicode_FromObject(str_in);
7424 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007425 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007426 sep_obj = PyUnicode_FromObject(sep_in);
7427 if (!sep_obj) {
7428 Py_DECREF(str_obj);
7429 return NULL;
7430 }
7431
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007432 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007433 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7434 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7435 );
7436
7437 Py_DECREF(sep_obj);
7438 Py_DECREF(str_obj);
7439
7440 return out;
7441}
7442
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007443PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007444 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007445\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007446Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007447the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007448found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007449
7450static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007451unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007452{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007453 return PyUnicode_Partition((PyObject *)self, separator);
7454}
7455
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007456PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007457 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007458\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007459Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007460the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007461separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007462
7463static PyObject*
7464unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7465{
7466 return PyUnicode_RPartition((PyObject *)self, separator);
7467}
7468
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007469PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007470 PyObject *sep,
7471 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007472{
7473 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007474
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007475 s = PyUnicode_FromObject(s);
7476 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007477 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007478 if (sep != NULL) {
7479 sep = PyUnicode_FromObject(sep);
7480 if (sep == NULL) {
7481 Py_DECREF(s);
7482 return NULL;
7483 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007484 }
7485
7486 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7487
7488 Py_DECREF(s);
7489 Py_XDECREF(sep);
7490 return result;
7491}
7492
7493PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007494 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007495\n\
7496Return a list of the words in S, using sep as the\n\
7497delimiter string, starting at the end of the string and\n\
7498working to the front. If maxsplit is given, at most maxsplit\n\
7499splits are done. If sep is not specified, any whitespace string\n\
7500is a separator.");
7501
7502static PyObject*
7503unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7504{
7505 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007506 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007507
Martin v. Löwis18e16552006-02-15 17:27:45 +00007508 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007509 return NULL;
7510
7511 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007512 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007513 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007514 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007515 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007516 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007517}
7518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007519PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007520 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521\n\
7522Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007523Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007524is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
7526static PyObject*
7527unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7528{
Guido van Rossum86662912000-04-11 15:38:46 +00007529 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530
Guido van Rossum86662912000-04-11 15:38:46 +00007531 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 return NULL;
7533
Guido van Rossum86662912000-04-11 15:38:46 +00007534 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535}
7536
7537static
7538PyObject *unicode_str(PyUnicodeObject *self)
7539{
Fred Drakee4315f52000-05-09 19:53:39 +00007540 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541}
7542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007543PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007544 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545\n\
7546Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007547and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548
7549static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007550unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 return fixup(self, fixswapcase);
7553}
7554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007556 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557\n\
7558Return a copy of the string S, where all characters have been mapped\n\
7559through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007560Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7561Unmapped characters are left untouched. Characters mapped to None\n\
7562are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
7564static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007565unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
Tim Petersced69f82003-09-16 20:30:58 +00007567 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007568 self->length,
7569 table,
7570 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571}
7572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007573PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007574 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007579unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 return fixup(self, fixupper);
7582}
7583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007584PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007585 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586\n\
Georg Brandl98064072008-09-09 19:26:00 +00007587Pad a numeric string S with zeros on the left, to fill a field\n\
7588of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
7590static PyObject *
7591unicode_zfill(PyUnicodeObject *self, PyObject *args)
7592{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 PyUnicodeObject *u;
7595
Martin v. Löwis18e16552006-02-15 17:27:45 +00007596 Py_ssize_t width;
7597 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 return NULL;
7599
7600 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007601 if (PyUnicode_CheckExact(self)) {
7602 Py_INCREF(self);
7603 return (PyObject*) self;
7604 }
7605 else
7606 return PyUnicode_FromUnicode(
7607 PyUnicode_AS_UNICODE(self),
7608 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007609 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 }
7611
7612 fill = width - self->length;
7613
7614 u = pad(self, fill, 0, '0');
7615
Walter Dörwald068325e2002-04-15 13:36:47 +00007616 if (u == NULL)
7617 return NULL;
7618
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619 if (u->str[fill] == '+' || u->str[fill] == '-') {
7620 /* move sign to beginning of string */
7621 u->str[0] = u->str[fill];
7622 u->str[fill] = '0';
7623 }
7624
7625 return (PyObject*) u;
7626}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
#if 0
/* Compiled out.  Would return the current value of numfree as a Python
   int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007637 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007639Return True if S starts with the specified prefix, False otherwise.\n\
7640With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007641With optional end, stop comparing S at that position.\n\
7642prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
7644static PyObject *
7645unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007646 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647{
Georg Brandl24250812006-06-09 18:45:48 +00007648 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007650 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007651 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007652 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
Georg Brandl24250812006-06-09 18:45:48 +00007654 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007655 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7656 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007657 if (PyTuple_Check(subobj)) {
7658 Py_ssize_t i;
7659 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7660 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007661 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007662 if (substring == NULL)
7663 return NULL;
7664 result = tailmatch(self, substring, start, end, -1);
7665 Py_DECREF(substring);
7666 if (result) {
7667 Py_RETURN_TRUE;
7668 }
7669 }
7670 /* nothing matched */
7671 Py_RETURN_FALSE;
7672 }
7673 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007675 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007676 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007678 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679}
7680
7681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007683 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007685Return True if S ends with the specified suffix, False otherwise.\n\
7686With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007687With optional end, stop comparing S at that position.\n\
7688suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
7690static PyObject *
7691unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007692 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693{
Georg Brandl24250812006-06-09 18:45:48 +00007694 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007696 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007697 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007698 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699
Georg Brandl24250812006-06-09 18:45:48 +00007700 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007701 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7702 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007703 if (PyTuple_Check(subobj)) {
7704 Py_ssize_t i;
7705 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7706 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007707 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007708 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007709 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007710 result = tailmatch(self, substring, start, end, +1);
7711 Py_DECREF(substring);
7712 if (result) {
7713 Py_RETURN_TRUE;
7714 }
7715 }
7716 Py_RETURN_FALSE;
7717 }
7718 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
Georg Brandl24250812006-06-09 18:45:48 +00007722 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007724 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725}
7726
7727
Eric Smitha9f7d622008-02-17 19:46:49 +00007728/* Implements do_string_format, which is unicode because of stringlib */
7729#include "stringlib/string_format.h"
7730
7731PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007732 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007733\n\
7734");
7735
Eric Smithdc13b792008-05-30 18:10:04 +00007736static PyObject *
7737unicode__format__(PyObject *self, PyObject *args)
7738{
7739 PyObject *format_spec;
7740 PyObject *result = NULL;
7741 PyObject *tmp = NULL;
7742
7743 /* If 2.x, convert format_spec to the same type as value */
7744 /* This is to allow things like u''.format('') */
7745 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7746 goto done;
7747 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7748 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007749 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007750 goto done;
7751 }
7752 tmp = PyObject_Unicode(format_spec);
7753 if (tmp == NULL)
7754 goto done;
7755 format_spec = tmp;
7756
7757 result = _PyUnicode_FormatAdvanced(self,
7758 PyUnicode_AS_UNICODE(format_spec),
7759 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007760 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007761 Py_XDECREF(tmp);
7762 return result;
7763}
7764
Eric Smitha9f7d622008-02-17 19:46:49 +00007765PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007766 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007767\n\
7768");
7769
Robert Schuppenies901c9972008-06-10 10:10:31 +00007770static PyObject *
7771unicode__sizeof__(PyUnicodeObject *v)
7772{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007773 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7774 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007775}
7776
7777PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007778 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007779\n\
7780");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007781
7782static PyObject *
7783unicode_getnewargs(PyUnicodeObject *v)
7784{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007785 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007786}
7787
7788
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789static PyMethodDef unicode_methods[] = {
7790
7791 /* Order is according to common usage: often used methods should
7792 appear first, since lookup is done sequentially. */
7793
Benjamin Peterson332d7212009-09-18 21:14:55 +00007794 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007795 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7796 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007797 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007798 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7799 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7800 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7801 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7802 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7803 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7804 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007805 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007806 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7807 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7808 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007809 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007810 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007811/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7812 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7813 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7814 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007815 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007816 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007817 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007818 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007819 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7820 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7821 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7822 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7823 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7824 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7825 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7826 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7827 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7828 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7829 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7830 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7831 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7832 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007833 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007834 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7835 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7836 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7837 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007838 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007839#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007840 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841#endif
7842
7843#if 0
7844 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007845 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846#endif
7847
Benjamin Peterson857ce152009-01-31 16:29:18 +00007848 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 {NULL, NULL}
7850};
7851
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007852static PyObject *
7853unicode_mod(PyObject *v, PyObject *w)
7854{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007855 if (!PyUnicode_Check(v)) {
7856 Py_INCREF(Py_NotImplemented);
7857 return Py_NotImplemented;
7858 }
7859 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007860}
7861
7862static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007863 0, /*nb_add*/
7864 0, /*nb_subtract*/
7865 0, /*nb_multiply*/
7866 0, /*nb_divide*/
7867 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007868};
7869
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007871 (lenfunc) unicode_length, /* sq_length */
7872 PyUnicode_Concat, /* sq_concat */
7873 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7874 (ssizeargfunc) unicode_getitem, /* sq_item */
7875 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7876 0, /* sq_ass_item */
7877 0, /* sq_ass_slice */
7878 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879};
7880
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007881static PyObject*
7882unicode_subscript(PyUnicodeObject* self, PyObject* item)
7883{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007884 if (PyIndex_Check(item)) {
7885 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007886 if (i == -1 && PyErr_Occurred())
7887 return NULL;
7888 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007889 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007890 return unicode_getitem(self, i);
7891 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007892 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007893 Py_UNICODE* source_buf;
7894 Py_UNICODE* result_buf;
7895 PyObject* result;
7896
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007897 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007898 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007899 return NULL;
7900 }
7901
7902 if (slicelength <= 0) {
7903 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007904 } else if (start == 0 && step == 1 && slicelength == self->length &&
7905 PyUnicode_CheckExact(self)) {
7906 Py_INCREF(self);
7907 return (PyObject *)self;
7908 } else if (step == 1) {
7909 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007910 } else {
7911 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007912 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7913 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007914
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007915 if (result_buf == NULL)
7916 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007917
7918 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7919 result_buf[i] = source_buf[cur];
7920 }
Tim Petersced69f82003-09-16 20:30:58 +00007921
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007922 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007923 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007924 return result;
7925 }
7926 } else {
7927 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7928 return NULL;
7929 }
7930}
7931
7932static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007933 (lenfunc)unicode_length, /* mp_length */
7934 (binaryfunc)unicode_subscript, /* mp_subscript */
7935 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007936};
7937
Martin v. Löwis18e16552006-02-15 17:27:45 +00007938static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007940 Py_ssize_t index,
7941 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942{
7943 if (index != 0) {
7944 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007945 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 return -1;
7947 }
7948 *ptr = (void *) self->str;
7949 return PyUnicode_GET_DATA_SIZE(self);
7950}
7951
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952static Py_ssize_t
7953unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007954 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955{
7956 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007957 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 return -1;
7959}
7960
7961static int
7962unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007963 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964{
7965 if (lenp)
7966 *lenp = PyUnicode_GET_DATA_SIZE(self);
7967 return 1;
7968}
7969
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007970static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007972 Py_ssize_t index,
7973 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974{
7975 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007976
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 if (index != 0) {
7978 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007979 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 return -1;
7981 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007982 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007984 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007985 *ptr = (void *) PyString_AS_STRING(str);
7986 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987}
7988
7989/* Helpers for PyUnicode_Format() */
7990
7991static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007992getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007994 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007996 (*p_argidx)++;
7997 if (arglen < 0)
7998 return args;
7999 else
8000 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 }
8002 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008003 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 return NULL;
8005}
8006
/* Bit flags collected while parsing the flag characters of a format
   specifier; each one corresponds to the character shown. */
#define F_LJUST (1<<0)  /* '-' */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008014strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016 register Py_ssize_t i;
8017 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008019 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 return len;
8022}
8023
Neal Norwitzfc76d632006-01-10 06:03:13 +00008024static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008025longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8026{
Tim Peters15231542006-02-16 01:08:01 +00008027 Py_ssize_t result;
8028
Neal Norwitzfc76d632006-01-10 06:03:13 +00008029 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008030 result = strtounicode(buffer, (char *)buffer);
8031 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008032}
8033
Guido van Rossum078151d2002-08-11 04:24:12 +00008034/* XXX To save some code duplication, formatfloat/long/int could have been
8035 shared with stringobject.c, converting from 8-bit to Unicode after the
8036 formatting is done. */
8037
Mark Dickinson18cfada2009-11-23 18:46:41 +00008038/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8039
8040static PyObject *
8041formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008043 char *p;
8044 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008046
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 x = PyFloat_AsDouble(v);
8048 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008049 return NULL;
8050
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008052 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008053
Mark Dickinson18cfada2009-11-23 18:46:41 +00008054 p = PyOS_double_to_string(x, type, prec,
8055 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8056 if (p == NULL)
8057 return NULL;
8058 result = PyUnicode_FromStringAndSize(p, strlen(p));
8059 PyMem_Free(p);
8060 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061}
8062
Tim Peters38fd5b62000-09-21 05:43:11 +00008063static PyObject*
8064formatlong(PyObject *val, int flags, int prec, int type)
8065{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008066 char *buf;
8067 int i, len;
8068 PyObject *str; /* temporary string object. */
8069 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008070
Benjamin Peterson857ce152009-01-31 16:29:18 +00008071 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8072 if (!str)
8073 return NULL;
8074 result = _PyUnicode_New(len);
8075 if (!result) {
8076 Py_DECREF(str);
8077 return NULL;
8078 }
8079 for (i = 0; i < len; i++)
8080 result->str[i] = buf[i];
8081 result->str[len] = 0;
8082 Py_DECREF(str);
8083 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008084}
8085
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086static int
8087formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008088 size_t buflen,
8089 int flags,
8090 int prec,
8091 int type,
8092 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008094 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008095 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8096 * + 1 + 1
8097 * = 24
8098 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008099 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008100 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 long x;
8102
8103 x = PyInt_AsLong(v);
8104 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008105 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008106 if (x < 0 && type == 'u') {
8107 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008108 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008109 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8110 sign = "-";
8111 else
8112 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008114 prec = 1;
8115
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008116 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8117 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008118 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008119 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008120 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008121 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008122 return -1;
8123 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008124
8125 if ((flags & F_ALT) &&
8126 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008127 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008128 * of issues that cause pain:
8129 * - when 0 is being converted, the C standard leaves off
8130 * the '0x' or '0X', which is inconsistent with other
8131 * %#x/%#X conversions and inconsistent with Python's
8132 * hex() function
8133 * - there are platforms that violate the standard and
8134 * convert 0 with the '0x' or '0X'
8135 * (Metrowerks, Compaq Tru64)
8136 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008137 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008138 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008139 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008140 * We can achieve the desired consistency by inserting our
8141 * own '0x' or '0X' prefix, and substituting %x/%X in place
8142 * of %#x/%#X.
8143 *
8144 * Note that this is the same approach as used in
8145 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008146 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008147 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8148 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008149 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008150 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008151 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8152 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008153 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008154 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008155 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008156 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008157 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008158 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159}
8160
8161static int
8162formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008163 size_t buflen,
8164 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165{
Ezio Melotti32125152010-02-25 17:36:04 +00008166 PyObject *unistr;
8167 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008168 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008169 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008170 if (PyUnicode_GET_SIZE(v) != 1)
8171 goto onError;
8172 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008175 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008176 if (PyString_GET_SIZE(v) != 1)
8177 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008178 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8179 with a UnicodeDecodeError if 'char' is not decodable with the
8180 default encoding (usually ASCII, but it might be something else) */
8181 str = PyString_AS_STRING(v);
8182 if ((unsigned char)str[0] > 0x7F) {
8183 /* the char is not ASCII; try to decode the string using the
8184 default encoding and return -1 to let the UnicodeDecodeError
8185 be raised if the string can't be decoded */
8186 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8187 if (unistr == NULL)
8188 return -1;
8189 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8190 Py_DECREF(unistr);
8191 }
8192 else
8193 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
8196 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008197 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008199 x = PyInt_AsLong(v);
8200 if (x == -1 && PyErr_Occurred())
8201 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008202#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008203 if (x < 0 || x > 0x10ffff) {
8204 PyErr_SetString(PyExc_OverflowError,
8205 "%c arg not in range(0x110000) "
8206 "(wide Python build)");
8207 return -1;
8208 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008209#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008210 if (x < 0 || x > 0xffff) {
8211 PyErr_SetString(PyExc_OverflowError,
8212 "%c arg not in range(0x10000) "
8213 "(narrow Python build)");
8214 return -1;
8215 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008216#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008217 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 }
8219 buf[1] = '\0';
8220 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008221
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008222 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008223 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008224 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008225 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226}
8227
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is wrapped in parentheses so that the macro behaves as a
   single primary expression wherever it is used (for example as the
   operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8237
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008239 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240{
8241 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008242 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 int args_owned = 0;
8244 PyUnicodeObject *result = NULL;
8245 PyObject *dict = NULL;
8246 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008247
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008249 PyErr_BadInternalCall();
8250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 }
8252 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008253 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 fmt = PyUnicode_AS_UNICODE(uformat);
8256 fmtcnt = PyUnicode_GET_SIZE(uformat);
8257
8258 reslen = rescnt = fmtcnt + 100;
8259 result = _PyUnicode_New(reslen);
8260 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008261 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 res = PyUnicode_AS_UNICODE(result);
8263
8264 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008265 arglen = PyTuple_Size(args);
8266 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 }
8268 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008269 arglen = -1;
8270 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
Christian Heimese93237d2007-12-19 02:37:44 +00008272 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008273 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008274 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
8276 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008277 if (*fmt != '%') {
8278 if (--rescnt < 0) {
8279 rescnt = fmtcnt + 100;
8280 reslen += rescnt;
8281 if (_PyUnicode_Resize(&result, reslen) < 0)
8282 goto onError;
8283 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8284 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008285 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008286 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008287 }
8288 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008289 /* Got a format specifier */
8290 int flags = 0;
8291 Py_ssize_t width = -1;
8292 int prec = -1;
8293 Py_UNICODE c = '\0';
8294 Py_UNICODE fill;
8295 int isnumok;
8296 PyObject *v = NULL;
8297 PyObject *temp = NULL;
8298 Py_UNICODE *pbuf;
8299 Py_UNICODE sign;
8300 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008301 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008302
8303 fmt++;
8304 if (*fmt == '(') {
8305 Py_UNICODE *keystart;
8306 Py_ssize_t keylen;
8307 PyObject *key;
8308 int pcount = 1;
8309
8310 if (dict == NULL) {
8311 PyErr_SetString(PyExc_TypeError,
8312 "format requires a mapping");
8313 goto onError;
8314 }
8315 ++fmt;
8316 --fmtcnt;
8317 keystart = fmt;
8318 /* Skip over balanced parentheses */
8319 while (pcount > 0 && --fmtcnt >= 0) {
8320 if (*fmt == ')')
8321 --pcount;
8322 else if (*fmt == '(')
8323 ++pcount;
8324 fmt++;
8325 }
8326 keylen = fmt - keystart - 1;
8327 if (fmtcnt < 0 || pcount > 0) {
8328 PyErr_SetString(PyExc_ValueError,
8329 "incomplete format key");
8330 goto onError;
8331 }
8332#if 0
8333 /* keys are converted to strings using UTF-8 and
8334 then looked up since Python uses strings to hold
8335 variables names etc. in its namespaces and we
8336 wouldn't want to break common idioms. */
8337 key = PyUnicode_EncodeUTF8(keystart,
8338 keylen,
8339 NULL);
8340#else
8341 key = PyUnicode_FromUnicode(keystart, keylen);
8342#endif
8343 if (key == NULL)
8344 goto onError;
8345 if (args_owned) {
8346 Py_DECREF(args);
8347 args_owned = 0;
8348 }
8349 args = PyObject_GetItem(dict, key);
8350 Py_DECREF(key);
8351 if (args == NULL) {
8352 goto onError;
8353 }
8354 args_owned = 1;
8355 arglen = -1;
8356 argidx = -2;
8357 }
8358 while (--fmtcnt >= 0) {
8359 switch (c = *fmt++) {
8360 case '-': flags |= F_LJUST; continue;
8361 case '+': flags |= F_SIGN; continue;
8362 case ' ': flags |= F_BLANK; continue;
8363 case '#': flags |= F_ALT; continue;
8364 case '0': flags |= F_ZERO; continue;
8365 }
8366 break;
8367 }
8368 if (c == '*') {
8369 v = getnextarg(args, arglen, &argidx);
8370 if (v == NULL)
8371 goto onError;
8372 if (!PyInt_Check(v)) {
8373 PyErr_SetString(PyExc_TypeError,
8374 "* wants int");
8375 goto onError;
8376 }
8377 width = PyInt_AsLong(v);
8378 if (width < 0) {
8379 flags |= F_LJUST;
8380 width = -width;
8381 }
8382 if (--fmtcnt >= 0)
8383 c = *fmt++;
8384 }
8385 else if (c >= '0' && c <= '9') {
8386 width = c - '0';
8387 while (--fmtcnt >= 0) {
8388 c = *fmt++;
8389 if (c < '0' || c > '9')
8390 break;
8391 if ((width*10) / 10 != width) {
8392 PyErr_SetString(PyExc_ValueError,
8393 "width too big");
8394 goto onError;
8395 }
8396 width = width*10 + (c - '0');
8397 }
8398 }
8399 if (c == '.') {
8400 prec = 0;
8401 if (--fmtcnt >= 0)
8402 c = *fmt++;
8403 if (c == '*') {
8404 v = getnextarg(args, arglen, &argidx);
8405 if (v == NULL)
8406 goto onError;
8407 if (!PyInt_Check(v)) {
8408 PyErr_SetString(PyExc_TypeError,
8409 "* wants int");
8410 goto onError;
8411 }
8412 prec = PyInt_AsLong(v);
8413 if (prec < 0)
8414 prec = 0;
8415 if (--fmtcnt >= 0)
8416 c = *fmt++;
8417 }
8418 else if (c >= '0' && c <= '9') {
8419 prec = c - '0';
8420 while (--fmtcnt >= 0) {
8421 c = Py_CHARMASK(*fmt++);
8422 if (c < '0' || c > '9')
8423 break;
8424 if ((prec*10) / 10 != prec) {
8425 PyErr_SetString(PyExc_ValueError,
8426 "prec too big");
8427 goto onError;
8428 }
8429 prec = prec*10 + (c - '0');
8430 }
8431 }
8432 } /* prec */
8433 if (fmtcnt >= 0) {
8434 if (c == 'h' || c == 'l' || c == 'L') {
8435 if (--fmtcnt >= 0)
8436 c = *fmt++;
8437 }
8438 }
8439 if (fmtcnt < 0) {
8440 PyErr_SetString(PyExc_ValueError,
8441 "incomplete format");
8442 goto onError;
8443 }
8444 if (c != '%') {
8445 v = getnextarg(args, arglen, &argidx);
8446 if (v == NULL)
8447 goto onError;
8448 }
8449 sign = 0;
8450 fill = ' ';
8451 switch (c) {
8452
8453 case '%':
8454 pbuf = formatbuf;
8455 /* presume that buffer length is at least 1 */
8456 pbuf[0] = '%';
8457 len = 1;
8458 break;
8459
8460 case 's':
8461 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008462 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008463 temp = v;
8464 Py_INCREF(temp);
8465 }
8466 else {
8467 PyObject *unicode;
8468 if (c == 's')
8469 temp = PyObject_Unicode(v);
8470 else
8471 temp = PyObject_Repr(v);
8472 if (temp == NULL)
8473 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008474 if (PyUnicode_Check(temp))
8475 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008476 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008477 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008478 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8479 PyString_GET_SIZE(temp),
8480 NULL,
8481 "strict");
8482 Py_DECREF(temp);
8483 temp = unicode;
8484 if (temp == NULL)
8485 goto onError;
8486 }
8487 else {
8488 Py_DECREF(temp);
8489 PyErr_SetString(PyExc_TypeError,
8490 "%s argument has non-string str()");
8491 goto onError;
8492 }
8493 }
8494 pbuf = PyUnicode_AS_UNICODE(temp);
8495 len = PyUnicode_GET_SIZE(temp);
8496 if (prec >= 0 && len > prec)
8497 len = prec;
8498 break;
8499
8500 case 'i':
8501 case 'd':
8502 case 'u':
8503 case 'o':
8504 case 'x':
8505 case 'X':
8506 if (c == 'i')
8507 c = 'd';
8508 isnumok = 0;
8509 if (PyNumber_Check(v)) {
8510 PyObject *iobj=NULL;
8511
8512 if (PyInt_Check(v) || (PyLong_Check(v))) {
8513 iobj = v;
8514 Py_INCREF(iobj);
8515 }
8516 else {
8517 iobj = PyNumber_Int(v);
8518 if (iobj==NULL) iobj = PyNumber_Long(v);
8519 }
8520 if (iobj!=NULL) {
8521 if (PyInt_Check(iobj)) {
8522 isnumok = 1;
8523 pbuf = formatbuf;
8524 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8525 flags, prec, c, iobj);
8526 Py_DECREF(iobj);
8527 if (len < 0)
8528 goto onError;
8529 sign = 1;
8530 }
8531 else if (PyLong_Check(iobj)) {
8532 isnumok = 1;
8533 temp = formatlong(iobj, flags, prec, c);
8534 Py_DECREF(iobj);
8535 if (!temp)
8536 goto onError;
8537 pbuf = PyUnicode_AS_UNICODE(temp);
8538 len = PyUnicode_GET_SIZE(temp);
8539 sign = 1;
8540 }
8541 else {
8542 Py_DECREF(iobj);
8543 }
8544 }
8545 }
8546 if (!isnumok) {
8547 PyErr_Format(PyExc_TypeError,
8548 "%%%c format: a number is required, "
8549 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8550 goto onError;
8551 }
8552 if (flags & F_ZERO)
8553 fill = '0';
8554 break;
8555
8556 case 'e':
8557 case 'E':
8558 case 'f':
8559 case 'F':
8560 case 'g':
8561 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008562 temp = formatfloat(v, flags, prec, c);
8563 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008564 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008565 pbuf = PyUnicode_AS_UNICODE(temp);
8566 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008567 sign = 1;
8568 if (flags & F_ZERO)
8569 fill = '0';
8570 break;
8571
8572 case 'c':
8573 pbuf = formatbuf;
8574 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8575 if (len < 0)
8576 goto onError;
8577 break;
8578
8579 default:
8580 PyErr_Format(PyExc_ValueError,
8581 "unsupported format character '%c' (0x%x) "
8582 "at index %zd",
8583 (31<=c && c<=126) ? (char)c : '?',
8584 (int)c,
8585 (Py_ssize_t)(fmt - 1 -
8586 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008587 goto onError;
8588 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008589 if (sign) {
8590 if (*pbuf == '-' || *pbuf == '+') {
8591 sign = *pbuf++;
8592 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008593 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008594 else if (flags & F_SIGN)
8595 sign = '+';
8596 else if (flags & F_BLANK)
8597 sign = ' ';
8598 else
8599 sign = 0;
8600 }
8601 if (width < len)
8602 width = len;
8603 if (rescnt - (sign != 0) < width) {
8604 reslen -= rescnt;
8605 rescnt = width + fmtcnt + 100;
8606 reslen += rescnt;
8607 if (reslen < 0) {
8608 Py_XDECREF(temp);
8609 PyErr_NoMemory();
8610 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008611 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008612 if (_PyUnicode_Resize(&result, reslen) < 0) {
8613 Py_XDECREF(temp);
8614 goto onError;
8615 }
8616 res = PyUnicode_AS_UNICODE(result)
8617 + reslen - rescnt;
8618 }
8619 if (sign) {
8620 if (fill != ' ')
8621 *res++ = sign;
8622 rescnt--;
8623 if (width > len)
8624 width--;
8625 }
8626 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8627 assert(pbuf[0] == '0');
8628 assert(pbuf[1] == c);
8629 if (fill != ' ') {
8630 *res++ = *pbuf++;
8631 *res++ = *pbuf++;
8632 }
8633 rescnt -= 2;
8634 width -= 2;
8635 if (width < 0)
8636 width = 0;
8637 len -= 2;
8638 }
8639 if (width > len && !(flags & F_LJUST)) {
8640 do {
8641 --rescnt;
8642 *res++ = fill;
8643 } while (--width > len);
8644 }
8645 if (fill == ' ') {
8646 if (sign)
8647 *res++ = sign;
8648 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8649 assert(pbuf[0] == '0');
8650 assert(pbuf[1] == c);
8651 *res++ = *pbuf++;
8652 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008653 }
8654 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008655 Py_UNICODE_COPY(res, pbuf, len);
8656 res += len;
8657 rescnt -= len;
8658 while (--width >= len) {
8659 --rescnt;
8660 *res++ = ' ';
8661 }
8662 if (dict && (argidx < arglen) && c != '%') {
8663 PyErr_SetString(PyExc_TypeError,
8664 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008665 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008666 goto onError;
8667 }
8668 Py_XDECREF(temp);
8669 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 } /* until end */
8671 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008672 PyErr_SetString(PyExc_TypeError,
8673 "not all arguments converted during string formatting");
8674 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 }
8676
Thomas Woutersa96affe2006-03-12 00:29:36 +00008677 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008680 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 }
8682 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 return (PyObject *)result;
8684
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 Py_XDECREF(result);
8687 Py_DECREF(uformat);
8688 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008689 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 }
8691 return NULL;
8692}
8693
8694static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 (readbufferproc) unicode_buffer_getreadbuf,
8696 (writebufferproc) unicode_buffer_getwritebuf,
8697 (segcountproc) unicode_buffer_getsegcount,
8698 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699};
8700
Jeremy Hylton938ace62002-07-17 16:30:39 +00008701static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008702unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8703
Tim Peters6d6c1a32001-08-02 04:15:00 +00008704static PyObject *
8705unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8706{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008707 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008708 static char *kwlist[] = {"string", "encoding", "errors", 0};
8709 char *encoding = NULL;
8710 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008711
Benjamin Peterson857ce152009-01-31 16:29:18 +00008712 if (type != &PyUnicode_Type)
8713 return unicode_subtype_new(type, args, kwds);
8714 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008715 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008716 return NULL;
8717 if (x == NULL)
8718 return (PyObject *)_PyUnicode_New(0);
8719 if (encoding == NULL && errors == NULL)
8720 return PyObject_Unicode(x);
8721 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008722 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008723}
8724
Guido van Rossume023fe02001-08-30 03:12:59 +00008725static PyObject *
8726unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8727{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008728 PyUnicodeObject *tmp, *pnew;
8729 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008730
Benjamin Peterson857ce152009-01-31 16:29:18 +00008731 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8732 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8733 if (tmp == NULL)
8734 return NULL;
8735 assert(PyUnicode_Check(tmp));
8736 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8737 if (pnew == NULL) {
8738 Py_DECREF(tmp);
8739 return NULL;
8740 }
8741 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8742 if (pnew->str == NULL) {
8743 _Py_ForgetReference((PyObject *)pnew);
8744 PyObject_Del(pnew);
8745 Py_DECREF(tmp);
8746 return PyErr_NoMemory();
8747 }
8748 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8749 pnew->length = n;
8750 pnew->hash = tmp->hash;
8751 Py_DECREF(tmp);
8752 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008753}
8754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008755PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008756 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008757\n\
8758Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008759encoding defaults to the current default string encoding.\n\
8760errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008761
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008763 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008764 "unicode", /* tp_name */
8765 sizeof(PyUnicodeObject), /* tp_size */
8766 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008768 (destructor)unicode_dealloc, /* tp_dealloc */
8769 0, /* tp_print */
8770 0, /* tp_getattr */
8771 0, /* tp_setattr */
8772 0, /* tp_compare */
8773 unicode_repr, /* tp_repr */
8774 &unicode_as_number, /* tp_as_number */
8775 &unicode_as_sequence, /* tp_as_sequence */
8776 &unicode_as_mapping, /* tp_as_mapping */
8777 (hashfunc) unicode_hash, /* tp_hash*/
8778 0, /* tp_call*/
8779 (reprfunc) unicode_str, /* tp_str */
8780 PyObject_GenericGetAttr, /* tp_getattro */
8781 0, /* tp_setattro */
8782 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008783 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008784 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008785 unicode_doc, /* tp_doc */
8786 0, /* tp_traverse */
8787 0, /* tp_clear */
8788 PyUnicode_RichCompare, /* tp_richcompare */
8789 0, /* tp_weaklistoffset */
8790 0, /* tp_iter */
8791 0, /* tp_iternext */
8792 unicode_methods, /* tp_methods */
8793 0, /* tp_members */
8794 0, /* tp_getset */
8795 &PyBaseString_Type, /* tp_base */
8796 0, /* tp_dict */
8797 0, /* tp_descr_get */
8798 0, /* tp_descr_set */
8799 0, /* tp_dictoffset */
8800 0, /* tp_init */
8801 0, /* tp_alloc */
8802 unicode_new, /* tp_new */
8803 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804};
8805
8806/* Initialize the Unicode implementation */
8807
Thomas Wouters78890102000-07-22 19:25:51 +00008808void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008810 int i;
8811
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008812 /* XXX - move this array to unicodectype.c ? */
8813 Py_UNICODE linebreak[] = {
8814 0x000A, /* LINE FEED */
8815 0x000D, /* CARRIAGE RETURN */
8816 0x001C, /* FILE SEPARATOR */
8817 0x001D, /* GROUP SEPARATOR */
8818 0x001E, /* RECORD SEPARATOR */
8819 0x0085, /* NEXT LINE */
8820 0x2028, /* LINE SEPARATOR */
8821 0x2029, /* PARAGRAPH SEPARATOR */
8822 };
8823
Fred Drakee4315f52000-05-09 19:53:39 +00008824 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008825 free_list = NULL;
8826 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008828 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008829 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008830
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008831 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008832 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008833 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008834 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008835 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008836
8837 /* initialize the linebreak bloom filter */
8838 bloom_linebreak = make_bloom_mask(
8839 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8840 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008841
8842 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843}
8844
/* Finalization of the Unicode implementation: clearing the free list and
   releasing the remaining global objects */
8846
Christian Heimes3b718a72008-02-14 12:47:33 +00008847int
8848PyUnicode_ClearFreeList(void)
8849{
8850 int freelist_size = numfree;
8851 PyUnicodeObject *u;
8852
8853 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008854 PyUnicodeObject *v = u;
8855 u = *(PyUnicodeObject **)u;
8856 if (v->str)
8857 PyObject_DEL(v->str);
8858 Py_XDECREF(v->defenc);
8859 PyObject_Del(v);
8860 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008861 }
8862 free_list = NULL;
8863 assert(numfree == 0);
8864 return freelist_size;
8865}
8866
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867void
Thomas Wouters78890102000-07-22 19:25:51 +00008868_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008870 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008872 Py_XDECREF(unicode_empty);
8873 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008874
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008875 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008876 if (unicode_latin1[i]) {
8877 Py_DECREF(unicode_latin1[i]);
8878 unicode_latin1[i] = NULL;
8879 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008880 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008881 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008883
Anthony Baxterac6bd462006-04-13 02:06:09 +00008884#ifdef __cplusplus
8885}
8886#endif