blob: aab33b50c0e0e877c467767182de1919675888c0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Length limit for the free list "keep alive" optimization.

   Objects put on the free list keep their character buffer allocated
   as long as their length is below this limit.  This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: entry i is
   1 when the ASCII code point i is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks: entry i is 1 when the ASCII code point i is a
   line break and 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Non-zero if chr occurs in set[0..setlen-1].  The bloom mask is tested
   first so that most non-members are rejected without scanning the set.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators; chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000297 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000300 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Andrew Dalkee0df7622006-05-27 11:04:36 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
Neal Norwitze7d8be82008-07-31 17:17:14 +0000315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
318 }
319
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000329 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
332 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000333 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000334 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000337 }
338 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 }
340 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000341 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 if (unicode == NULL)
344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 }
348
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000349 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000350 PyErr_NoMemory();
351 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000352 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000353 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
359 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000360 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000362 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000378 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000380 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
385 }
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
389 }
390 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 }
395 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400}
401
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000402static
403int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404{
405 register PyUnicodeObject *v;
406
407 /* Argument checks */
408 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 PyErr_BadInternalCall();
410 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000412 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000414 PyErr_BadInternalCall();
415 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000421 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 }
432
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
436}
437
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000438int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
439{
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000456 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000457
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
468 }
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
471 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
Tim Petersced69f82003-09-16 20:30:58 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
477
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000480 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481
482 return (PyObject *)unicode;
483}
484
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000485PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
486{
487 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000488
Benjamin Peterson857ce152009-01-31 16:29:18 +0000489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000491 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 return NULL;
493 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000494
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
500
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000505 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000506
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
517 }
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
520 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000521
522 return PyUnicode_DecodeUTF8(u, size, NULL);
523 }
524
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
528
529 return (PyObject *)unicode;
530}
531
532PyObject *PyUnicode_FromString(const char *u)
533{
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
538 }
539
540 return PyUnicode_FromStringAndSize(u, size);
541}
542
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543#ifdef HAVE_WCHAR_H
544
Mark Dickinson6b265f12009-03-18 16:07:26 +0000545#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546# define CONVERT_WCHAR_TO_SURROGATES
547#endif
548
549#ifdef CONVERT_WCHAR_TO_SURROGATES
550
551/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
553
554PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
556{
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
561
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
573 }
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
578
579 /* Copy the wchar_t data into the new object */
580 {
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
589 }
590 else
591 *u++ = *w++;
592 }
593 }
594 return (PyObject *)unicode;
595}
596
597#else
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000600 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601{
602 PyUnicodeObject *unicode;
603
604 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 PyErr_BadInternalCall();
606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 }
608
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614#ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000616#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624#endif
625
626 return (PyObject *)unicode;
627}
628
Mark Dickinson6b265f12009-03-18 16:07:26 +0000629#endif /* CONVERT_WCHAR_TO_SURROGATES */
630
631#undef CONVERT_WCHAR_TO_SURROGATES
632
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000633static void
634makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
635{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000636 *fmt++ = '%';
637 if (width) {
638 if (zeropad)
639 *fmt++ = '0';
640 fmt += sprintf(fmt, "%d", width);
641 }
642 if (precision)
643 fmt += sprintf(fmt, ".%d", precision);
644 if (longflag)
645 *fmt++ = 'l';
646 else if (size_tflag) {
647 char *f = PY_FORMAT_SIZE_T;
648 while (*f)
649 *fmt++ = *f++;
650 }
651 *fmt++ = c;
652 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000653}
654
/* Copy the NUL-terminated char string `string` to the output position
   `s`, advancing `s`; uses the enclosing function's `copy` and `s`
   variables.  The argument is parenthesized so that any expression can
   be passed. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}
656
657PyObject *
658PyUnicode_FromFormatV(const char *format, va_list vargs)
659{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000680
681#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000682 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000683#else
684#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000687 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000688#endif
689#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000697 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
703 if (*f == 's')
704 ++callcount;
705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000706 }
707 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
714 }
715 callresult = callresults;
716 }
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
729 */
730 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
755 {
756 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000757 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000764 break;
765 }
766 case 'U':
767 {
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
772 }
773 case 'V':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
784 }
785 case 'S':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
797 }
798 case 'R':
799 {
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
810 }
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
817 */
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
829 }
830 } else
831 n++;
832 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000833 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
839 }
840 realbuffer = abuffer;
841 }
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000851
Benjamin Peterson857ce152009-01-31 16:29:18 +0000852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000854
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
870 }
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
876 }
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
881 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000882
Benjamin Peterson857ce152009-01-31 16:29:18 +0000883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
918 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000928 break;
929 }
930 case 'U':
931 {
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
937 }
938 case 'V':
939 {
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
948 }
949 break;
950 }
951 case 'S':
952 case 'R':
953 {
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
968 }
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
978 }
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
987 }
988 } else
989 *s++ = *f;
990 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000992 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1005 }
1006 PyObject_Free(callresults);
1007 }
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011}
1012
1013#undef appendstring
1014
1015PyObject *
1016PyUnicode_FromFormat(const char *format, ...)
1017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 PyObject* ret;
1019 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001020
1021#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001025#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001029}
1030
Martin v. Löwis18e16552006-02-15 17:27:45 +00001031Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001032 wchar_t *w,
1033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyErr_BadInternalCall();
1037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039
1040 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044#ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046#else
1047 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 }
1054#endif
1055
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060}
1061
1062#endif
1063
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064PyObject *PyUnicode_FromOrdinal(int ordinal)
1065{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001066 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067
1068#ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074 }
1075#else
1076 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#endif
1083
Hye-Shik Chang40574832004-04-06 07:24:51 +00001084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001086}
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_FromObject(register PyObject *obj)
1089{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001092 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 Py_INCREF(obj);
1094 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 }
1096 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103}
1104
1105PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 const char *encoding,
1107 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001110 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 PyErr_BadInternalCall();
1115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118#if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001123
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001126
1127 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 return PyObject_Unicode(obj);
1135 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136#else
1137 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#endif
1143
1144 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001145 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 }
Christian Heimes3497f942008-05-26 12:29:14 +00001149 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 }
Tim Petersced69f82003-09-16 20:30:58 +00001165
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001173
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 return v;
1175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184{
1185 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186
1187 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001189
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001211 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245 return NULL;
1246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252{
1253 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1261}
1262
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1266{
1267 PyObject *v;
1268
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1272 }
1273
1274 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001283 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284 return NULL;
1285}
1286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
Fred Drakee4315f52000-05-09 19:53:39 +00001297
Tim Petersced69f82003-09-16 20:30:58 +00001298 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001300
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001307#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001319 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001321 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001322 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 Py_DECREF(v);
1324 goto onError;
1325 }
1326 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 return NULL;
1330}
1331
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001333 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001334{
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_AS_UNICODE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return NULL;
1355}
1356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358{
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1362 }
1363 return PyUnicode_GET_SIZE(unicode);
1364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 return -1;
1367}
1368
Thomas Wouters78890102000-07-22 19:25:51 +00001369const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001370{
1371 return unicode_default_encoding;
1372}
1373
1374int PyUnicode_SetDefaultEncoding(const char *encoding)
1375{
1376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001377
Fred Drakee4315f52000-05-09 19:53:39 +00001378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 encoding,
1386 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001387 return 0;
1388
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return -1;
1391}
1392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393/* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001395 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 and adjust various state variables.
1397 return 0 on success, -1 on error
1398*/
1399
1400static
1401int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 int res = -1;
1417
1418 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 }
1423
1424 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001425 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 }
1430 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 }
1438
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 }
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001450 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 }
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_XDECREF(restuple);
1479 return res;
1480}
1481
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482/* --- UTF-7 Codec -------------------------------------------------------- */
1483
Antoine Pitrou653dece2009-05-04 18:32:32 +00001484/* See RFC2152 for details. We encode conservatively and decode liberally. */
1485
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * Written as explicit ASCII range checks instead of isalnum():
 * isalnum() depends on the current locale (so bytes >= 128 could be
 * accepted and then mis-decoded by FROM_BASE64) and is undefined for
 * arguments outside the unsigned char range.  Note that c is evaluated
 * more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127).  It is a pure lookup
 * table, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1547
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character; only 1..127 can ever be direct.
 * directO  - non-zero if Set O characters (category 1) are direct.
 * directWS - non-zero if whitespace (category 2) is direct.
 *
 * The flag arguments are parenthesized in the expansion so that an
 * expression such as `a || b` can be passed without changing how the
 * condition parses.  Note that c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001561 Py_ssize_t size,
1562 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565}
1566
Antoine Pitrou653dece2009-05-04 18:32:32 +00001567/* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
1573
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001574PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
1587 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001605 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 e = s + size;
1607
1608 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610
Antoine Pitrou653dece2009-05-04 18:32:32 +00001611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625#ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628#else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631#endif
1632 surrogate = 0;
1633 }
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1638 }
1639 }
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1643 }
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1647 }
1648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.

   Indexed by the first byte of a sequence (0..255).  It is a pure
   lookup table, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001900 Py_ssize_t startinpos;
1901 Py_ssize_t endinpos;
1902 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 const char *e;
1904 PyUnicodeObject *unicode;
1905 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 PyObject *errorHandler = NULL;
1908 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
1910 /* Note: size will always be longer than the resulting Unicode
1911 character count */
1912 unicode = _PyUnicode_New(size);
1913 if (!unicode)
1914 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001915 if (size == 0) {
1916 if (consumed)
1917 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920
1921 /* Unpack UTF-8 encoded data */
1922 p = unicode->str;
1923 e = s + size;
1924
1925 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
1928 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001929 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 s++;
1931 continue;
1932 }
1933
1934 n = utf8_code_length[ch];
1935
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001937 if (consumed)
1938 break;
1939 else {
1940 errmsg = "unexpected end of data";
1941 startinpos = s-starts;
1942 endinpos = size;
1943 goto utf8Error;
1944 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946
1947 switch (n) {
1948
1949 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001950 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001951 startinpos = s-starts;
1952 endinpos = startinpos+1;
1953 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954
1955 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+1;
1959 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960
1961 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001962 if ((s[1] & 0xc0) != 0x80) {
1963 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001964 startinpos = s-starts;
1965 endinpos = startinpos+2;
1966 goto utf8Error;
1967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001970 startinpos = s-starts;
1971 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001972 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001973 goto utf8Error;
1974 }
1975 else
1976 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 break;
1978
1979 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001980 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001981 (s[2] & 0xc0) != 0x80) {
1982 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 startinpos = s-starts;
1984 endinpos = startinpos+3;
1985 goto utf8Error;
1986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001988 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 /* Note: UTF-8 encodings of surrogates are considered
1990 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001992 XXX For wide builds (UCS-4) we should probably try
1993 to recombine the surrogates into a single code
1994 unit.
1995 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001996 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+3;
1999 goto utf8Error;
2000 }
2001 else
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002008 (s[3] & 0xc0) != 0x80) {
2009 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002010 startinpos = s-starts;
2011 endinpos = startinpos+4;
2012 goto utf8Error;
2013 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002016 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002017 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002018 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002019 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002020 UTF-16 */
2021 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002023 startinpos = s-starts;
2024 endinpos = startinpos+4;
2025 goto utf8Error;
2026 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
2042
2043 default:
2044 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002045 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 startinpos = s-starts;
2047 endinpos = startinpos+n;
2048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 }
2050 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002051 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002052
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002053 utf8Error:
2054 outpos = p-PyUnicode_AS_UNICODE(unicode);
2055 if (unicode_decode_call_errorhandler(
2056 errors, &errorHandler,
2057 "utf8", errmsg,
2058 starts, size, &startinpos, &endinpos, &exc, &s,
2059 &unicode, &outpos, &p))
2060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 }
Walter Dörwald69652032004-09-07 20:24:22 +00002062 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002063 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
2065 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002066 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 goto onError;
2068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 Py_XDECREF(errorHandler);
2070 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 return (PyObject *)unicode;
2072
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002073 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 Py_XDECREF(errorHandler);
2075 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 Py_DECREF(unicode);
2077 return NULL;
2078}
2079
Tim Peters602f7402002-04-27 18:03:26 +00002080/* Allocation strategy: if the string is short, convert into a stack buffer
2081 and allocate exactly as much space needed at the end. Else allocate the
2082 maximum possible needed (4 result bytes per Unicode character), and return
2083 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002084*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002085PyObject *
2086PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002087 Py_ssize_t size,
2088 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089{
Tim Peters602f7402002-04-27 18:03:26 +00002090#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002091
Martin v. Löwis18e16552006-02-15 17:27:45 +00002092 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002093 PyObject *v; /* result string object */
2094 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002095 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002096 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002097 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002098
Tim Peters602f7402002-04-27 18:03:26 +00002099 assert(s != NULL);
2100 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101
Tim Peters602f7402002-04-27 18:03:26 +00002102 if (size <= MAX_SHORT_UNICHARS) {
2103 /* Write into the stack buffer; nallocated can't overflow.
2104 * At the end, we'll allocate exactly as much heap space as it
2105 * turns out we need.
2106 */
2107 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2108 v = NULL; /* will allocate after we're done */
2109 p = stackbuf;
2110 }
2111 else {
2112 /* Overallocate on the heap, and give the excess back at the end. */
2113 nallocated = size * 4;
2114 if (nallocated / 4 != size) /* overflow! */
2115 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002116 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002117 if (v == NULL)
2118 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002119 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002120 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002121
Tim Peters602f7402002-04-27 18:03:26 +00002122 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002123 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002124
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002125 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002126 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002128
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002130 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002131 *p++ = (char)(0xc0 | (ch >> 6));
2132 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002134 else {
Tim Peters602f7402002-04-27 18:03:26 +00002135 /* Encode UCS2 Unicode ordinals */
2136 if (ch < 0x10000) {
2137 /* Special case: check for high surrogate */
2138 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2139 Py_UCS4 ch2 = s[i];
2140 /* Check for low surrogate and combine the two to
2141 form a UCS4 value */
2142 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002143 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002144 i++;
2145 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002146 }
Tim Peters602f7402002-04-27 18:03:26 +00002147 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002148 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002149 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002150 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2151 *p++ = (char)(0x80 | (ch & 0x3f));
2152 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002153 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002154 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002155 /* Encode UCS4 Unicode ordinals */
2156 *p++ = (char)(0xf0 | (ch >> 18));
2157 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2158 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2159 *p++ = (char)(0x80 | (ch & 0x3f));
2160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002162
Tim Peters602f7402002-04-27 18:03:26 +00002163 if (v == NULL) {
2164 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002165 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002166 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002167 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
2169 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002170 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002171 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002172 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002173 if (_PyString_Resize(&v, nneeded))
2174 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002177
Tim Peters602f7402002-04-27 18:03:26 +00002178#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179}
2180
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2182{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 if (!PyUnicode_Check(unicode)) {
2184 PyErr_BadArgument();
2185 return NULL;
2186 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002187 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002188 PyUnicode_GET_SIZE(unicode),
2189 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190}
2191
Walter Dörwald6e390802007-08-17 16:41:28 +00002192/* --- UTF-32 Codec ------------------------------------------------------- */
2193
2194PyObject *
2195PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002196 Py_ssize_t size,
2197 const char *errors,
2198 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002199{
2200 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2201}
2202
2203PyObject *
2204PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002205 Py_ssize_t size,
2206 const char *errors,
2207 int *byteorder,
2208 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002209{
2210 const char *starts = s;
2211 Py_ssize_t startinpos;
2212 Py_ssize_t endinpos;
2213 Py_ssize_t outpos;
2214 PyUnicodeObject *unicode;
2215 Py_UNICODE *p;
2216#ifndef Py_UNICODE_WIDE
2217 int i, pairs;
2218#else
2219 const int pairs = 0;
2220#endif
2221 const unsigned char *q, *e;
2222 int bo = 0; /* assume native ordering by default */
2223 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002224 /* Offsets from q for retrieving bytes in the right order. */
2225#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2226 int iorder[] = {0, 1, 2, 3};
2227#else
2228 int iorder[] = {3, 2, 1, 0};
2229#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002230 PyObject *errorHandler = NULL;
2231 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002232 /* On narrow builds we split characters outside the BMP into two
2233 codepoints => count how much extra space we need. */
2234#ifndef Py_UNICODE_WIDE
2235 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002236 if (((Py_UCS4 *)s)[i] >= 0x10000)
2237 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002238#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002239
2240 /* This might be one to much, because of a BOM */
2241 unicode = _PyUnicode_New((size+3)/4+pairs);
2242 if (!unicode)
2243 return NULL;
2244 if (size == 0)
2245 return (PyObject *)unicode;
2246
2247 /* Unpack UTF-32 encoded data */
2248 p = unicode->str;
2249 q = (unsigned char *)s;
2250 e = q + size;
2251
2252 if (byteorder)
2253 bo = *byteorder;
2254
2255 /* Check for BOM marks (U+FEFF) in the input and adjust current
2256 byte order setting accordingly. In native mode, the leading BOM
2257 mark is skipped, in all other modes, it is copied to the output
2258 stream as-is (giving a ZWNBSP character). */
2259 if (bo == 0) {
2260 if (size >= 4) {
2261 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002262 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002263#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002264 if (bom == 0x0000FEFF) {
2265 q += 4;
2266 bo = -1;
2267 }
2268 else if (bom == 0xFFFE0000) {
2269 q += 4;
2270 bo = 1;
2271 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002272#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002273 if (bom == 0x0000FEFF) {
2274 q += 4;
2275 bo = 1;
2276 }
2277 else if (bom == 0xFFFE0000) {
2278 q += 4;
2279 bo = -1;
2280 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002281#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002282 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002283 }
2284
2285 if (bo == -1) {
2286 /* force LE */
2287 iorder[0] = 0;
2288 iorder[1] = 1;
2289 iorder[2] = 2;
2290 iorder[3] = 3;
2291 }
2292 else if (bo == 1) {
2293 /* force BE */
2294 iorder[0] = 3;
2295 iorder[1] = 2;
2296 iorder[2] = 1;
2297 iorder[3] = 0;
2298 }
2299
2300 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002301 Py_UCS4 ch;
2302 /* remaining bytes at the end? (size should be divisible by 4) */
2303 if (e-q<4) {
2304 if (consumed)
2305 break;
2306 errmsg = "truncated data";
2307 startinpos = ((const char *)q)-starts;
2308 endinpos = ((const char *)e)-starts;
2309 goto utf32Error;
2310 /* The remaining input chars are ignored if the callback
2311 chooses to skip the input */
2312 }
2313 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2314 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002316 if (ch >= 0x110000)
2317 {
2318 errmsg = "codepoint not in range(0x110000)";
2319 startinpos = ((const char *)q)-starts;
2320 endinpos = startinpos+4;
2321 goto utf32Error;
2322 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002323#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002324 if (ch >= 0x10000)
2325 {
2326 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2327 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2328 }
2329 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002330#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002331 *p++ = ch;
2332 q += 4;
2333 continue;
2334 utf32Error:
2335 outpos = p-PyUnicode_AS_UNICODE(unicode);
2336 if (unicode_decode_call_errorhandler(
2337 errors, &errorHandler,
2338 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002339 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002340 &unicode, &outpos, &p))
2341 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002342 }
2343
2344 if (byteorder)
2345 *byteorder = bo;
2346
2347 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002348 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002349
2350 /* Adjust length */
2351 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2352 goto onError;
2353
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return (PyObject *)unicode;
2357
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002358 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002359 Py_DECREF(unicode);
2360 Py_XDECREF(errorHandler);
2361 Py_XDECREF(exc);
2362 return NULL;
2363}
2364
2365PyObject *
2366PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002367 Py_ssize_t size,
2368 const char *errors,
2369 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002370{
2371 PyObject *v;
2372 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002373 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002374#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002375 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002376#else
2377 const int pairs = 0;
2378#endif
2379 /* Offsets from p for storing byte pairs in the right order. */
2380#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2381 int iorder[] = {0, 1, 2, 3};
2382#else
2383 int iorder[] = {3, 2, 1, 0};
2384#endif
2385
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002386#define STORECHAR(CH) \
2387 do { \
2388 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2389 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2390 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2391 p[iorder[0]] = (CH) & 0xff; \
2392 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002393 } while(0)
2394
2395 /* In narrow builds we can output surrogate pairs as one codepoint,
2396 so we need less space. */
2397#ifndef Py_UNICODE_WIDE
2398 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2400 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2401 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002402#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 nsize = (size - pairs + (byteorder == 0));
2404 bytesize = nsize * 4;
2405 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002406 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002407 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (v == NULL)
2409 return NULL;
2410
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002411 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002412 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002413 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002414 if (size == 0)
2415 return v;
2416
2417 if (byteorder == -1) {
2418 /* force LE */
2419 iorder[0] = 0;
2420 iorder[1] = 1;
2421 iorder[2] = 2;
2422 iorder[3] = 3;
2423 }
2424 else if (byteorder == 1) {
2425 /* force BE */
2426 iorder[0] = 3;
2427 iorder[1] = 2;
2428 iorder[2] = 1;
2429 iorder[3] = 0;
2430 }
2431
2432 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002433 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002434#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002435 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2436 Py_UCS4 ch2 = *s;
2437 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2438 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2439 s++;
2440 size--;
2441 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002442 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002443#endif
2444 STORECHAR(ch);
2445 }
2446 return v;
2447#undef STORECHAR
2448}
2449
2450PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2451{
2452 if (!PyUnicode_Check(unicode)) {
2453 PyErr_BadArgument();
2454 return NULL;
2455 }
2456 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002457 PyUnicode_GET_SIZE(unicode),
2458 NULL,
2459 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002460}
2461
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462/* --- UTF-16 Codec ------------------------------------------------------- */
2463
Tim Peters772747b2001-08-09 22:21:55 +00002464PyObject *
2465PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002466 Py_ssize_t size,
2467 const char *errors,
2468 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469{
Walter Dörwald69652032004-09-07 20:24:22 +00002470 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2471}
2472
2473PyObject *
2474PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002475 Py_ssize_t size,
2476 const char *errors,
2477 int *byteorder,
2478 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002481 Py_ssize_t startinpos;
2482 Py_ssize_t endinpos;
2483 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 PyUnicodeObject *unicode;
2485 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002486 const unsigned char *q, *e;
2487 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002489 /* Offsets from q for retrieving byte pairs in the right order. */
2490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2491 int ihi = 1, ilo = 0;
2492#else
2493 int ihi = 0, ilo = 1;
2494#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 PyObject *errorHandler = NULL;
2496 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497
2498 /* Note: size will always be longer than the resulting Unicode
2499 character count */
2500 unicode = _PyUnicode_New(size);
2501 if (!unicode)
2502 return NULL;
2503 if (size == 0)
2504 return (PyObject *)unicode;
2505
2506 /* Unpack UTF-16 encoded data */
2507 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002508 q = (unsigned char *)s;
2509 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510
2511 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002512 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002514 /* Check for BOM marks (U+FEFF) in the input and adjust current
2515 byte order setting accordingly. In native mode, the leading BOM
2516 mark is skipped, in all other modes, it is copied to the output
2517 stream as-is (giving a ZWNBSP character). */
2518 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002519 if (size >= 2) {
2520 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002521#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002522 if (bom == 0xFEFF) {
2523 q += 2;
2524 bo = -1;
2525 }
2526 else if (bom == 0xFFFE) {
2527 q += 2;
2528 bo = 1;
2529 }
Tim Petersced69f82003-09-16 20:30:58 +00002530#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002531 if (bom == 0xFEFF) {
2532 q += 2;
2533 bo = 1;
2534 }
2535 else if (bom == 0xFFFE) {
2536 q += 2;
2537 bo = -1;
2538 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002539#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002540 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542
Tim Peters772747b2001-08-09 22:21:55 +00002543 if (bo == -1) {
2544 /* force LE */
2545 ihi = 1;
2546 ilo = 0;
2547 }
2548 else if (bo == 1) {
2549 /* force BE */
2550 ihi = 0;
2551 ilo = 1;
2552 }
2553
2554 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002555 Py_UNICODE ch;
2556 /* remaining bytes at the end? (size should be even) */
2557 if (e-q<2) {
2558 if (consumed)
2559 break;
2560 errmsg = "truncated data";
2561 startinpos = ((const char *)q)-starts;
2562 endinpos = ((const char *)e)-starts;
2563 goto utf16Error;
2564 /* The remaining input chars are ignored if the callback
2565 chooses to skip the input */
2566 }
2567 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002568
Benjamin Peterson857ce152009-01-31 16:29:18 +00002569 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002570
2571 if (ch < 0xD800 || ch > 0xDFFF) {
2572 *p++ = ch;
2573 continue;
2574 }
2575
2576 /* UTF-16 code pair: */
2577 if (q >= e) {
2578 errmsg = "unexpected end of data";
2579 startinpos = (((const char *)q)-2)-starts;
2580 endinpos = ((const char *)e)-starts;
2581 goto utf16Error;
2582 }
2583 if (0xD800 <= ch && ch <= 0xDBFF) {
2584 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2585 q += 2;
2586 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002587#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002588 *p++ = ch;
2589 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002591 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002592#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002593 continue;
2594 }
2595 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002596 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002597 startinpos = (((const char *)q)-4)-starts;
2598 endinpos = startinpos+2;
2599 goto utf16Error;
2600 }
2601
Benjamin Peterson857ce152009-01-31 16:29:18 +00002602 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002603 errmsg = "illegal encoding";
2604 startinpos = (((const char *)q)-2)-starts;
2605 endinpos = startinpos+2;
2606 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002607
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002608 utf16Error:
2609 outpos = p-PyUnicode_AS_UNICODE(unicode);
2610 if (unicode_decode_call_errorhandler(
2611 errors, &errorHandler,
2612 "utf16", errmsg,
2613 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2614 &unicode, &outpos, &p))
2615 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 }
2617
2618 if (byteorder)
2619 *byteorder = bo;
2620
Walter Dörwald69652032004-09-07 20:24:22 +00002621 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002622 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002623
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002625 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 goto onError;
2627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 Py_XDECREF(errorHandler);
2629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 return (PyObject *)unicode;
2631
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002632 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 Py_XDECREF(errorHandler);
2635 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 return NULL;
2637}
2638
Tim Peters772747b2001-08-09 22:21:55 +00002639PyObject *
2640PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002641 Py_ssize_t size,
2642 const char *errors,
2643 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644{
2645 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002646 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002647 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002648#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002649 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#else
2651 const int pairs = 0;
2652#endif
Tim Peters772747b2001-08-09 22:21:55 +00002653 /* Offsets from p for storing byte pairs in the right order. */
2654#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2655 int ihi = 1, ilo = 0;
2656#else
2657 int ihi = 0, ilo = 1;
2658#endif
2659
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002660#define STORECHAR(CH) \
2661 do { \
2662 p[ihi] = ((CH) >> 8) & 0xff; \
2663 p[ilo] = (CH) & 0xff; \
2664 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002665 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002667#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002668 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 if (s[i] >= 0x10000)
2670 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002671#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002672 /* 2 * (size + pairs + (byteorder == 0)) */
2673 if (size > PY_SSIZE_T_MAX ||
2674 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002675 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002676 nsize = size + pairs + (byteorder == 0);
2677 bytesize = nsize * 2;
2678 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002680 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 if (v == NULL)
2682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002684 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002686 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002687 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002688 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002689
2690 if (byteorder == -1) {
2691 /* force LE */
2692 ihi = 1;
2693 ilo = 0;
2694 }
2695 else if (byteorder == 1) {
2696 /* force BE */
2697 ihi = 0;
2698 ilo = 1;
2699 }
2700
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002701 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002702 Py_UNICODE ch = *s++;
2703 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002704#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002705 if (ch >= 0x10000) {
2706 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2707 ch = 0xD800 | ((ch-0x10000) >> 10);
2708 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002709#endif
Tim Peters772747b2001-08-09 22:21:55 +00002710 STORECHAR(ch);
2711 if (ch2)
2712 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002715#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716}
2717
2718PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2719{
2720 if (!PyUnicode_Check(unicode)) {
2721 PyErr_BadArgument();
2722 return NULL;
2723 }
2724 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002725 PyUnicode_GET_SIZE(unicode),
2726 NULL,
2727 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728}
2729
2730/* --- Unicode Escape Codec ----------------------------------------------- */
2731
Fredrik Lundh06d12682001-01-24 07:59:11 +00002732static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002735 Py_ssize_t size,
2736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002739 Py_ssize_t startinpos;
2740 Py_ssize_t endinpos;
2741 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746 char* message;
2747 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 PyObject *errorHandler = NULL;
2749 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002750
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 /* Escaped strings will always be longer than the resulting
2752 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 length after conversion to the true value.
2754 (but if the error callback returns a long replacement string
2755 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 v = _PyUnicode_New(size);
2757 if (v == NULL)
2758 goto onError;
2759 if (size == 0)
2760 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002764
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 while (s < end) {
2766 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002767 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769
2770 /* Non-escape characters are interpreted as Unicode ordinals */
2771 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002772 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 continue;
2774 }
2775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 /* \ - Escapes */
2778 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002779 c = *s++;
2780 if (s > end)
2781 c = '\0'; /* Invalid after \ */
2782 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002784 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 case '\n': break;
2786 case '\\': *p++ = '\\'; break;
2787 case '\'': *p++ = '\''; break;
2788 case '\"': *p++ = '\"'; break;
2789 case 'b': *p++ = '\b'; break;
2790 case 'f': *p++ = '\014'; break; /* FF */
2791 case 't': *p++ = '\t'; break;
2792 case 'n': *p++ = '\n'; break;
2793 case 'r': *p++ = '\r'; break;
2794 case 'v': *p++ = '\013'; break; /* VT */
2795 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2796
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002797 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 case '0': case '1': case '2': case '3':
2799 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002801 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002803 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002804 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002806 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 break;
2808
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002809 /* hex escapes */
2810 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812 digits = 2;
2813 message = "truncated \\xXX escape";
2814 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002816 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 digits = 4;
2819 message = "truncated \\uXXXX escape";
2820 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002822 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002823 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824 digits = 8;
2825 message = "truncated \\UXXXXXXXX escape";
2826 hexescape:
2827 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 outpos = p-PyUnicode_AS_UNICODE(v);
2829 if (s+digits>end) {
2830 endinpos = size;
2831 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002832 errors, &errorHandler,
2833 "unicodeescape", "end of string in escape sequence",
2834 starts, size, &startinpos, &endinpos, &exc, &s,
2835 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 goto onError;
2837 goto nextByte;
2838 }
2839 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002840 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002841 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 endinpos = (s+i+1)-starts;
2843 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002844 errors, &errorHandler,
2845 "unicodeescape", message,
2846 starts, size, &startinpos, &endinpos, &exc, &s,
2847 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002848 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002850 }
2851 chr = (chr<<4) & ~0xF;
2852 if (c >= '0' && c <= '9')
2853 chr += c - '0';
2854 else if (c >= 'a' && c <= 'f')
2855 chr += 10 + c - 'a';
2856 else
2857 chr += 10 + c - 'A';
2858 }
2859 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002860 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 /* _decoding_error will have already written into the
2862 target buffer. */
2863 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002864 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002865 /* when we get here, chr is a 32-bit unicode character */
2866 if (chr <= 0xffff)
2867 /* UCS-2 character */
2868 *p++ = (Py_UNICODE) chr;
2869 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002870 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002871 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002872#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002873 *p++ = chr;
2874#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002875 chr -= 0x10000L;
2876 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002877 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002878#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002879 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 endinpos = s-starts;
2881 outpos = p-PyUnicode_AS_UNICODE(v);
2882 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002883 errors, &errorHandler,
2884 "unicodeescape", "illegal Unicode character",
2885 starts, size, &startinpos, &endinpos, &exc, &s,
2886 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002887 goto onError;
2888 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 break;
2890
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002891 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 case 'N':
2893 message = "malformed \\N character escape";
2894 if (ucnhash_CAPI == NULL) {
2895 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002896 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002897 if (ucnhash_CAPI == NULL)
2898 goto ucnhashError;
2899 }
2900 if (*s == '{') {
2901 const char *start = s+1;
2902 /* look for the closing brace */
2903 while (*s != '}' && s < end)
2904 s++;
2905 if (s > start && s < end && *s == '}') {
2906 /* found a name. look it up in the unicode database */
2907 message = "unknown Unicode character name";
2908 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002909 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002910 goto store;
2911 }
2912 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 endinpos = s-starts;
2914 outpos = p-PyUnicode_AS_UNICODE(v);
2915 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002916 errors, &errorHandler,
2917 "unicodeescape", message,
2918 starts, size, &startinpos, &endinpos, &exc, &s,
2919 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002920 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002921 break;
2922
2923 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002924 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 message = "\\ at end of string";
2926 s--;
2927 endinpos = s-starts;
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002930 errors, &errorHandler,
2931 "unicodeescape", message,
2932 starts, size, &startinpos, &endinpos, &exc, &s,
2933 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002934 goto onError;
2935 }
2936 else {
2937 *p++ = '\\';
2938 *p++ = (unsigned char)s[-1];
2939 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002942 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002945 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002946 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002947 Py_XDECREF(errorHandler);
2948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002950
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002951 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002952 PyErr_SetString(
2953 PyExc_UnicodeError,
2954 "\\N escapes not supported (can't load unicodedata module)"
2955 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002959 return NULL;
2960
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002961 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 Py_XDECREF(errorHandler);
2964 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 return NULL;
2966}
2967
2968/* Return a Unicode-Escape string version of the Unicode object.
2969
2970 If quotes is true, the string is enclosed in u"" or u'' quotes as
2971 appropriate.
2972
2973*/
2974
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002975Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002976 Py_ssize_t size,
2977 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002978{
2979 /* like wcschr, but doesn't stop at NULL characters */
2980
2981 while (size-- > 0) {
2982 if (*s == ch)
2983 return s;
2984 s++;
2985 }
2986
2987 return NULL;
2988}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990static
2991PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002992 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 int quotes)
2994{
2995 PyObject *repr;
2996 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002998 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002999#ifdef Py_UNICODE_WIDE
3000 const Py_ssize_t expandsize = 10;
3001#else
3002 const Py_ssize_t expandsize = 6;
3003#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004
Neal Norwitz17753ec2006-08-21 22:21:19 +00003005 /* XXX(nnorwitz): rather than over-allocating, it would be
3006 better to choose a different scheme. Perhaps scan the
3007 first N-chars of the string and allocate based on that size.
3008 */
3009 /* Initial allocation is based on the longest-possible unichr
3010 escape.
3011
3012 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3013 unichr, so in this case it's the longest unichr escape. In
3014 narrow (UTF-16) builds this is five chars per source unichr
3015 since there are two unichrs in the surrogate pair, so in narrow
3016 (UTF-16) builds it's not the longest unichr escape.
3017
3018 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3019 so in the narrow (UTF-16) build case it's the longest unichr
3020 escape.
3021 */
3022
Neal Norwitze7d8be82008-07-31 17:17:14 +00003023 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003024 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003025
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003026 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003027 2
3028 + expandsize*size
3029 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (repr == NULL)
3031 return NULL;
3032
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003033 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034
3035 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003037 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 !findchar(s, size, '"')) ? '"' : '\'';
3039 }
3040 while (size-- > 0) {
3041 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003042
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003043 /* Escape quotes and backslashes */
3044 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003045 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 *p++ = '\\';
3047 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003048 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003049 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003050
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003051#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003052 /* Map 21-bit characters to '\U00xxxxxx' */
3053 else if (ch >= 0x10000) {
3054 *p++ = '\\';
3055 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003056 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3058 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3059 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3060 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3061 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3062 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003063 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003064 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003065 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003066#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003067 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3068 else if (ch >= 0xD800 && ch < 0xDC00) {
3069 Py_UNICODE ch2;
3070 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003072 ch2 = *s++;
3073 size--;
3074 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3075 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3076 *p++ = '\\';
3077 *p++ = 'U';
3078 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3080 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3081 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3082 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3083 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3084 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3085 *p++ = hexdigit[ucs & 0x0000000F];
3086 continue;
3087 }
3088 /* Fall through: isolated surrogates are copied as-is */
3089 s--;
3090 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003091 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003092#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003093
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003095 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 *p++ = '\\';
3097 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003098 *p++ = hexdigit[(ch >> 12) & 0x000F];
3099 *p++ = hexdigit[(ch >> 8) & 0x000F];
3100 *p++ = hexdigit[(ch >> 4) & 0x000F];
3101 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003103
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003104 /* Map special whitespace to '\t', \n', '\r' */
3105 else if (ch == '\t') {
3106 *p++ = '\\';
3107 *p++ = 't';
3108 }
3109 else if (ch == '\n') {
3110 *p++ = '\\';
3111 *p++ = 'n';
3112 }
3113 else if (ch == '\r') {
3114 *p++ = '\\';
3115 *p++ = 'r';
3116 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003117
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003118 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003119 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003121 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003122 *p++ = hexdigit[(ch >> 4) & 0x000F];
3123 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003125
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 /* Copy everything else as-is */
3127 else
3128 *p++ = (char) ch;
3129 }
3130 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003131 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132
3133 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003134 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 return repr;
3137}
3138
3139PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003140 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141{
3142 return unicodeescape_string(s, size, 0);
3143}
3144
3145PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3146{
3147 if (!PyUnicode_Check(unicode)) {
3148 PyErr_BadArgument();
3149 return NULL;
3150 }
3151 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153}
3154
3155/* --- Raw Unicode Escape Codec ------------------------------------------- */
3156
3157PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003158 Py_ssize_t size,
3159 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003162 Py_ssize_t startinpos;
3163 Py_ssize_t endinpos;
3164 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 const char *end;
3168 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 PyObject *errorHandler = NULL;
3170 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003171
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 /* Escaped strings will always be longer than the resulting
3173 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 length after conversion to the true value. (But decoding error
3175 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 v = _PyUnicode_New(size);
3177 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003180 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 end = s + size;
3183 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 unsigned char c;
3185 Py_UCS4 x;
3186 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003187 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003189 /* Non-escape characters are interpreted as Unicode ordinals */
3190 if (*s != '\\') {
3191 *p++ = (unsigned char)*s++;
3192 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003193 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003194 startinpos = s-starts;
3195
3196 /* \u-escapes are only interpreted iff the number of leading
3197 backslashes if odd */
3198 bs = s;
3199 for (;s < end;) {
3200 if (*s != '\\')
3201 break;
3202 *p++ = (unsigned char)*s++;
3203 }
3204 if (((s - bs) & 1) == 0 ||
3205 s >= end ||
3206 (*s != 'u' && *s != 'U')) {
3207 continue;
3208 }
3209 p--;
3210 count = *s=='u' ? 4 : 8;
3211 s++;
3212
3213 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3214 outpos = p-PyUnicode_AS_UNICODE(v);
3215 for (x = 0, i = 0; i < count; ++i, ++s) {
3216 c = (unsigned char)*s;
3217 if (!isxdigit(c)) {
3218 endinpos = s-starts;
3219 if (unicode_decode_call_errorhandler(
3220 errors, &errorHandler,
3221 "rawunicodeescape", "truncated \\uXXXX",
3222 starts, size, &startinpos, &endinpos, &exc, &s,
3223 &v, &outpos, &p))
3224 goto onError;
3225 goto nextByte;
3226 }
3227 x = (x<<4) & ~0xF;
3228 if (c >= '0' && c <= '9')
3229 x += c - '0';
3230 else if (c >= 'a' && c <= 'f')
3231 x += 10 + c - 'a';
3232 else
3233 x += 10 + c - 'A';
3234 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 /* UCS-2 character */
3237 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003238 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 /* UCS-4 character. Either store directly, or as
3240 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003242 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003243#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 x -= 0x10000L;
3245 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3246 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003247#endif
3248 } else {
3249 endinpos = s-starts;
3250 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 if (unicode_decode_call_errorhandler(
3252 errors, &errorHandler,
3253 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003254 starts, size, &startinpos, &endinpos, &exc, &s,
3255 &v, &outpos, &p))
3256 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003257 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 nextByte:
3259 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003261 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003266
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_XDECREF(errorHandler);
3270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 return NULL;
3272}
3273
3274PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
3277 PyObject *repr;
3278 char *p;
3279 char *q;
3280
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003281 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003282#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003283 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003286#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003287
Neal Norwitze7d8be82008-07-31 17:17:14 +00003288 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003290
Neal Norwitze7d8be82008-07-31 17:17:14 +00003291 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 if (repr == NULL)
3293 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003294 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003297 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 while (size-- > 0) {
3299 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 /* Map 32-bit characters to '\Uxxxxxxxx' */
3302 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003303 *p++ = '\\';
3304 *p++ = 'U';
3305 *p++ = hexdigit[(ch >> 28) & 0xf];
3306 *p++ = hexdigit[(ch >> 24) & 0xf];
3307 *p++ = hexdigit[(ch >> 20) & 0xf];
3308 *p++ = hexdigit[(ch >> 16) & 0xf];
3309 *p++ = hexdigit[(ch >> 12) & 0xf];
3310 *p++ = hexdigit[(ch >> 8) & 0xf];
3311 *p++ = hexdigit[(ch >> 4) & 0xf];
3312 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003313 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003314 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003315#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3317 if (ch >= 0xD800 && ch < 0xDC00) {
3318 Py_UNICODE ch2;
3319 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003320
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003321 ch2 = *s++;
3322 size--;
3323 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3324 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3325 *p++ = '\\';
3326 *p++ = 'U';
3327 *p++ = hexdigit[(ucs >> 28) & 0xf];
3328 *p++ = hexdigit[(ucs >> 24) & 0xf];
3329 *p++ = hexdigit[(ucs >> 20) & 0xf];
3330 *p++ = hexdigit[(ucs >> 16) & 0xf];
3331 *p++ = hexdigit[(ucs >> 12) & 0xf];
3332 *p++ = hexdigit[(ucs >> 8) & 0xf];
3333 *p++ = hexdigit[(ucs >> 4) & 0xf];
3334 *p++ = hexdigit[ucs & 0xf];
3335 continue;
3336 }
3337 /* Fall through: isolated surrogates are copied as-is */
3338 s--;
3339 size++;
3340 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003341#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003342 /* Map 16-bit characters to '\uxxxx' */
3343 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 *p++ = '\\';
3345 *p++ = 'u';
3346 *p++ = hexdigit[(ch >> 12) & 0xf];
3347 *p++ = hexdigit[(ch >> 8) & 0xf];
3348 *p++ = hexdigit[(ch >> 4) & 0xf];
3349 *p++ = hexdigit[ch & 15];
3350 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 /* Copy everything else as-is */
3352 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 *p++ = (char) ch;
3354 }
3355 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003356 if (_PyString_Resize(&repr, p - q))
3357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 return repr;
3359}
3360
3361PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3362{
3363 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 PyErr_BadArgument();
3365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
3367 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369}
3370
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003371/* --- Unicode Internal Codec ------------------------------------------- */
3372
3373PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003374 Py_ssize_t size,
3375 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003376{
3377 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003378 Py_ssize_t startinpos;
3379 Py_ssize_t endinpos;
3380 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003381 PyUnicodeObject *v;
3382 Py_UNICODE *p;
3383 const char *end;
3384 const char *reason;
3385 PyObject *errorHandler = NULL;
3386 PyObject *exc = NULL;
3387
Neal Norwitzd43069c2006-01-08 01:12:10 +00003388#ifdef Py_UNICODE_WIDE
3389 Py_UNICODE unimax = PyUnicode_GetMax();
3390#endif
3391
Armin Rigo7ccbca92006-10-04 12:17:45 +00003392 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003393 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3394 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003395 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003397 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003398 p = PyUnicode_AS_UNICODE(v);
3399 end = s + size;
3400
3401 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003402 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 /* We have to sanity check the raw data, otherwise doom looms for
3404 some malformed UCS-4 data. */
3405 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003407 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003408#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003409 end-s < Py_UNICODE_SIZE
3410 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003411 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003412 startinpos = s - starts;
3413 if (end-s < Py_UNICODE_SIZE) {
3414 endinpos = end-starts;
3415 reason = "truncated input";
3416 }
3417 else {
3418 endinpos = s - starts + Py_UNICODE_SIZE;
3419 reason = "illegal code point (> 0x10FFFF)";
3420 }
3421 outpos = p - PyUnicode_AS_UNICODE(v);
3422 if (unicode_decode_call_errorhandler(
3423 errors, &errorHandler,
3424 "unicode_internal", reason,
3425 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003426 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003427 goto onError;
3428 }
3429 }
3430 else {
3431 p++;
3432 s += Py_UNICODE_SIZE;
3433 }
3434 }
3435
Martin v. Löwis412fb672006-04-13 06:34:32 +00003436 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 goto onError;
3438 Py_XDECREF(errorHandler);
3439 Py_XDECREF(exc);
3440 return (PyObject *)v;
3441
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003443 Py_XDECREF(v);
3444 Py_XDECREF(errorHandler);
3445 Py_XDECREF(exc);
3446 return NULL;
3447}
3448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449/* --- Latin-1 Codec ------------------------------------------------------ */
3450
3451PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003452 Py_ssize_t size,
3453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
3455 PyUnicodeObject *v;
3456 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003457
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003459 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003460 Py_UNICODE r = *(unsigned char*)s;
3461 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003462 }
3463
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 v = _PyUnicode_New(size);
3465 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003468 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 p = PyUnicode_AS_UNICODE(v);
3470 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003471 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003473
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 Py_XDECREF(v);
3476 return NULL;
3477}
3478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479/* create or adjust a UnicodeEncodeError */
3480static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 const char *encoding,
3482 const Py_UNICODE *unicode, Py_ssize_t size,
3483 Py_ssize_t startpos, Py_ssize_t endpos,
3484 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003487 *exceptionObject = PyUnicodeEncodeError_Create(
3488 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
3490 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003491 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3492 goto onError;
3493 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3494 goto onError;
3495 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3496 goto onError;
3497 return;
3498 onError:
3499 Py_DECREF(*exceptionObject);
3500 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 }
3502}
3503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504/* raises a UnicodeEncodeError */
3505static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 const char *encoding,
3507 const Py_UNICODE *unicode, Py_ssize_t size,
3508 Py_ssize_t startpos, Py_ssize_t endpos,
3509 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510{
3511 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003514 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515}
3516
3517/* error handling callback helper:
3518 build arguments, call the callback and check the arguments,
3519 put the result into newpos and return the replacement string, which
3520 has to be freed by the caller */
3521static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003522 PyObject **errorHandler,
3523 const char *encoding, const char *reason,
3524 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3525 Py_ssize_t startpos, Py_ssize_t endpos,
3526 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529
3530 PyObject *restuple;
3531 PyObject *resunicode;
3532
3533 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
3538
3539 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543
3544 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003549 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 Py_DECREF(restuple);
3551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 }
3553 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 &resunicode, newpos)) {
3555 Py_DECREF(restuple);
3556 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 }
3558 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003559 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003560 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003561 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3562 Py_DECREF(restuple);
3563 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003564 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_INCREF(resunicode);
3566 Py_DECREF(restuple);
3567 return resunicode;
3568}
3569
3570static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003571 Py_ssize_t size,
3572 const char *errors,
3573 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574{
3575 /* output object */
3576 PyObject *res;
3577 /* pointers to the beginning and end+1 of input */
3578 const Py_UNICODE *startp = p;
3579 const Py_UNICODE *endp = p + size;
3580 /* pointer to the beginning of the unencodable characters */
3581 /* const Py_UNICODE *badp = NULL; */
3582 /* pointer into the output */
3583 char *str;
3584 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 Py_ssize_t respos = 0;
3586 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003587 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3588 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 PyObject *errorHandler = NULL;
3590 PyObject *exc = NULL;
3591 /* the following variable is used for caching string comparisons
3592 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3593 int known_errorHandler = -1;
3594
3595 /* allocate enough for a simple encoding without
3596 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003597 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 if (res == NULL)
3599 goto onError;
3600 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003602 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 ressize = size;
3604
3605 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003606 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003608 /* can we encode this? */
3609 if (c<limit) {
3610 /* no overflow check, because we know that the space is enough */
3611 *str++ = (char)c;
3612 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003613 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003614 else {
3615 Py_ssize_t unicodepos = p-startp;
3616 Py_ssize_t requiredsize;
3617 PyObject *repunicode;
3618 Py_ssize_t repsize;
3619 Py_ssize_t newpos;
3620 Py_ssize_t respos;
3621 Py_UNICODE *uni2;
3622 /* startpos for collecting unencodable chars */
3623 const Py_UNICODE *collstart = p;
3624 const Py_UNICODE *collend = p;
3625 /* find all unecodable characters */
3626 while ((collend < endp) && ((*collend)>=limit))
3627 ++collend;
3628 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3629 if (known_errorHandler==-1) {
3630 if ((errors==NULL) || (!strcmp(errors, "strict")))
3631 known_errorHandler = 1;
3632 else if (!strcmp(errors, "replace"))
3633 known_errorHandler = 2;
3634 else if (!strcmp(errors, "ignore"))
3635 known_errorHandler = 3;
3636 else if (!strcmp(errors, "xmlcharrefreplace"))
3637 known_errorHandler = 4;
3638 else
3639 known_errorHandler = 0;
3640 }
3641 switch (known_errorHandler) {
3642 case 1: /* strict */
3643 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3644 goto onError;
3645 case 2: /* replace */
3646 while (collstart++<collend)
3647 *str++ = '?'; /* fall through */
3648 case 3: /* ignore */
3649 p = collend;
3650 break;
3651 case 4: /* xmlcharrefreplace */
3652 respos = str-PyString_AS_STRING(res);
3653 /* determine replacement size (temporarily (mis)uses p) */
3654 for (p = collstart, repsize = 0; p < collend; ++p) {
3655 if (*p<10)
3656 repsize += 2+1+1;
3657 else if (*p<100)
3658 repsize += 2+2+1;
3659 else if (*p<1000)
3660 repsize += 2+3+1;
3661 else if (*p<10000)
3662 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003663#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003664 else
3665 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003666#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 else if (*p<100000)
3668 repsize += 2+5+1;
3669 else if (*p<1000000)
3670 repsize += 2+6+1;
3671 else
3672 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003673#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003674 }
3675 requiredsize = respos+repsize+(endp-collend);
3676 if (requiredsize > ressize) {
3677 if (requiredsize<2*ressize)
3678 requiredsize = 2*ressize;
3679 if (_PyString_Resize(&res, requiredsize))
3680 goto onError;
3681 str = PyString_AS_STRING(res) + respos;
3682 ressize = requiredsize;
3683 }
3684 /* generate replacement (temporarily (mis)uses p) */
3685 for (p = collstart; p < collend; ++p) {
3686 str += sprintf(str, "&#%d;", (int)*p);
3687 }
3688 p = collend;
3689 break;
3690 default:
3691 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3692 encoding, reason, startp, size, &exc,
3693 collstart-startp, collend-startp, &newpos);
3694 if (repunicode == NULL)
3695 goto onError;
3696 /* need more space? (at least enough for what we have+the
3697 replacement+the rest of the string, so we won't have to
3698 check space for encodable characters) */
3699 respos = str-PyString_AS_STRING(res);
3700 repsize = PyUnicode_GET_SIZE(repunicode);
3701 requiredsize = respos+repsize+(endp-collend);
3702 if (requiredsize > ressize) {
3703 if (requiredsize<2*ressize)
3704 requiredsize = 2*ressize;
3705 if (_PyString_Resize(&res, requiredsize)) {
3706 Py_DECREF(repunicode);
3707 goto onError;
3708 }
3709 str = PyString_AS_STRING(res) + respos;
3710 ressize = requiredsize;
3711 }
3712 /* check if there is anything unencodable in the replacement
3713 and copy it to the output */
3714 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3715 c = *uni2;
3716 if (c >= limit) {
3717 raise_encode_exception(&exc, encoding, startp, size,
3718 unicodepos, unicodepos+1, reason);
3719 Py_DECREF(repunicode);
3720 goto onError;
3721 }
3722 *str = (char)c;
3723 }
3724 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003725 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003726 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003727 }
3728 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003730 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003732 /* If this falls res will be NULL */
3733 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc);
3736 return res;
3737
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003738 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 Py_XDECREF(res);
3740 Py_XDECREF(errorHandler);
3741 Py_XDECREF(exc);
3742 return NULL;
3743}
3744
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003746 Py_ssize_t size,
3747 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750}
3751
3752PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3753{
3754 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003755 PyErr_BadArgument();
3756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 }
3758 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 PyUnicode_GET_SIZE(unicode),
3760 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761}
3762
3763/* --- 7-bit ASCII Codec -------------------------------------------------- */
3764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003766 Py_ssize_t size,
3767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 PyUnicodeObject *v;
3771 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003772 Py_ssize_t startinpos;
3773 Py_ssize_t endinpos;
3774 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 const char *e;
3776 PyObject *errorHandler = NULL;
3777 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003780 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 Py_UNICODE r = *(unsigned char*)s;
3782 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003783 }
Tim Petersced69f82003-09-16 20:30:58 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 v = _PyUnicode_New(size);
3786 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 e = s + size;
3792 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003793 register unsigned char c = (unsigned char)*s;
3794 if (c < 128) {
3795 *p++ = c;
3796 ++s;
3797 }
3798 else {
3799 startinpos = s-starts;
3800 endinpos = startinpos + 1;
3801 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3802 if (unicode_decode_call_errorhandler(
3803 errors, &errorHandler,
3804 "ascii", "ordinal not in range(128)",
3805 starts, size, &startinpos, &endinpos, &exc, &s,
3806 &v, &outpos, &p))
3807 goto onError;
3808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003810 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3812 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 Py_XDECREF(errorHandler);
3814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003817 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 Py_XDECREF(errorHandler);
3820 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 return NULL;
3822}
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003825 Py_ssize_t size,
3826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829}
3830
3831PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3832{
3833 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 PyErr_BadArgument();
3835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 }
3837 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 PyUnicode_GET_SIZE(unicode),
3839 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840}
3841
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003842#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003843
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003844/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003845
/* The MBCS helpers below take their lengths as int.  When size_t is wider
   than int, a Py_ssize_t length may exceed INT_MAX, so the public entry
   points must feed the data to the helpers in INT_MAX-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
3849
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no earlier character (it returns `curr` itself),
   when the previous character does not start with a lead byte, or when
   the previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3865
3866/*
3867 * Decode MBCS string into unicode object. If 'final' is set, converts
3868 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3869 */
3870static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003871 const char *s, /* MBCS string */
3872 int size, /* sizeof MBCS string */
3873 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003874{
3875 Py_UNICODE *p;
3876 Py_ssize_t n = 0;
3877 int usize = 0;
3878
3879 assert(size >= 0);
3880
3881 /* Skip trailing lead-byte unless 'final' is set */
3882 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003884
3885 /* First get the size of the result */
3886 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3888 if (usize == 0) {
3889 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3890 return -1;
3891 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 }
3893
3894 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003895 /* Create unicode object */
3896 *v = _PyUnicode_New(usize);
3897 if (*v == NULL)
3898 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003899 }
3900 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003901 /* Extend unicode object */
3902 n = PyUnicode_GET_SIZE(*v);
3903 if (_PyUnicode_Resize(v, n + usize) < 0)
3904 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905 }
3906
3907 /* Do the conversion */
3908 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003909 p = PyUnicode_AS_UNICODE(*v) + n;
3910 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3911 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3912 return -1;
3913 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914 }
3915
3916 return size;
3917}
3918
3919PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003920 Py_ssize_t size,
3921 const char *errors,
3922 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923{
3924 PyUnicodeObject *v = NULL;
3925 int done;
3926
3927 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003928 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003929
3930#ifdef NEED_RETRY
3931 retry:
3932 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003933 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003934 else
3935#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937
3938 if (done < 0) {
3939 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941 }
3942
3943 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003944 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945
3946#ifdef NEED_RETRY
3947 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 s += done;
3949 size -= done;
3950 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003951 }
3952#endif
3953
3954 return (PyObject *)v;
3955}
3956
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003957PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003958 Py_ssize_t size,
3959 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003960{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003961 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3962}
3963
3964/*
3965 * Convert unicode into string object (MBCS).
3966 * Returns 0 if succeed, -1 otherwise.
3967 */
3968static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 const Py_UNICODE *p, /* unicode */
3970 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971{
3972 int mbcssize = 0;
3973 Py_ssize_t n = 0;
3974
3975 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976
3977 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003978 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3980 if (mbcssize == 0) {
3981 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3982 return -1;
3983 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003984 }
3985
Martin v. Löwisd8251432006-06-14 05:21:04 +00003986 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 /* Create string object */
3988 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3989 if (*repr == NULL)
3990 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003991 }
3992 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 /* Extend string object */
3994 n = PyString_Size(*repr);
3995 if (_PyString_Resize(repr, n + mbcssize) < 0)
3996 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 }
3998
3999 /* Do the conversion */
4000 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 char *s = PyString_AS_STRING(*repr) + n;
4002 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4003 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4004 return -1;
4005 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004006 }
4007
4008 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004009}
4010
4011PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 Py_ssize_t size,
4013 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004014{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 PyObject *repr = NULL;
4016 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004017
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004019 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004021 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022 else
4023#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004024 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004025
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 Py_XDECREF(repr);
4028 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004029 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030
4031#ifdef NEED_RETRY
4032 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004033 p += INT_MAX;
4034 size -= INT_MAX;
4035 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004036 }
4037#endif
4038
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004039 return repr;
4040}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004041
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004042PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4043{
4044 if (!PyUnicode_Check(unicode)) {
4045 PyErr_BadArgument();
4046 return NULL;
4047 }
4048 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004049 PyUnicode_GET_SIZE(unicode),
4050 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004051}
4052
Martin v. Löwisd8251432006-06-14 05:21:04 +00004053#undef NEED_RETRY
4054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004055#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057/* --- Character Mapping Codec -------------------------------------------- */
4058
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004060 Py_ssize_t size,
4061 PyObject *mapping,
4062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t startinpos;
4066 Py_ssize_t endinpos;
4067 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 PyUnicodeObject *v;
4070 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 PyObject *errorHandler = NULL;
4073 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004074 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004075 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 /* Default to Latin-1 */
4078 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080
4081 v = _PyUnicode_New(size);
4082 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004088 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 mapstring = PyUnicode_AS_UNICODE(mapping);
4090 maplen = PyUnicode_GET_SIZE(mapping);
4091 while (s < e) {
4092 unsigned char ch = *s;
4093 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004095 if (ch < maplen)
4096 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 if (x == 0xfffe) {
4099 /* undefined mapping */
4100 outpos = p-PyUnicode_AS_UNICODE(v);
4101 startinpos = s-starts;
4102 endinpos = startinpos+1;
4103 if (unicode_decode_call_errorhandler(
4104 errors, &errorHandler,
4105 "charmap", "character maps to <undefined>",
4106 starts, size, &startinpos, &endinpos, &exc, &s,
4107 &v, &outpos, &p)) {
4108 goto onError;
4109 }
4110 continue;
4111 }
4112 *p++ = x;
4113 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004114 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004115 }
4116 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 while (s < e) {
4118 unsigned char ch = *s;
4119 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004120
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004121 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4122 w = PyInt_FromLong((long)ch);
4123 if (w == NULL)
4124 goto onError;
4125 x = PyObject_GetItem(mapping, w);
4126 Py_DECREF(w);
4127 if (x == NULL) {
4128 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4129 /* No mapping found means: mapping is undefined. */
4130 PyErr_Clear();
4131 x = Py_None;
4132 Py_INCREF(x);
4133 } else
4134 goto onError;
4135 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004136
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004137 /* Apply mapping */
4138 if (PyInt_Check(x)) {
4139 long value = PyInt_AS_LONG(x);
4140 if (value < 0 || value > 65535) {
4141 PyErr_SetString(PyExc_TypeError,
4142 "character mapping must be in range(65536)");
4143 Py_DECREF(x);
4144 goto onError;
4145 }
4146 *p++ = (Py_UNICODE)value;
4147 }
4148 else if (x == Py_None) {
4149 /* undefined mapping */
4150 outpos = p-PyUnicode_AS_UNICODE(v);
4151 startinpos = s-starts;
4152 endinpos = startinpos+1;
4153 if (unicode_decode_call_errorhandler(
4154 errors, &errorHandler,
4155 "charmap", "character maps to <undefined>",
4156 starts, size, &startinpos, &endinpos, &exc, &s,
4157 &v, &outpos, &p)) {
4158 Py_DECREF(x);
4159 goto onError;
4160 }
4161 Py_DECREF(x);
4162 continue;
4163 }
4164 else if (PyUnicode_Check(x)) {
4165 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004166
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004167 if (targetsize == 1)
4168 /* 1-1 mapping */
4169 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004170
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004171 else if (targetsize > 1) {
4172 /* 1-n mapping */
4173 if (targetsize > extrachars) {
4174 /* resize first */
4175 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4176 Py_ssize_t needed = (targetsize - extrachars) + \
4177 (targetsize << 2);
4178 extrachars += needed;
4179 /* XXX overflow detection missing */
4180 if (_PyUnicode_Resize(&v,
4181 PyUnicode_GET_SIZE(v) + needed) < 0) {
4182 Py_DECREF(x);
4183 goto onError;
4184 }
4185 p = PyUnicode_AS_UNICODE(v) + oldpos;
4186 }
4187 Py_UNICODE_COPY(p,
4188 PyUnicode_AS_UNICODE(x),
4189 targetsize);
4190 p += targetsize;
4191 extrachars -= targetsize;
4192 }
4193 /* 1-0 mapping: skip the character */
4194 }
4195 else {
4196 /* wrong return value */
4197 PyErr_SetString(PyExc_TypeError,
4198 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004199 Py_DECREF(x);
4200 goto onError;
4201 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004202 Py_DECREF(x);
4203 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 }
4206 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004207 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 Py_XDECREF(errorHandler);
4210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004212
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004213 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 Py_XDECREF(errorHandler);
4215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 Py_XDECREF(v);
4217 return NULL;
4218}
4219
Martin v. Löwis3f767792006-06-04 19:36:28 +00004220/* Charmap encoding: the lookup table */
4221
4222struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004223 PyObject_HEAD
4224 unsigned char level1[32];
4225 int count2, count3;
4226 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004227};
4228
4229static PyObject*
4230encoding_map_size(PyObject *obj, PyObject* args)
4231{
4232 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004233 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004234 128*map->count3);
4235}
4236
4237static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004238 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004239 PyDoc_STR("Return the size (in bytes) of this object") },
4240 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004241};
4242
4243static void
4244encoding_map_dealloc(PyObject* o)
4245{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004246 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004247}
4248
4249static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004250 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004251 "EncodingMap", /*tp_name*/
4252 sizeof(struct encoding_map), /*tp_basicsize*/
4253 0, /*tp_itemsize*/
4254 /* methods */
4255 encoding_map_dealloc, /*tp_dealloc*/
4256 0, /*tp_print*/
4257 0, /*tp_getattr*/
4258 0, /*tp_setattr*/
4259 0, /*tp_compare*/
4260 0, /*tp_repr*/
4261 0, /*tp_as_number*/
4262 0, /*tp_as_sequence*/
4263 0, /*tp_as_mapping*/
4264 0, /*tp_hash*/
4265 0, /*tp_call*/
4266 0, /*tp_str*/
4267 0, /*tp_getattro*/
4268 0, /*tp_setattro*/
4269 0, /*tp_as_buffer*/
4270 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4271 0, /*tp_doc*/
4272 0, /*tp_traverse*/
4273 0, /*tp_clear*/
4274 0, /*tp_richcompare*/
4275 0, /*tp_weaklistoffset*/
4276 0, /*tp_iter*/
4277 0, /*tp_iternext*/
4278 encoding_map_methods, /*tp_methods*/
4279 0, /*tp_members*/
4280 0, /*tp_getset*/
4281 0, /*tp_base*/
4282 0, /*tp_dict*/
4283 0, /*tp_descr_get*/
4284 0, /*tp_descr_set*/
4285 0, /*tp_dictoffset*/
4286 0, /*tp_init*/
4287 0, /*tp_alloc*/
4288 0, /*tp_new*/
4289 0, /*tp_free*/
4290 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004291};
4292
4293PyObject*
4294PyUnicode_BuildEncodingMap(PyObject* string)
4295{
4296 Py_UNICODE *decode;
4297 PyObject *result;
4298 struct encoding_map *mresult;
4299 int i;
4300 int need_dict = 0;
4301 unsigned char level1[32];
4302 unsigned char level2[512];
4303 unsigned char *mlevel1, *mlevel2, *mlevel3;
4304 int count2 = 0, count3 = 0;
4305
4306 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4307 PyErr_BadArgument();
4308 return NULL;
4309 }
4310 decode = PyUnicode_AS_UNICODE(string);
4311 memset(level1, 0xFF, sizeof level1);
4312 memset(level2, 0xFF, sizeof level2);
4313
4314 /* If there isn't a one-to-one mapping of NULL to \0,
4315 or if there are non-BMP characters, we need to use
4316 a mapping dictionary. */
4317 if (decode[0] != 0)
4318 need_dict = 1;
4319 for (i = 1; i < 256; i++) {
4320 int l1, l2;
4321 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004322#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004323 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004324#endif
4325 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004326 need_dict = 1;
4327 break;
4328 }
4329 if (decode[i] == 0xFFFE)
4330 /* unmapped character */
4331 continue;
4332 l1 = decode[i] >> 11;
4333 l2 = decode[i] >> 7;
4334 if (level1[l1] == 0xFF)
4335 level1[l1] = count2++;
4336 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004337 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004338 }
4339
4340 if (count2 >= 0xFF || count3 >= 0xFF)
4341 need_dict = 1;
4342
4343 if (need_dict) {
4344 PyObject *result = PyDict_New();
4345 PyObject *key, *value;
4346 if (!result)
4347 return NULL;
4348 for (i = 0; i < 256; i++) {
4349 key = value = NULL;
4350 key = PyInt_FromLong(decode[i]);
4351 value = PyInt_FromLong(i);
4352 if (!key || !value)
4353 goto failed1;
4354 if (PyDict_SetItem(result, key, value) == -1)
4355 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004356 Py_DECREF(key);
4357 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004358 }
4359 return result;
4360 failed1:
4361 Py_XDECREF(key);
4362 Py_XDECREF(value);
4363 Py_DECREF(result);
4364 return NULL;
4365 }
4366
4367 /* Create a three-level trie */
4368 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4369 16*count2 + 128*count3 - 1);
4370 if (!result)
4371 return PyErr_NoMemory();
4372 PyObject_Init(result, &EncodingMapType);
4373 mresult = (struct encoding_map*)result;
4374 mresult->count2 = count2;
4375 mresult->count3 = count3;
4376 mlevel1 = mresult->level1;
4377 mlevel2 = mresult->level23;
4378 mlevel3 = mresult->level23 + 16*count2;
4379 memcpy(mlevel1, level1, 32);
4380 memset(mlevel2, 0xFF, 16*count2);
4381 memset(mlevel3, 0, 128*count3);
4382 count3 = 0;
4383 for (i = 1; i < 256; i++) {
4384 int o1, o2, o3, i2, i3;
4385 if (decode[i] == 0xFFFE)
4386 /* unmapped character */
4387 continue;
4388 o1 = decode[i]>>11;
4389 o2 = (decode[i]>>7) & 0xF;
4390 i2 = 16*mlevel1[o1] + o2;
4391 if (mlevel2[i2] == 0xFF)
4392 mlevel2[i2] = count3++;
4393 o3 = decode[i] & 0x7F;
4394 i3 = 128*mlevel2[i2] + o3;
4395 mlevel3[i3] = i;
4396 }
4397 return result;
4398}
4399
4400static int
4401encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4402{
4403 struct encoding_map *map = (struct encoding_map*)mapping;
4404 int l1 = c>>11;
4405 int l2 = (c>>7) & 0xF;
4406 int l3 = c & 0x7F;
4407 int i;
4408
4409#ifdef Py_UNICODE_WIDE
4410 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004411 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004412 }
4413#endif
4414 if (c == 0)
4415 return 0;
4416 /* level 1*/
4417 i = map->level1[l1];
4418 if (i == 0xFF) {
4419 return -1;
4420 }
4421 /* level 2*/
4422 i = map->level23[16*i+l2];
4423 if (i == 0xFF) {
4424 return -1;
4425 }
4426 /* level 3 */
4427 i = map->level23[16*map->count2 + 128*i + l3];
4428 if (i == 0) {
4429 return -1;
4430 }
4431 return i;
4432}
4433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434/* Lookup the character ch in the mapping. If the character
4435 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004436 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 PyObject *w = PyInt_FromLong((long)c);
4440 PyObject *x;
4441
4442 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 x = PyObject_GetItem(mapping, w);
4445 Py_DECREF(w);
4446 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004447 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4448 /* No mapping found means: mapping is undefined. */
4449 PyErr_Clear();
4450 x = Py_None;
4451 Py_INCREF(x);
4452 return x;
4453 } else
4454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004456 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004457 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004459 long value = PyInt_AS_LONG(x);
4460 if (value < 0 || value > 255) {
4461 PyErr_SetString(PyExc_TypeError,
4462 "character mapping must be in range(256)");
4463 Py_DECREF(x);
4464 return NULL;
4465 }
4466 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004468 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004469 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004471 /* wrong return value */
4472 PyErr_SetString(PyExc_TypeError,
4473 "character mapping must return integer, None or str");
4474 Py_DECREF(x);
4475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
4477}
4478
Martin v. Löwis3f767792006-06-04 19:36:28 +00004479static int
4480charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4481{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004482 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4483 /* exponentially overallocate to minimize reallocations */
4484 if (requiredsize < 2*outsize)
4485 requiredsize = 2*outsize;
4486 if (_PyString_Resize(outobj, requiredsize)) {
4487 return 0;
4488 }
4489 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004490}
4491
/* Result codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495/* lookup the character, put the result in the output string and adjust
4496 various state variables. Reallocate the output string if not enough
4497 space is available. Return a new reference to the object that
4498 was put in the output buffer, or Py_None, if the mapping was undefined
4499 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004500 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004502charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004503 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505 PyObject *rep;
4506 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004507 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508
Christian Heimese93237d2007-12-19 02:37:44 +00004509 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004510 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004511 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512 if (res == -1)
4513 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004514 if (outsize<requiredsize)
4515 if (!charmapencode_resize(outobj, outpos, requiredsize))
4516 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004517 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 outstart[(*outpos)++] = (char)res;
4519 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004520 }
4521
4522 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004524 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004525 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004526 Py_DECREF(rep);
4527 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004528 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 if (PyInt_Check(rep)) {
4530 Py_ssize_t requiredsize = *outpos+1;
4531 if (outsize<requiredsize)
4532 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4533 Py_DECREF(rep);
4534 return enc_EXCEPTION;
4535 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004536 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004537 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004538 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004539 else {
4540 const char *repchars = PyString_AS_STRING(rep);
4541 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4542 Py_ssize_t requiredsize = *outpos+repsize;
4543 if (outsize<requiredsize)
4544 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4545 Py_DECREF(rep);
4546 return enc_EXCEPTION;
4547 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004548 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004549 memcpy(outstart + *outpos, repchars, repsize);
4550 *outpos += repsize;
4551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 }
Georg Brandl9f167602006-06-04 21:46:16 +00004553 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004554 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555}
4556
4557/* handle an error in PyUnicode_EncodeCharmap
4558 Return 0 on success, -1 on error */
4559static
4560int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004563 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565{
4566 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t repsize;
4568 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 Py_UNICODE *uni2;
4570 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004571 Py_ssize_t collstartpos = *inpos;
4572 Py_ssize_t collendpos = *inpos+1;
4573 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 char *encoding = "charmap";
4575 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004576 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 /* find all unencodable characters */
4579 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004580 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004581 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004582 int res = encoding_map_lookup(p[collendpos], mapping);
4583 if (res != -1)
4584 break;
4585 ++collendpos;
4586 continue;
4587 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004588
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004589 rep = charmapencode_lookup(p[collendpos], mapping);
4590 if (rep==NULL)
4591 return -1;
4592 else if (rep!=Py_None) {
4593 Py_DECREF(rep);
4594 break;
4595 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004596 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004597 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 }
4599 /* cache callback name lookup
4600 * (if not done yet, i.e. it's the first error) */
4601 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004602 if ((errors==NULL) || (!strcmp(errors, "strict")))
4603 *known_errorHandler = 1;
4604 else if (!strcmp(errors, "replace"))
4605 *known_errorHandler = 2;
4606 else if (!strcmp(errors, "ignore"))
4607 *known_errorHandler = 3;
4608 else if (!strcmp(errors, "xmlcharrefreplace"))
4609 *known_errorHandler = 4;
4610 else
4611 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 }
4613 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004614 case 1: /* strict */
4615 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4616 return -1;
4617 case 2: /* replace */
4618 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004619 x = charmapencode_output('?', mapping, res, respos);
4620 if (x==enc_EXCEPTION) {
4621 return -1;
4622 }
4623 else if (x==enc_FAILED) {
4624 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4625 return -1;
4626 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004627 }
4628 /* fall through */
4629 case 3: /* ignore */
4630 *inpos = collendpos;
4631 break;
4632 case 4: /* xmlcharrefreplace */
4633 /* generate replacement (temporarily (mis)uses p) */
4634 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004635 char buffer[2+29+1+1];
4636 char *cp;
4637 sprintf(buffer, "&#%d;", (int)p[collpos]);
4638 for (cp = buffer; *cp; ++cp) {
4639 x = charmapencode_output(*cp, mapping, res, respos);
4640 if (x==enc_EXCEPTION)
4641 return -1;
4642 else if (x==enc_FAILED) {
4643 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4644 return -1;
4645 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004646 }
4647 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004648 *inpos = collendpos;
4649 break;
4650 default:
4651 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004652 encoding, reason, p, size, exceptionObject,
4653 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004654 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004655 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004656 /* generate replacement */
4657 repsize = PyUnicode_GET_SIZE(repunicode);
4658 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004659 x = charmapencode_output(*uni2, mapping, res, respos);
4660 if (x==enc_EXCEPTION) {
4661 return -1;
4662 }
4663 else if (x==enc_FAILED) {
4664 Py_DECREF(repunicode);
4665 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4666 return -1;
4667 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004668 }
4669 *inpos = newpos;
4670 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671 }
4672 return 0;
4673}
4674
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004676 Py_ssize_t size,
4677 PyObject *mapping,
4678 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 /* output object */
4681 PyObject *res = NULL;
4682 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004683 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004685 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 PyObject *errorHandler = NULL;
4687 PyObject *exc = NULL;
4688 /* the following variable is used for caching string comparisons
4689 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4690 * 3=ignore, 4=xmlcharrefreplace */
4691 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
4693 /* Default to Latin-1 */
4694 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004695 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 /* allocate enough for a simple encoding without
4698 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004699 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 if (res == NULL)
4701 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004702 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004703 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004706 /* try to encode it */
4707 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4708 if (x==enc_EXCEPTION) /* error */
4709 goto onError;
4710 if (x==enc_FAILED) { /* unencodable character */
4711 if (charmap_encoding_error(p, size, &inpos, mapping,
4712 &exc,
4713 &known_errorHandler, &errorHandler, errors,
4714 &res, &respos)) {
4715 goto onError;
4716 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004717 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004718 else
4719 /* done with this character => adjust input position */
4720 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004724 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 if (_PyString_Resize(&res, respos))
4726 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 }
4728 Py_XDECREF(exc);
4729 Py_XDECREF(errorHandler);
4730 return res;
4731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(res);
4734 Py_XDECREF(exc);
4735 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 return NULL;
4737}
4738
4739PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741{
4742 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 PyErr_BadArgument();
4744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 }
4746 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 PyUnicode_GET_SIZE(unicode),
4748 mapping,
4749 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750}
4751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752/* create or adjust a UnicodeTranslateError */
4753static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004754 const Py_UNICODE *unicode, Py_ssize_t size,
4755 Py_ssize_t startpos, Py_ssize_t endpos,
4756 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004759 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004760 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 }
4762 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004763 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4764 goto onError;
4765 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4766 goto onError;
4767 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4768 goto onError;
4769 return;
4770 onError:
4771 Py_DECREF(*exceptionObject);
4772 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 }
4774}
4775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776/* raises a UnicodeTranslateError */
4777static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004778 const Py_UNICODE *unicode, Py_ssize_t size,
4779 Py_ssize_t startpos, Py_ssize_t endpos,
4780 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781{
4782 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004783 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004785 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786}
4787
4788/* error handling callback helper:
4789 build arguments, call the callback and check the arguments,
4790 put the result into newpos and return the replacement string, which
4791 has to be freed by the caller */
4792static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004793 PyObject **errorHandler,
4794 const char *reason,
4795 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4796 Py_ssize_t startpos, Py_ssize_t endpos,
4797 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004799 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800
Martin v. Löwis412fb672006-04-13 06:34:32 +00004801 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 PyObject *restuple;
4803 PyObject *resunicode;
4804
4805 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 }
4810
4811 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815
4816 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004817 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004821 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 Py_DECREF(restuple);
4823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
4825 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004826 &resunicode, &i_newpos)) {
4827 Py_DECREF(restuple);
4828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004830 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 else
4833 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004834 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004835 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4836 Py_DECREF(restuple);
4837 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 Py_INCREF(resunicode);
4840 Py_DECREF(restuple);
4841 return resunicode;
4842}
4843
4844/* Lookup the character ch in the mapping and put the result in result,
4845 which must be decrefed by the caller.
4846 Return 0 on success, -1 on error */
4847static
4848int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4849{
4850 PyObject *w = PyInt_FromLong((long)c);
4851 PyObject *x;
4852
4853 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 x = PyObject_GetItem(mapping, w);
4856 Py_DECREF(w);
4857 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004858 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4859 /* No mapping found means: use 1:1 mapping. */
4860 PyErr_Clear();
4861 *result = NULL;
4862 return 0;
4863 } else
4864 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 }
4866 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004867 *result = x;
4868 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 }
4870 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004871 long value = PyInt_AS_LONG(x);
4872 long max = PyUnicode_GetMax();
4873 if (value < 0 || value > max) {
4874 PyErr_Format(PyExc_TypeError,
4875 "character mapping must be in range(0x%lx)", max+1);
4876 Py_DECREF(x);
4877 return -1;
4878 }
4879 *result = x;
4880 return 0;
4881 }
4882 else if (PyUnicode_Check(x)) {
4883 *result = x;
4884 return 0;
4885 }
4886 else {
4887 /* wrong return value */
4888 PyErr_SetString(PyExc_TypeError,
4889 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004890 Py_DECREF(x);
4891 return -1;
4892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893}
4894/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 if not reallocate and adjust various state variables.
4896 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897static
Walter Dörwald4894c302003-10-24 14:25:28 +00004898int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004901 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004902 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004903 /* remember old output position */
4904 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4905 /* exponentially overallocate to minimize reallocations */
4906 if (requiredsize < 2 * oldsize)
4907 requiredsize = 2 * oldsize;
4908 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4909 return -1;
4910 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 }
4912 return 0;
4913}
4914/* lookup the character, put the result in the output string and adjust
4915 various state variables. Return a new reference to the object that
4916 was put in the output buffer in *result, or Py_None, if the mapping was
4917 undefined (in which case no character was written).
4918 The called must decref result.
4919 Return 0 on success, -1 on error. */
4920static
Walter Dörwald4894c302003-10-24 14:25:28 +00004921int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4923 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924{
Walter Dörwald4894c302003-10-24 14:25:28 +00004925 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004926 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 /* not found => default to 1:1 mapping */
4929 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 }
4931 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004932 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004934 /* no overflow check, because we know that the space is enough */
4935 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 }
4937 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004938 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4939 if (repsize==1) {
4940 /* no overflow check, because we know that the space is enough */
4941 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4942 }
4943 else if (repsize!=0) {
4944 /* more than one character */
4945 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4946 (insize - (curinp-startinp)) +
4947 repsize - 1;
4948 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4949 return -1;
4950 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4951 *outp += repsize;
4952 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 }
4954 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 return 0;
4957}
4958
4959PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004960 Py_ssize_t size,
4961 PyObject *mapping,
4962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 /* output object */
4965 PyObject *res = NULL;
4966 /* pointers to the beginning and end+1 of input */
4967 const Py_UNICODE *startp = p;
4968 const Py_UNICODE *endp = p + size;
4969 /* pointer into the output */
4970 Py_UNICODE *str;
4971 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 char *reason = "character maps to <undefined>";
4974 PyObject *errorHandler = NULL;
4975 PyObject *exc = NULL;
4976 /* the following variable is used for caching string comparisons
4977 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4978 * 3=ignore, 4=xmlcharrefreplace */
4979 int known_errorHandler = -1;
4980
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 PyErr_BadArgument();
4983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985
4986 /* allocate enough for a simple 1:1 translation without
4987 replacements, if we need more, we'll resize */
4988 res = PyUnicode_FromUnicode(NULL, size);
4989 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 /* try to encode it */
4997 PyObject *x = NULL;
4998 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4999 Py_XDECREF(x);
5000 goto onError;
5001 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005002 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005003 if (x!=Py_None) /* it worked => adjust input pointer */
5004 ++p;
5005 else { /* untranslatable character */
5006 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5007 Py_ssize_t repsize;
5008 Py_ssize_t newpos;
5009 Py_UNICODE *uni2;
5010 /* startpos for collecting untranslatable chars */
5011 const Py_UNICODE *collstart = p;
5012 const Py_UNICODE *collend = p+1;
5013 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005015 /* find all untranslatable characters */
5016 while (collend < endp) {
5017 if (charmaptranslate_lookup(*collend, mapping, &x))
5018 goto onError;
5019 Py_XDECREF(x);
5020 if (x!=Py_None)
5021 break;
5022 ++collend;
5023 }
5024 /* cache callback name lookup
5025 * (if not done yet, i.e. it's the first error) */
5026 if (known_errorHandler==-1) {
5027 if ((errors==NULL) || (!strcmp(errors, "strict")))
5028 known_errorHandler = 1;
5029 else if (!strcmp(errors, "replace"))
5030 known_errorHandler = 2;
5031 else if (!strcmp(errors, "ignore"))
5032 known_errorHandler = 3;
5033 else if (!strcmp(errors, "xmlcharrefreplace"))
5034 known_errorHandler = 4;
5035 else
5036 known_errorHandler = 0;
5037 }
5038 switch (known_errorHandler) {
5039 case 1: /* strict */
5040 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005041 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005042 case 2: /* replace */
5043 /* No need to check for space, this is a 1:1 replacement */
5044 for (coll = collstart; coll<collend; ++coll)
5045 *str++ = '?';
5046 /* fall through */
5047 case 3: /* ignore */
5048 p = collend;
5049 break;
5050 case 4: /* xmlcharrefreplace */
5051 /* generate replacement (temporarily (mis)uses p) */
5052 for (p = collstart; p < collend; ++p) {
5053 char buffer[2+29+1+1];
5054 char *cp;
5055 sprintf(buffer, "&#%d;", (int)*p);
5056 if (charmaptranslate_makespace(&res, &str,
5057 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5058 goto onError;
5059 for (cp = buffer; *cp; ++cp)
5060 *str++ = *cp;
5061 }
5062 p = collend;
5063 break;
5064 default:
5065 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5066 reason, startp, size, &exc,
5067 collstart-startp, collend-startp, &newpos);
5068 if (repunicode == NULL)
5069 goto onError;
5070 /* generate replacement */
5071 repsize = PyUnicode_GET_SIZE(repunicode);
5072 if (charmaptranslate_makespace(&res, &str,
5073 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5074 Py_DECREF(repunicode);
5075 goto onError;
5076 }
5077 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5078 *str++ = *uni2;
5079 p = startp + newpos;
5080 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005081 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005082 }
5083 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 /* Resize if we allocated to much */
5085 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005086 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005087 if (PyUnicode_Resize(&res, respos) < 0)
5088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 }
5090 Py_XDECREF(exc);
5091 Py_XDECREF(errorHandler);
5092 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005094 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005095 Py_XDECREF(res);
5096 Py_XDECREF(exc);
5097 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return NULL;
5099}
5100
5101PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005102 PyObject *mapping,
5103 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104{
5105 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 str = PyUnicode_FromObject(str);
5108 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005111 PyUnicode_GET_SIZE(str),
5112 mapping,
5113 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 Py_DECREF(str);
5115 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005116
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005117 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 Py_XDECREF(str);
5119 return NULL;
5120}
Tim Petersced69f82003-09-16 20:30:58 +00005121
Guido van Rossum9e896b32000-04-05 20:11:21 +00005122/* --- Decimal Encoder ---------------------------------------------------- */
5123
5124int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005125 Py_ssize_t length,
5126 char *output,
5127 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005128{
5129 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 PyObject *errorHandler = NULL;
5131 PyObject *exc = NULL;
5132 const char *encoding = "decimal";
5133 const char *reason = "invalid decimal Unicode string";
5134 /* the following variable is used for caching string comparisons
5135 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5136 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005137
5138 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 PyErr_BadArgument();
5140 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005141 }
5142
5143 p = s;
5144 end = s + length;
5145 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005146 register Py_UNICODE ch = *p;
5147 int decimal;
5148 PyObject *repunicode;
5149 Py_ssize_t repsize;
5150 Py_ssize_t newpos;
5151 Py_UNICODE *uni2;
5152 Py_UNICODE *collstart;
5153 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005154
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005155 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005156 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005157 ++p;
5158 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005159 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005160 decimal = Py_UNICODE_TODECIMAL(ch);
5161 if (decimal >= 0) {
5162 *output++ = '0' + decimal;
5163 ++p;
5164 continue;
5165 }
5166 if (0 < ch && ch < 256) {
5167 *output++ = (char)ch;
5168 ++p;
5169 continue;
5170 }
5171 /* All other characters are considered unencodable */
5172 collstart = p;
5173 collend = p+1;
5174 while (collend < end) {
5175 if ((0 < *collend && *collend < 256) ||
5176 !Py_UNICODE_ISSPACE(*collend) ||
5177 Py_UNICODE_TODECIMAL(*collend))
5178 break;
5179 }
5180 /* cache callback name lookup
5181 * (if not done yet, i.e. it's the first error) */
5182 if (known_errorHandler==-1) {
5183 if ((errors==NULL) || (!strcmp(errors, "strict")))
5184 known_errorHandler = 1;
5185 else if (!strcmp(errors, "replace"))
5186 known_errorHandler = 2;
5187 else if (!strcmp(errors, "ignore"))
5188 known_errorHandler = 3;
5189 else if (!strcmp(errors, "xmlcharrefreplace"))
5190 known_errorHandler = 4;
5191 else
5192 known_errorHandler = 0;
5193 }
5194 switch (known_errorHandler) {
5195 case 1: /* strict */
5196 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5197 goto onError;
5198 case 2: /* replace */
5199 for (p = collstart; p < collend; ++p)
5200 *output++ = '?';
5201 /* fall through */
5202 case 3: /* ignore */
5203 p = collend;
5204 break;
5205 case 4: /* xmlcharrefreplace */
5206 /* generate replacement (temporarily (mis)uses p) */
5207 for (p = collstart; p < collend; ++p)
5208 output += sprintf(output, "&#%d;", (int)*p);
5209 p = collend;
5210 break;
5211 default:
5212 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5213 encoding, reason, s, length, &exc,
5214 collstart-s, collend-s, &newpos);
5215 if (repunicode == NULL)
5216 goto onError;
5217 /* generate replacement */
5218 repsize = PyUnicode_GET_SIZE(repunicode);
5219 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5220 Py_UNICODE ch = *uni2;
5221 if (Py_UNICODE_ISSPACE(ch))
5222 *output++ = ' ';
5223 else {
5224 decimal = Py_UNICODE_TODECIMAL(ch);
5225 if (decimal >= 0)
5226 *output++ = '0' + decimal;
5227 else if (0 < ch && ch < 256)
5228 *output++ = (char)ch;
5229 else {
5230 Py_DECREF(repunicode);
5231 raise_encode_exception(&exc, encoding,
5232 s, length, collstart-s, collend-s, reason);
5233 goto onError;
5234 }
5235 }
5236 }
5237 p = s + newpos;
5238 Py_DECREF(repunicode);
5239 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005240 }
5241 /* 0-terminate the output string */
5242 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 Py_XDECREF(exc);
5244 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005245 return 0;
5246
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005247 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 Py_XDECREF(exc);
5249 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005250 return -1;
5251}
5252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253/* --- Helpers ------------------------------------------------------------ */
5254
Eric Smitha9f7d622008-02-17 19:46:49 +00005255#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005256#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005257
5258#include "stringlib/count.h"
5259#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005260#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005261#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005262
/* Helper macro to fix up start/end slice values against a length `len`:
 * `end` is clamped to `len`; a negative `start`/`end` is interpreted
 * relative to `len` and then clamped to 0.  `start` and `end` must be
 * modifiable lvalues; every argument is evaluated more than once, so do
 * not pass expressions with side effects.  Wrapped in do/while(0) so the
 * expansion is a single statement and is safe inside unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005277
Martin v. Löwis18e16552006-02-15 17:27:45 +00005278Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005279 PyObject *substr,
5280 Py_ssize_t start,
5281 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005283 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005284 PyUnicodeObject* str_obj;
5285 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005286
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005287 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5288 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005289 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005290 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5291 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005292 Py_DECREF(str_obj);
5293 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 }
Tim Petersced69f82003-09-16 20:30:58 +00005295
Antoine Pitrou64672132010-01-13 07:55:48 +00005296 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005297 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005298 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5299 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005300 );
5301
5302 Py_DECREF(sub_obj);
5303 Py_DECREF(str_obj);
5304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 return result;
5306}
5307
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005309 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005310 Py_ssize_t start,
5311 Py_ssize_t end,
5312 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005314 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005315
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005316 str = PyUnicode_FromObject(str);
5317 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005318 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005319 sub = PyUnicode_FromObject(sub);
5320 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005321 Py_DECREF(str);
5322 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 }
Tim Petersced69f82003-09-16 20:30:58 +00005324
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005325 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005326 result = stringlib_find_slice(
5327 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5328 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5329 start, end
5330 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005331 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005332 result = stringlib_rfind_slice(
5333 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5334 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5335 start, end
5336 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005337
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005338 Py_DECREF(str);
5339 Py_DECREF(sub);
5340
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 return result;
5342}
5343
Tim Petersced69f82003-09-16 20:30:58 +00005344static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005346 PyUnicodeObject *substring,
5347 Py_ssize_t start,
5348 Py_ssize_t end,
5349 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 if (substring->length == 0)
5352 return 1;
5353
Antoine Pitrou64672132010-01-13 07:55:48 +00005354 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 end -= substring->length;
5356 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005357 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
5359 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005360 if (Py_UNICODE_MATCH(self, end, substring))
5361 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 } else {
5363 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 }
5366
5367 return 0;
5368}
5369
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005371 PyObject *substr,
5372 Py_ssize_t start,
5373 Py_ssize_t end,
5374 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005377
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 str = PyUnicode_FromObject(str);
5379 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 substr = PyUnicode_FromObject(substr);
5382 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 Py_DECREF(str);
5384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005388 (PyUnicodeObject *)substr,
5389 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 Py_DECREF(str);
5391 Py_DECREF(substr);
5392 return result;
5393}
5394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395/* Apply fixfct filter to the Unicode object self and return a
5396 reference to the modified object */
5397
Tim Petersced69f82003-09-16 20:30:58 +00005398static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005400 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
5402
5403 PyUnicodeObject *u;
5404
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005405 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005407 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005408
5409 Py_UNICODE_COPY(u->str, self->str, self->length);
5410
Tim Peters7a29bd52001-09-12 03:03:31 +00005411 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005412 /* fixfct should return TRUE if it modified the buffer. If
5413 FALSE, return a reference to the original buffer instead
5414 (to save space, not time) */
5415 Py_INCREF(self);
5416 Py_DECREF(u);
5417 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 }
5419 return (PyObject*) u;
5420}
5421
Tim Petersced69f82003-09-16 20:30:58 +00005422static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423int fixupper(PyUnicodeObject *self)
5424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 Py_UNICODE *s = self->str;
5427 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005428
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005430 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005431
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005432 ch = Py_UNICODE_TOUPPER(*s);
5433 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005435 *s = ch;
5436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 s++;
5438 }
5439
5440 return status;
5441}
5442
Tim Petersced69f82003-09-16 20:30:58 +00005443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444int fixlower(PyUnicodeObject *self)
5445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 Py_UNICODE *s = self->str;
5448 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005453 ch = Py_UNICODE_TOLOWER(*s);
5454 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005456 *s = ch;
5457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 s++;
5459 }
5460
5461 return status;
5462}
5463
Tim Petersced69f82003-09-16 20:30:58 +00005464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465int fixswapcase(PyUnicodeObject *self)
5466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 Py_UNICODE *s = self->str;
5469 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005470
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 while (len-- > 0) {
5472 if (Py_UNICODE_ISUPPER(*s)) {
5473 *s = Py_UNICODE_TOLOWER(*s);
5474 status = 1;
5475 } else if (Py_UNICODE_ISLOWER(*s)) {
5476 *s = Py_UNICODE_TOUPPER(*s);
5477 status = 1;
5478 }
5479 s++;
5480 }
5481
5482 return status;
5483}
5484
Tim Petersced69f82003-09-16 20:30:58 +00005485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486int fixcapitalize(PyUnicodeObject *self)
5487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005489 Py_UNICODE *s = self->str;
5490 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005491
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005492 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005493 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005494 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005495 *s = Py_UNICODE_TOUPPER(*s);
5496 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005498 s++;
5499 while (--len > 0) {
5500 if (Py_UNICODE_ISUPPER(*s)) {
5501 *s = Py_UNICODE_TOLOWER(*s);
5502 status = 1;
5503 }
5504 s++;
5505 }
5506 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507}
5508
5509static
5510int fixtitle(PyUnicodeObject *self)
5511{
5512 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5513 register Py_UNICODE *e;
5514 int previous_is_cased;
5515
5516 /* Shortcut for single character strings */
5517 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005518 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5519 if (*p != ch) {
5520 *p = ch;
5521 return 1;
5522 }
5523 else
5524 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Tim Petersced69f82003-09-16 20:30:58 +00005526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 e = p + PyUnicode_GET_SIZE(self);
5528 previous_is_cased = 0;
5529 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005530 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005531
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005532 if (previous_is_cased)
5533 *p = Py_UNICODE_TOLOWER(ch);
5534 else
5535 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005536
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005537 if (Py_UNICODE_ISLOWER(ch) ||
5538 Py_UNICODE_ISUPPER(ch) ||
5539 Py_UNICODE_ISTITLE(ch))
5540 previous_is_cased = 1;
5541 else
5542 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 }
5544 return 1;
5545}
5546
Tim Peters8ce9f162004-08-27 01:49:32 +00005547PyObject *
5548PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
Tim Peters8ce9f162004-08-27 01:49:32 +00005550 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005551 const Py_UNICODE blank = ' ';
5552 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005553 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005554 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005555 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5556 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5558 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005559 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005560 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005561 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Tim Peters05eba1f2004-08-27 21:32:02 +00005563 fseq = PySequence_Fast(seq, "");
5564 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005565 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005566 }
5567
Tim Peters91879ab2004-08-27 22:35:44 +00005568 /* Grrrr. A codec may be invoked to convert str objects to
5569 * Unicode, and so it's possible to call back into Python code
5570 * during PyUnicode_FromObject(), and so it's possible for a sick
5571 * codec to change the size of fseq (if seq is a list). Therefore
5572 * we have to keep refetching the size -- can't assume seqlen
5573 * is invariant.
5574 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 seqlen = PySequence_Fast_GET_SIZE(fseq);
5576 /* If empty sequence, return u"". */
5577 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005578 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5579 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005580 }
5581 /* If singleton sequence with an exact Unicode, return that. */
5582 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005583 item = PySequence_Fast_GET_ITEM(fseq, 0);
5584 if (PyUnicode_CheckExact(item)) {
5585 Py_INCREF(item);
5586 res = (PyUnicodeObject *)item;
5587 goto Done;
5588 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005589 }
5590
Tim Peters05eba1f2004-08-27 21:32:02 +00005591 /* At least two items to join, or one that isn't exact Unicode. */
5592 if (seqlen > 1) {
5593 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005594 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005595 sep = &blank;
5596 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005597 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005598 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005599 internal_separator = PyUnicode_FromObject(separator);
5600 if (internal_separator == NULL)
5601 goto onError;
5602 sep = PyUnicode_AS_UNICODE(internal_separator);
5603 seplen = PyUnicode_GET_SIZE(internal_separator);
5604 /* In case PyUnicode_FromObject() mutated seq. */
5605 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 }
5607 }
5608
5609 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005610 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005611 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005612 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005613 res_p = PyUnicode_AS_UNICODE(res);
5614 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005615
Tim Peters05eba1f2004-08-27 21:32:02 +00005616 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005617 Py_ssize_t itemlen;
5618 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005619
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005620 item = PySequence_Fast_GET_ITEM(fseq, i);
5621 /* Convert item to Unicode. */
5622 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5623 PyErr_Format(PyExc_TypeError,
5624 "sequence item %zd: expected string or Unicode,"
5625 " %.80s found",
5626 i, Py_TYPE(item)->tp_name);
5627 goto onError;
5628 }
5629 item = PyUnicode_FromObject(item);
5630 if (item == NULL)
5631 goto onError;
5632 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005633
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 /* In case PyUnicode_FromObject() mutated seq. */
5635 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005636
Tim Peters8ce9f162004-08-27 01:49:32 +00005637 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005638 itemlen = PyUnicode_GET_SIZE(item);
5639 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005640 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005641 goto Overflow;
5642 if (i < seqlen - 1) {
5643 new_res_used += seplen;
5644 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005645 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005646 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005647 if (new_res_used > res_alloc) {
5648 /* double allocated size until it's big enough */
5649 do {
5650 res_alloc += res_alloc;
5651 if (res_alloc <= 0)
5652 goto Overflow;
5653 } while (new_res_used > res_alloc);
5654 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5655 Py_DECREF(item);
5656 goto onError;
5657 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005658 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005659 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005660
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005661 /* Copy item, and maybe the separator. */
5662 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5663 res_p += itemlen;
5664 if (i < seqlen - 1) {
5665 Py_UNICODE_COPY(res_p, sep, seplen);
5666 res_p += seplen;
5667 }
5668 Py_DECREF(item);
5669 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005670 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005671
Tim Peters05eba1f2004-08-27 21:32:02 +00005672 /* Shrink res to match the used area; this probably can't fail,
5673 * but it's cheap to check.
5674 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005675 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005676 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005677
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005678 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005680 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return (PyObject *)res;
5682
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005683 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005684 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005685 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 Py_DECREF(item);
5687 /* fall through */
5688
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005689 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005690 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005691 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005692 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return NULL;
5694}
5695
Tim Petersced69f82003-09-16 20:30:58 +00005696static
5697PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005698 Py_ssize_t left,
5699 Py_ssize_t right,
5700 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
5702 PyUnicodeObject *u;
5703
5704 if (left < 0)
5705 left = 0;
5706 if (right < 0)
5707 right = 0;
5708
Tim Peters7a29bd52001-09-12 03:03:31 +00005709 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 Py_INCREF(self);
5711 return self;
5712 }
5713
Neal Norwitze7d8be82008-07-31 17:17:14 +00005714 if (left > PY_SSIZE_T_MAX - self->length ||
5715 right > PY_SSIZE_T_MAX - (left + self->length)) {
5716 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5717 return NULL;
5718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 u = _PyUnicode_New(left + self->length + right);
5720 if (u) {
5721 if (left)
5722 Py_UNICODE_FILL(u->str, fill, left);
5723 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5724 if (right)
5725 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5726 }
5727
5728 return u;
5729}
5730
Antoine Pitrou64672132010-01-13 07:55:48 +00005731PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
5735 string = PyUnicode_FromObject(string);
5736 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Antoine Pitrou64672132010-01-13 07:55:48 +00005739 list = stringlib_splitlines(
5740 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5741 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
5743 Py_DECREF(string);
5744 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745}
5746
Tim Petersced69f82003-09-16 20:30:58 +00005747static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005749 PyUnicodeObject *substring,
5750 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005753 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005756 return stringlib_split_whitespace(
5757 (PyObject*) self, self->str, self->length, maxcount
5758 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Antoine Pitrou64672132010-01-13 07:55:48 +00005760 return stringlib_split(
5761 (PyObject*) self, self->str, self->length,
5762 substring->str, substring->length,
5763 maxcount
5764 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
Tim Petersced69f82003-09-16 20:30:58 +00005767static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005769 PyUnicodeObject *substring,
5770 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005773 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005776 return stringlib_rsplit_whitespace(
5777 (PyObject*) self, self->str, self->length, maxcount
5778 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779
Antoine Pitrou64672132010-01-13 07:55:48 +00005780 return stringlib_rsplit(
5781 (PyObject*) self, self->str, self->length,
5782 substring->str, substring->length,
5783 maxcount
5784 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785}
5786
5787static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005789 PyUnicodeObject *str1,
5790 PyUnicodeObject *str2,
5791 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792{
5793 PyUnicodeObject *u;
5794
5795 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005796 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005797 else if (maxcount == 0 || self->length == 0)
5798 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Fredrik Lundh347ee272006-05-24 16:35:18 +00005800 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005801 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005802 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005803 if (str1->length == 0)
5804 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005805 if (str1->length == 1) {
5806 /* replace characters */
5807 Py_UNICODE u1, u2;
5808 if (!findchar(self->str, self->length, str1->str[0]))
5809 goto nothing;
5810 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5811 if (!u)
5812 return NULL;
5813 Py_UNICODE_COPY(u->str, self->str, self->length);
5814 u1 = str1->str[0];
5815 u2 = str2->str[0];
5816 for (i = 0; i < u->length; i++)
5817 if (u->str[i] == u1) {
5818 if (--maxcount < 0)
5819 break;
5820 u->str[i] = u2;
5821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005823 i = stringlib_find(
5824 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005826 if (i < 0)
5827 goto nothing;
5828 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5829 if (!u)
5830 return NULL;
5831 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005832
5833 /* change everything in-place, starting with this one */
5834 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5835 i += str1->length;
5836
5837 while ( --maxcount > 0) {
5838 i = stringlib_find(self->str+i, self->length-i,
5839 str1->str, str1->length,
5840 i);
5841 if (i == -1)
5842 break;
5843 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5844 i += str1->length;
5845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005848
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005849 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005850 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 Py_UNICODE *p;
5852
5853 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005854 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5855 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005856 if (n == 0)
5857 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005858 /* new_size = self->length + n * (str2->length - str1->length)); */
5859 delta = (str2->length - str1->length);
5860 if (delta == 0) {
5861 new_size = self->length;
5862 } else {
5863 product = n * (str2->length - str1->length);
5864 if ((product / (str2->length - str1->length)) != n) {
5865 PyErr_SetString(PyExc_OverflowError,
5866 "replace string is too long");
5867 return NULL;
5868 }
5869 new_size = self->length + product;
5870 if (new_size < 0) {
5871 PyErr_SetString(PyExc_OverflowError,
5872 "replace string is too long");
5873 return NULL;
5874 }
5875 }
5876 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005877 if (!u)
5878 return NULL;
5879 i = 0;
5880 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005881 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005882 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005883 while (n-- > 0) {
5884 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005885 j = stringlib_find(self->str+i, self->length-i,
5886 str1->str, str1->length,
5887 i);
5888 if (j == -1)
5889 break;
5890 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005891 /* copy unchanged part [i:j] */
5892 Py_UNICODE_COPY(p, self->str+i, j-i);
5893 p += j - i;
5894 }
5895 /* copy substitution string */
5896 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005897 Py_UNICODE_COPY(p, str2->str, str2->length);
5898 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005899 }
5900 i = j + str1->length;
5901 }
5902 if (i < self->length)
5903 /* copy tail [i:] */
5904 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005905 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005906 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005907 while (n > 0) {
5908 Py_UNICODE_COPY(p, str2->str, str2->length);
5909 p += str2->length;
5910 if (--n <= 0)
5911 break;
5912 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005914 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
5916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005918
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005919 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005920 /* nothing to replace; return original string (when possible) */
5921 if (PyUnicode_CheckExact(self)) {
5922 Py_INCREF(self);
5923 return (PyObject *) self;
5924 }
5925 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926}
5927
5928/* --- Unicode Object Methods --------------------------------------------- */
5929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005931 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932\n\
5933Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
5936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005937unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 return fixup(self, fixtitle);
5940}
5941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005943 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944\n\
5945Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005946have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
5948static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005949unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 return fixup(self, fixcapitalize);
5952}
5953
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a word list,
   replace every entry by its capitalized form, then join the list again.
   Returns NULL if splitting or capitalizing fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Break the string into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old entry before storing the replacement in its slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
        idx++;
    }

    /* Glue the words back together into the result string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5991
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005992/* Argument converter. Coerces to a single unicode character */
5993
5994static int
5995convert_uc(PyObject *obj, void *addr)
5996{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005997 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5998 PyObject *uniobj;
5999 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006000
Benjamin Peterson857ce152009-01-31 16:29:18 +00006001 uniobj = PyUnicode_FromObject(obj);
6002 if (uniobj == NULL) {
6003 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006004 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006005 return 0;
6006 }
6007 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6008 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006009 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006010 Py_DECREF(uniobj);
6011 return 0;
6012 }
6013 unistr = PyUnicode_AS_UNICODE(uniobj);
6014 *fillcharloc = unistr[0];
6015 Py_DECREF(uniobj);
6016 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006017}
6018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006020 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006022Return S centered in a Unicode string of length width. Padding is\n\
6023done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025static PyObject *
6026unicode_center(PyUnicodeObject *self, PyObject *args)
6027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006028 Py_ssize_t marg, left;
6029 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006030 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
Thomas Woutersde017742006-02-16 19:34:37 +00006032 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 return NULL;
6034
Tim Peters7a29bd52001-09-12 03:03:31 +00006035 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 Py_INCREF(self);
6037 return (PyObject*) self;
6038 }
6039
6040 marg = width - self->length;
6041 left = marg / 2 + (marg & width & 1);
6042
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006043 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044}
6045
Marc-André Lemburge5034372000-08-08 08:04:29 +00006046#if 0
6047
6048/* This code should go into some future Unicode collation support
6049 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006050 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006051
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006052/* speedy UTF-16 code point order comparison */
6053/* gleaned from: */
6054/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6055
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006056static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006057{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006058 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006059 0, 0, 0, 0, 0, 0, 0, 0,
6060 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006061 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006062};
6063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064static int
6065unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6066{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006067 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 Py_UNICODE *s1 = str1->str;
6070 Py_UNICODE *s2 = str2->str;
6071
6072 len1 = str1->length;
6073 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006076 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077
6078 c1 = *s1++;
6079 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006080
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006081 if (c1 > (1<<11) * 26)
6082 c1 += utf16Fixup[c1>>11];
6083 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006086
6087 if (c1 != c2)
6088 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006089
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006090 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 }
6092
6093 return (len1 < len2) ? -1 : (len1 != len2);
6094}
6095
Marc-André Lemburge5034372000-08-08 08:04:29 +00006096#else
6097
6098static int
6099unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6100{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006102
6103 Py_UNICODE *s1 = str1->str;
6104 Py_UNICODE *s2 = str2->str;
6105
6106 len1 = str1->length;
6107 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006108
Marc-André Lemburge5034372000-08-08 08:04:29 +00006109 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006110 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006111
Fredrik Lundh45714e92001-06-26 16:39:36 +00006112 c1 = *s1++;
6113 c2 = *s2++;
6114
6115 if (c1 != c2)
6116 return (c1 < c2) ? -1 : 1;
6117
Marc-André Lemburge5034372000-08-08 08:04:29 +00006118 len1--; len2--;
6119 }
6120
6121 return (len1 < len2) ? -1 : (len1 != len2);
6122}
6123
6124#endif
6125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006127 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
6129 PyUnicodeObject *u = NULL, *v = NULL;
6130 int result;
6131
6132 /* Coerce the two arguments */
6133 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6134 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006135 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6137 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006138 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Thomas Wouters7e474022000-07-16 12:04:32 +00006140 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006142 Py_DECREF(u);
6143 Py_DECREF(v);
6144 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 }
6146
6147 result = unicode_compare(u, v);
6148
6149 Py_DECREF(u);
6150 Py_DECREF(v);
6151 return result;
6152
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006153 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 Py_XDECREF(u);
6155 Py_XDECREF(v);
6156 return -1;
6157}
6158
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006159PyObject *PyUnicode_RichCompare(PyObject *left,
6160 PyObject *right,
6161 int op)
6162{
6163 int result;
6164
6165 result = PyUnicode_Compare(left, right);
6166 if (result == -1 && PyErr_Occurred())
6167 goto onError;
6168
6169 /* Convert the return value to a Boolean */
6170 switch (op) {
6171 case Py_EQ:
6172 result = (result == 0);
6173 break;
6174 case Py_NE:
6175 result = (result != 0);
6176 break;
6177 case Py_LE:
6178 result = (result <= 0);
6179 break;
6180 case Py_GE:
6181 result = (result >= 0);
6182 break;
6183 case Py_LT:
6184 result = (result == -1);
6185 break;
6186 case Py_GT:
6187 result = (result == 1);
6188 break;
6189 }
6190 return PyBool_FromLong(result);
6191
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006192 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006193
6194 /* Standard case
6195
6196 Type errors mean that PyUnicode_FromObject() could not convert
6197 one of the arguments (usually the right hand side) to Unicode,
6198 ie. we can't handle the comparison request. However, it is
6199 possible that the other object knows a comparison method, which
6200 is why we return Py_NotImplemented to give the other object a
6201 chance.
6202
6203 */
6204 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6205 PyErr_Clear();
6206 Py_INCREF(Py_NotImplemented);
6207 return Py_NotImplemented;
6208 }
6209 if (op != Py_EQ && op != Py_NE)
6210 return NULL;
6211
6212 /* Equality comparison.
6213
6214 This is a special case: we silence any PyExc_UnicodeDecodeError
6215 and instead turn it into a PyErr_UnicodeWarning.
6216
6217 */
6218 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6219 return NULL;
6220 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006221 if (PyErr_Warn(PyExc_UnicodeWarning,
6222 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006223 "Unicode equal comparison "
6224 "failed to convert both arguments to Unicode - "
6225 "interpreting them as being unequal" :
6226 "Unicode unequal comparison "
6227 "failed to convert both arguments to Unicode - "
6228 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006229 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006230 return NULL;
6231 result = (op == Py_NE);
6232 return PyBool_FromLong(result);
6233}
6234
Guido van Rossum403d68b2000-03-13 15:55:09 +00006235int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006236 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006237{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006238 PyObject *str, *sub;
6239 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006240
6241 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006242 sub = PyUnicode_FromObject(element);
6243 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006244 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006245 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006246
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006247 str = PyUnicode_FromObject(container);
6248 if (!str) {
6249 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006250 return -1;
6251 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006253 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006254
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006255 Py_DECREF(str);
6256 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006257
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006258 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006259}
6260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261/* Concat to string or Unicode object giving a new Unicode object. */
6262
6263PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006264 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265{
6266 PyUnicodeObject *u = NULL, *v = NULL, *w;
6267
6268 /* Coerce the two arguments */
6269 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6270 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6273 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
6276 /* Shortcuts */
6277 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006278 Py_DECREF(v);
6279 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 }
6281 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006282 Py_DECREF(u);
6283 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 }
6285
6286 /* Concat the two Unicode strings */
6287 w = _PyUnicode_New(u->length + v->length);
6288 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006289 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 Py_UNICODE_COPY(w->str, u->str, u->length);
6291 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6292
6293 Py_DECREF(u);
6294 Py_DECREF(v);
6295 return (PyObject *)w;
6296
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006297 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 Py_XDECREF(u);
6299 Py_XDECREF(v);
6300 return NULL;
6301}
6302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006303PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006304 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006306Return the number of non-overlapping occurrences of substring sub in\n\
6307Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006308interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309
6310static PyObject *
6311unicode_count(PyUnicodeObject *self, PyObject *args)
6312{
6313 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006314 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006315 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 PyObject *result;
6317
Guido van Rossumb8872e62000-05-09 14:14:27 +00006318 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006319 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 return NULL;
6321
6322 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006323 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006325 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006326
Antoine Pitrou64672132010-01-13 07:55:48 +00006327 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006328 result = PyInt_FromSsize_t(
6329 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006330 substring->str, substring->length,
6331 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006332 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
6334 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 return result;
6337}
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006340 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006342Encodes S using the codec registered for encoding. encoding defaults\n\
6343to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006344handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6346'xmlcharrefreplace' as well as any other name registered with\n\
6347codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348
6349static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006350unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006352 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 char *encoding = NULL;
6354 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006355 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006356
Benjamin Peterson332d7212009-09-18 21:14:55 +00006357 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6358 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006360 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006361 if (v == NULL)
6362 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006363 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006364 PyErr_Format(PyExc_TypeError,
6365 "encoder did not return a string/unicode object "
6366 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006367 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006368 Py_DECREF(v);
6369 return NULL;
6370 }
6371 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006372
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006373 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006374 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006375}
6376
6377PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006378 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006379\n\
6380Decodes S using the codec registered for encoding. encoding defaults\n\
6381to the default encoding. errors may be given to set a different error\n\
6382handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6383a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6384as well as any other name registerd with codecs.register_error that is\n\
6385able to handle UnicodeDecodeErrors.");
6386
6387static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006388unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006390 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006391 char *encoding = NULL;
6392 char *errors = NULL;
6393 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006394
Benjamin Peterson332d7212009-09-18 21:14:55 +00006395 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6396 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397 return NULL;
6398 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006399 if (v == NULL)
6400 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006401 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006402 PyErr_Format(PyExc_TypeError,
6403 "decoder did not return a string/unicode object "
6404 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006405 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006406 Py_DECREF(v);
6407 return NULL;
6408 }
6409 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006410
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006411 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413}
6414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006415PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006416 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417\n\
6418Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
6421static PyObject*
6422unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6423{
6424 Py_UNICODE *e;
6425 Py_UNICODE *p;
6426 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006427 Py_UNICODE *qe;
6428 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 PyUnicodeObject *u;
6430 int tabsize = 8;
6431
6432 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
Thomas Wouters7e474022000-07-16 12:04:32 +00006435 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006436 i = 0; /* chars up to and including most recent \n or \r */
6437 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6438 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 for (p = self->str; p < e; p++)
6440 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006441 if (tabsize > 0) {
6442 incr = tabsize - (j % tabsize); /* cannot overflow */
6443 if (j > PY_SSIZE_T_MAX - incr)
6444 goto overflow1;
6445 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006446 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006449 if (j > PY_SSIZE_T_MAX - 1)
6450 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 j++;
6452 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006453 if (i > PY_SSIZE_T_MAX - j)
6454 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006456 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 }
6458 }
6459
Guido van Rossum5bdff602008-03-11 21:18:06 +00006460 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006461 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006462
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 /* Second pass: create output string and fill it */
6464 u = _PyUnicode_New(i + j);
6465 if (!u)
6466 return NULL;
6467
Guido van Rossum5bdff602008-03-11 21:18:06 +00006468 j = 0; /* same as in first pass */
6469 q = u->str; /* next output char */
6470 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472 for (p = self->str; p < e; p++)
6473 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006474 if (tabsize > 0) {
6475 i = tabsize - (j % tabsize);
6476 j += i;
6477 while (i--) {
6478 if (q >= qe)
6479 goto overflow2;
6480 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006481 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006482 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006483 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006484 else {
6485 if (q >= qe)
6486 goto overflow2;
6487 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006488 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 if (*p == '\n' || *p == '\r')
6490 j = 0;
6491 }
6492
6493 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006494
6495 overflow2:
6496 Py_DECREF(u);
6497 overflow1:
6498 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500}
6501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006503 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504\n\
6505Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006506such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507arguments start and end are interpreted as in slice notation.\n\
6508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511static PyObject *
6512unicode_find(PyUnicodeObject *self, PyObject *args)
6513{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006514 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006515 Py_ssize_t start;
6516 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006517 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518
Facundo Batista57d56692007-11-16 18:04:14 +00006519 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006522 result = stringlib_find_slice(
6523 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6524 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6525 start, end
6526 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527
6528 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006529
6530 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531}
6532
6533static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006534unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535{
6536 if (index < 0 || index >= self->length) {
6537 PyErr_SetString(PyExc_IndexError, "string index out of range");
6538 return NULL;
6539 }
6540
6541 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6542}
6543
6544static long
6545unicode_hash(PyUnicodeObject *self)
6546{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006547 /* Since Unicode objects compare equal to their ASCII string
6548 counterparts, they should use the individual character values
6549 as basis for their hash value. This is needed to assure that
6550 strings and Unicode objects behave in the same way as
6551 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
Martin v. Löwis18e16552006-02-15 17:27:45 +00006553 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006554 register Py_UNICODE *p;
6555 register long x;
6556
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006558 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006559 len = PyUnicode_GET_SIZE(self);
6560 p = PyUnicode_AS_UNICODE(self);
6561 x = *p << 7;
6562 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006563 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006564 x ^= PyUnicode_GET_SIZE(self);
6565 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006566 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006567 self->hash = x;
6568 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569}
6570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006571PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006572 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006574Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
6576static PyObject *
6577unicode_index(PyUnicodeObject *self, PyObject *args)
6578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006580 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006581 Py_ssize_t start;
6582 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583
Facundo Batista57d56692007-11-16 18:04:14 +00006584 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006587 result = stringlib_find_slice(
6588 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6589 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6590 start, end
6591 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
6593 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006594
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 if (result < 0) {
6596 PyErr_SetString(PyExc_ValueError, "substring not found");
6597 return NULL;
6598 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006599
Martin v. Löwis18e16552006-02-15 17:27:45 +00006600 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601}
6602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006603PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006604 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006606Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006607at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
6609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006610unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611{
6612 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6613 register const Py_UNICODE *e;
6614 int cased;
6615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 /* Shortcut for single character strings */
6617 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006618 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006620 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006621 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006622 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 e = p + PyUnicode_GET_SIZE(self);
6625 cased = 0;
6626 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006627 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006628
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006629 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6630 return PyBool_FromLong(0);
6631 else if (!cased && Py_UNICODE_ISLOWER(ch))
6632 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006634 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635}
6636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006638 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006640Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006641at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
6643static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006644unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
6646 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6647 register const Py_UNICODE *e;
6648 int cased;
6649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 /* Shortcut for single character strings */
6651 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006652 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006654 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006655 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006657
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 e = p + PyUnicode_GET_SIZE(self);
6659 cased = 0;
6660 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006661 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006662
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006663 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6664 return PyBool_FromLong(0);
6665 else if (!cased && Py_UNICODE_ISUPPER(ch))
6666 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006668 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669}
6670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006671PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006672 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006674Return True if S is a titlecased string and there is at least one\n\
6675character in S, i.e. upper- and titlecase characters may only\n\
6676follow uncased characters and lowercase characters only cased ones.\n\
6677Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
6679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006680unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681{
6682 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6683 register const Py_UNICODE *e;
6684 int cased, previous_is_cased;
6685
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 /* Shortcut for single character strings */
6687 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006688 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6689 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006691 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006692 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006693 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 e = p + PyUnicode_GET_SIZE(self);
6696 cased = 0;
6697 previous_is_cased = 0;
6698 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006699 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006701 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6702 if (previous_is_cased)
6703 return PyBool_FromLong(0);
6704 previous_is_cased = 1;
6705 cased = 1;
6706 }
6707 else if (Py_UNICODE_ISLOWER(ch)) {
6708 if (!previous_is_cased)
6709 return PyBool_FromLong(0);
6710 previous_is_cased = 1;
6711 cased = 1;
6712 }
6713 else
6714 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006716 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006720 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006722Return True if all characters in S are whitespace\n\
6723and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006726unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
6728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6729 register const Py_UNICODE *e;
6730
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 /* Shortcut for single character strings */
6732 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 Py_UNICODE_ISSPACE(*p))
6734 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006736 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006737 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006738 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 e = p + PyUnicode_GET_SIZE(self);
6741 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006742 if (!Py_UNICODE_ISSPACE(*p))
6743 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006745 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006748PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006749 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006750\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006751Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006752and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006753
6754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006755unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006756{
6757 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6758 register const Py_UNICODE *e;
6759
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006760 /* Shortcut for single character strings */
6761 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 Py_UNICODE_ISALPHA(*p))
6763 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006764
6765 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006766 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006767 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006768
6769 e = p + PyUnicode_GET_SIZE(self);
6770 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006771 if (!Py_UNICODE_ISALPHA(*p))
6772 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006778 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006780Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782
6783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006784unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785{
6786 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6787 register const Py_UNICODE *e;
6788
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789 /* Shortcut for single character strings */
6790 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 Py_UNICODE_ISALNUM(*p))
6792 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006793
6794 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006795 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006796 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797
6798 e = p + PyUnicode_GET_SIZE(self);
6799 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006800 if (!Py_UNICODE_ISALNUM(*p))
6801 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006803 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006813unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
6815 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6816 register const Py_UNICODE *e;
6817
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 /* Shortcut for single character strings */
6819 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006820 Py_UNICODE_ISDECIMAL(*p))
6821 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006823 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006824 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006825 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006826
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 e = p + PyUnicode_GET_SIZE(self);
6828 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 if (!Py_UNICODE_ISDECIMAL(*p))
6830 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006832 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
6834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006835PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006838Return True if all characters in S are digits\n\
6839and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840
6841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006842unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
6844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6845 register const Py_UNICODE *e;
6846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 /* Shortcut for single character strings */
6848 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006849 Py_UNICODE_ISDIGIT(*p))
6850 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006852 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006853 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006854 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 e = p + PyUnicode_GET_SIZE(self);
6857 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 if (!Py_UNICODE_ISDIGIT(*p))
6859 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862}
6863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006864PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006867Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006868False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869
6870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006871unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
6873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6874 register const Py_UNICODE *e;
6875
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 /* Shortcut for single character strings */
6877 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006878 Py_UNICODE_ISNUMERIC(*p))
6879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006881 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006882 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006884
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 e = p + PyUnicode_GET_SIZE(self);
6886 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006887 if (!Py_UNICODE_ISNUMERIC(*p))
6888 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891}
6892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006893PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006894 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895\n\
6896Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006897iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898
6899static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006900unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006902 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903}
6904
Martin v. Löwis18e16552006-02-15 17:27:45 +00006905static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906unicode_length(PyUnicodeObject *self)
6907{
6908 return self->length;
6909}
6910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006911PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006912 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006914Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006915done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
6917static PyObject *
6918unicode_ljust(PyUnicodeObject *self, PyObject *args)
6919{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006920 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006921 Py_UNICODE fillchar = ' ';
6922
Martin v. Löwis412fb672006-04-13 06:34:32 +00006923 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 return NULL;
6925
Tim Peters7a29bd52001-09-12 03:03:31 +00006926 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 Py_INCREF(self);
6928 return (PyObject*) self;
6929 }
6930
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006931 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932}
6933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006934PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006935 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006940unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 return fixup(self, fixlower);
6943}
6944
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6953
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006954/* externally visible for str.strip(unicode) */
6955PyObject *
6956_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6957{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006958 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6959 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6960 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6961 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6962 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006963
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006964 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006965
Benjamin Peterson857ce152009-01-31 16:29:18 +00006966 i = 0;
6967 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006968 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6969 i++;
6970 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006971 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006972
Benjamin Peterson857ce152009-01-31 16:29:18 +00006973 j = len;
6974 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006975 do {
6976 j--;
6977 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6978 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006979 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006980
Benjamin Peterson857ce152009-01-31 16:29:18 +00006981 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006982 Py_INCREF(self);
6983 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006984 }
6985 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006986 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006987}
6988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989
6990static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006991do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006993 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6994 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006995
Benjamin Peterson857ce152009-01-31 16:29:18 +00006996 i = 0;
6997 if (striptype != RIGHTSTRIP) {
6998 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6999 i++;
7000 }
7001 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007002
Benjamin Peterson857ce152009-01-31 16:29:18 +00007003 j = len;
7004 if (striptype != LEFTSTRIP) {
7005 do {
7006 j--;
7007 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7008 j++;
7009 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007010
Benjamin Peterson857ce152009-01-31 16:29:18 +00007011 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7012 Py_INCREF(self);
7013 return (PyObject*)self;
7014 }
7015 else
7016 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007019
7020static PyObject *
7021do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7022{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007023 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007024
Benjamin Peterson857ce152009-01-31 16:29:18 +00007025 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7026 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007027
Benjamin Peterson857ce152009-01-31 16:29:18 +00007028 if (sep != NULL && sep != Py_None) {
7029 if (PyUnicode_Check(sep))
7030 return _PyUnicode_XStrip(self, striptype, sep);
7031 else if (PyString_Check(sep)) {
7032 PyObject *res;
7033 sep = PyUnicode_FromObject(sep);
7034 if (sep==NULL)
7035 return NULL;
7036 res = _PyUnicode_XStrip(self, striptype, sep);
7037 Py_DECREF(sep);
7038 return res;
7039 }
7040 else {
7041 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007042 "%s arg must be None, unicode or str",
7043 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007044 return NULL;
7045 }
7046 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007047
Benjamin Peterson857ce152009-01-31 16:29:18 +00007048 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049}
7050
7051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007053 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054\n\
7055Return a copy of the string S with leading and trailing\n\
7056whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007057If chars is given and not None, remove characters in chars instead.\n\
7058If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007059
7060static PyObject *
7061unicode_strip(PyUnicodeObject *self, PyObject *args)
7062{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007063 if (PyTuple_GET_SIZE(args) == 0)
7064 return do_strip(self, BOTHSTRIP); /* Common case */
7065 else
7066 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067}
7068
7069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007070PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007071 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007072\n\
7073Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007074If chars is given and not None, remove characters in chars instead.\n\
7075If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076
7077static PyObject *
7078unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7079{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007080 if (PyTuple_GET_SIZE(args) == 0)
7081 return do_strip(self, LEFTSTRIP); /* Common case */
7082 else
7083 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084}
7085
7086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007088 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089\n\
7090Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007091If chars is given and not None, remove characters in chars instead.\n\
7092If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093
7094static PyObject *
7095unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7096{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007097 if (PyTuple_GET_SIZE(args) == 0)
7098 return do_strip(self, RIGHTSTRIP); /* Common case */
7099 else
7100 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101}
7102
7103
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007105unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106{
7107 PyUnicodeObject *u;
7108 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007109 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007110 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111
7112 if (len < 0)
7113 len = 0;
7114
Tim Peters7a29bd52001-09-12 03:03:31 +00007115 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 /* no repeat, return original string */
7117 Py_INCREF(str);
7118 return (PyObject*) str;
7119 }
Tim Peters8f422462000-09-09 06:13:41 +00007120
7121 /* ensure # of chars needed doesn't overflow int and # of bytes
7122 * needed doesn't overflow size_t
7123 */
7124 nchars = len * str->length;
7125 if (len && nchars / len != str->length) {
7126 PyErr_SetString(PyExc_OverflowError,
7127 "repeated string is too long");
7128 return NULL;
7129 }
7130 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7131 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7132 PyErr_SetString(PyExc_OverflowError,
7133 "repeated string is too long");
7134 return NULL;
7135 }
7136 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 if (!u)
7138 return NULL;
7139
7140 p = u->str;
7141
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007142 if (str->length == 1 && len > 0) {
7143 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007144 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007145 Py_ssize_t done = 0; /* number of characters copied this far */
7146 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007147 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007148 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007149 }
7150 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007151 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007152 Py_UNICODE_COPY(p+done, p, n);
7153 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007154 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157 return (PyObject*) u;
7158}
7159
7160PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007161 PyObject *subobj,
7162 PyObject *replobj,
7163 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164{
7165 PyObject *self;
7166 PyObject *str1;
7167 PyObject *str2;
7168 PyObject *result;
7169
7170 self = PyUnicode_FromObject(obj);
7171 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007172 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 str1 = PyUnicode_FromObject(subobj);
7174 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007175 Py_DECREF(self);
7176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 }
7178 str2 = PyUnicode_FromObject(replobj);
7179 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007180 Py_DECREF(self);
7181 Py_DECREF(str1);
7182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 }
Tim Petersced69f82003-09-16 20:30:58 +00007184 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007185 (PyUnicodeObject *)str1,
7186 (PyUnicodeObject *)str2,
7187 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 Py_DECREF(self);
7189 Py_DECREF(str1);
7190 Py_DECREF(str2);
7191 return result;
7192}
7193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007194PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007195 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196\n\
7197Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007198old replaced by new. If the optional argument count is\n\
7199given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200
7201static PyObject*
7202unicode_replace(PyUnicodeObject *self, PyObject *args)
7203{
7204 PyUnicodeObject *str1;
7205 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 PyObject *result;
7208
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 return NULL;
7211 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7212 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007215 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007216 Py_DECREF(str1);
7217 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219
7220 result = replace(self, str1, str2, maxcount);
7221
7222 Py_DECREF(str1);
7223 Py_DECREF(str2);
7224 return result;
7225}
7226
7227static
7228PyObject *unicode_repr(PyObject *unicode)
7229{
7230 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007231 PyUnicode_GET_SIZE(unicode),
7232 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233}
7234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007236 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237\n\
7238Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007239such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240arguments start and end are interpreted as in slice notation.\n\
7241\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007242Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243
7244static PyObject *
7245unicode_rfind(PyUnicodeObject *self, PyObject *args)
7246{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007247 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007248 Py_ssize_t start;
7249 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007250 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
Facundo Batista57d56692007-11-16 18:04:14 +00007252 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007255 result = stringlib_rfind_slice(
7256 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7257 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7258 start, end
7259 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260
7261 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007262
7263 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264}
7265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007266PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007267 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007269Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
7271static PyObject *
7272unicode_rindex(PyUnicodeObject *self, PyObject *args)
7273{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007274 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007275 Py_ssize_t start;
7276 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007277 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278
Facundo Batista57d56692007-11-16 18:04:14 +00007279 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007282 result = stringlib_rfind_slice(
7283 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7284 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7285 start, end
7286 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
7288 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007289
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290 if (result < 0) {
7291 PyErr_SetString(PyExc_ValueError, "substring not found");
7292 return NULL;
7293 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007294 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295}
7296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007297PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007298 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007300Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007301done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302
7303static PyObject *
7304unicode_rjust(PyUnicodeObject *self, PyObject *args)
7305{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007306 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007307 Py_UNICODE fillchar = ' ';
7308
Martin v. Löwis412fb672006-04-13 06:34:32 +00007309 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 return NULL;
7311
Tim Peters7a29bd52001-09-12 03:03:31 +00007312 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 Py_INCREF(self);
7314 return (PyObject*) self;
7315 }
7316
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007317 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318}
7319
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007321unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322{
7323 /* standard clamping */
7324 if (start < 0)
7325 start = 0;
7326 if (end < 0)
7327 end = 0;
7328 if (end > self->length)
7329 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007330 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 /* full slice, return original string */
7332 Py_INCREF(self);
7333 return (PyObject*) self;
7334 }
7335 if (start > end)
7336 start = end;
7337 /* copy slice */
7338 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007339 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340}
7341
7342PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007343 PyObject *sep,
7344 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345{
7346 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007347
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 s = PyUnicode_FromObject(s);
7349 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007350 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007351 if (sep != NULL) {
7352 sep = PyUnicode_FromObject(sep);
7353 if (sep == NULL) {
7354 Py_DECREF(s);
7355 return NULL;
7356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 }
7358
7359 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7360
7361 Py_DECREF(s);
7362 Py_XDECREF(sep);
7363 return result;
7364}
7365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007367 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368\n\
7369Return a list of the words in S, using sep as the\n\
7370delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007371splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007372whitespace string is a separator and empty strings are\n\
7373removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374
7375static PyObject*
7376unicode_split(PyUnicodeObject *self, PyObject *args)
7377{
7378 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007379 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380
Martin v. Löwis18e16552006-02-15 17:27:45 +00007381 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 return NULL;
7383
7384 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007385 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007387 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007389 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007392PyObject *
7393PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7394{
7395 PyObject* str_obj;
7396 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007397 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007398
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007399 str_obj = PyUnicode_FromObject(str_in);
7400 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007401 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007402 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007403 if (!sep_obj) {
7404 Py_DECREF(str_obj);
7405 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007406 }
7407
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007408 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007409 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7410 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7411 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007412
Fredrik Lundhb9479482006-05-26 17:22:38 +00007413 Py_DECREF(sep_obj);
7414 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007415
7416 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007417}
7418
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007419
7420PyObject *
7421PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7422{
7423 PyObject* str_obj;
7424 PyObject* sep_obj;
7425 PyObject* out;
7426
7427 str_obj = PyUnicode_FromObject(str_in);
7428 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007429 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007430 sep_obj = PyUnicode_FromObject(sep_in);
7431 if (!sep_obj) {
7432 Py_DECREF(str_obj);
7433 return NULL;
7434 }
7435
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007436 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007437 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7438 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7439 );
7440
7441 Py_DECREF(sep_obj);
7442 Py_DECREF(str_obj);
7443
7444 return out;
7445}
7446
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007447PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007448 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007449\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007450Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007451the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007452found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007453
7454static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007455unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007456{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007457 return PyUnicode_Partition((PyObject *)self, separator);
7458}
7459
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007460PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007461 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007462\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007463Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007464the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007465separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007466
7467static PyObject*
7468unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7469{
7470 return PyUnicode_RPartition((PyObject *)self, separator);
7471}
7472
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007473PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007474 PyObject *sep,
7475 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007476{
7477 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007478
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007479 s = PyUnicode_FromObject(s);
7480 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007481 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007482 if (sep != NULL) {
7483 sep = PyUnicode_FromObject(sep);
7484 if (sep == NULL) {
7485 Py_DECREF(s);
7486 return NULL;
7487 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007488 }
7489
7490 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7491
7492 Py_DECREF(s);
7493 Py_XDECREF(sep);
7494 return result;
7495}
7496
7497PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007498 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007499\n\
7500Return a list of the words in S, using sep as the\n\
7501delimiter string, starting at the end of the string and\n\
7502working to the front. If maxsplit is given, at most maxsplit\n\
7503splits are done. If sep is not specified, any whitespace string\n\
7504is a separator.");
7505
7506static PyObject*
7507unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7508{
7509 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007510 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007511
Martin v. Löwis18e16552006-02-15 17:27:45 +00007512 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007513 return NULL;
7514
7515 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007516 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007517 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007518 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007519 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007520 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007521}
7522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007524 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525\n\
7526Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007527Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
7530static PyObject*
7531unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7532{
Guido van Rossum86662912000-04-11 15:38:46 +00007533 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534
Guido van Rossum86662912000-04-11 15:38:46 +00007535 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 return NULL;
7537
Guido van Rossum86662912000-04-11 15:38:46 +00007538 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539}
7540
7541static
7542PyObject *unicode_str(PyUnicodeObject *self)
7543{
Fred Drakee4315f52000-05-09 19:53:39 +00007544 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545}
7546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007547PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007548 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549\n\
7550Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552
7553static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007554unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 return fixup(self, fixswapcase);
7557}
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007560 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
7562Return a copy of the string S, where all characters have been mapped\n\
7563through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007564Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7565Unmapped characters are left untouched. Characters mapped to None\n\
7566are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
7568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007569unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570{
Tim Petersced69f82003-09-16 20:30:58 +00007571 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007572 self->length,
7573 table,
7574 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575}
7576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007577PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007578 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007580Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
7582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007583unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 return fixup(self, fixupper);
7586}
7587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007588PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007589 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590\n\
Georg Brandl98064072008-09-09 19:26:00 +00007591Pad a numeric string S with zeros on the left, to fill a field\n\
7592of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
7594static PyObject *
7595unicode_zfill(PyUnicodeObject *self, PyObject *args)
7596{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007597 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 PyUnicodeObject *u;
7599
Martin v. Löwis18e16552006-02-15 17:27:45 +00007600 Py_ssize_t width;
7601 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 return NULL;
7603
7604 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007605 if (PyUnicode_CheckExact(self)) {
7606 Py_INCREF(self);
7607 return (PyObject*) self;
7608 }
7609 else
7610 return PyUnicode_FromUnicode(
7611 PyUnicode_AS_UNICODE(self),
7612 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007613 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 }
7615
7616 fill = width - self->length;
7617
7618 u = pad(self, fill, 0, '0');
7619
Walter Dörwald068325e2002-04-15 13:36:47 +00007620 if (u == NULL)
7621 return NULL;
7622
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623 if (u->str[fill] == '+' || u->str[fill] == '-') {
7624 /* move sign to beginning of string */
7625 u->str[0] = u->str[fill];
7626 u->str[fill] = '0';
7627 }
7628
7629 return (PyObject*) u;
7630}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
#if 0
/* Compiled out.  When enabled, reports the file-level numfree counter
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007640PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007641 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007643Return True if S starts with the specified prefix, False otherwise.\n\
7644With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007645With optional end, stop comparing S at that position.\n\
7646prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647
7648static PyObject *
7649unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007650 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651{
Georg Brandl24250812006-06-09 18:45:48 +00007652 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007654 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007655 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007656 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Georg Brandl24250812006-06-09 18:45:48 +00007658 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007659 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7660 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007661 if (PyTuple_Check(subobj)) {
7662 Py_ssize_t i;
7663 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7664 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007665 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007666 if (substring == NULL)
7667 return NULL;
7668 result = tailmatch(self, substring, start, end, -1);
7669 Py_DECREF(substring);
7670 if (result) {
7671 Py_RETURN_TRUE;
7672 }
7673 }
7674 /* nothing matched */
7675 Py_RETURN_FALSE;
7676 }
7677 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007679 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007680 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007682 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683}
7684
7685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007686PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007687 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007689Return True if S ends with the specified suffix, False otherwise.\n\
7690With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007691With optional end, stop comparing S at that position.\n\
7692suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
7694static PyObject *
7695unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007696 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697{
Georg Brandl24250812006-06-09 18:45:48 +00007698 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007700 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007701 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007702 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703
Georg Brandl24250812006-06-09 18:45:48 +00007704 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007705 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7706 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007707 if (PyTuple_Check(subobj)) {
7708 Py_ssize_t i;
7709 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7710 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007711 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007712 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007713 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007714 result = tailmatch(self, substring, start, end, +1);
7715 Py_DECREF(substring);
7716 if (result) {
7717 Py_RETURN_TRUE;
7718 }
7719 }
7720 Py_RETURN_FALSE;
7721 }
7722 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
Georg Brandl24250812006-06-09 18:45:48 +00007726 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007728 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729}
7730
7731
Eric Smitha9f7d622008-02-17 19:46:49 +00007732/* Implements do_string_format, which is unicode because of stringlib */
7733#include "stringlib/string_format.h"
7734
7735PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007736 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007737\n\
7738");
7739
Eric Smithdc13b792008-05-30 18:10:04 +00007740static PyObject *
7741unicode__format__(PyObject *self, PyObject *args)
7742{
7743 PyObject *format_spec;
7744 PyObject *result = NULL;
7745 PyObject *tmp = NULL;
7746
7747 /* If 2.x, convert format_spec to the same type as value */
7748 /* This is to allow things like u''.format('') */
7749 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7750 goto done;
7751 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7752 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007753 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007754 goto done;
7755 }
7756 tmp = PyObject_Unicode(format_spec);
7757 if (tmp == NULL)
7758 goto done;
7759 format_spec = tmp;
7760
7761 result = _PyUnicode_FormatAdvanced(self,
7762 PyUnicode_AS_UNICODE(format_spec),
7763 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007764 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007765 Py_XDECREF(tmp);
7766 return result;
7767}
7768
Eric Smitha9f7d622008-02-17 19:46:49 +00007769PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007770 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007771\n\
7772");
7773
Robert Schuppenies901c9972008-06-10 10:10:31 +00007774static PyObject *
7775unicode__sizeof__(PyUnicodeObject *v)
7776{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007777 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7778 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007779}
7780
7781PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007782 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007783\n\
7784");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007785
7786static PyObject *
7787unicode_getnewargs(PyUnicodeObject *v)
7788{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007789 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007790}
7791
7792
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793static PyMethodDef unicode_methods[] = {
7794
7795 /* Order is according to common usage: often used methods should
7796 appear first, since lookup is done sequentially. */
7797
Benjamin Peterson332d7212009-09-18 21:14:55 +00007798 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007799 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7800 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007801 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7803 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7804 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7805 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7806 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7807 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7808 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007809 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007810 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7811 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7812 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007813 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007814 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007815/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7816 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7817 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7818 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007819 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007820 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007821 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007823 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7824 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7825 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7826 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7827 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7828 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7829 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7830 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7831 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7832 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7833 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7834 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7835 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7836 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007837 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007838 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7839 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7840 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7841 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007842 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007843#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007844 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845#endif
7846
7847#if 0
7848 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007849 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850#endif
7851
Benjamin Peterson857ce152009-01-31 16:29:18 +00007852 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 {NULL, NULL}
7854};
7855
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007856static PyObject *
7857unicode_mod(PyObject *v, PyObject *w)
7858{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007859 if (!PyUnicode_Check(v)) {
7860 Py_INCREF(Py_NotImplemented);
7861 return Py_NotImplemented;
7862 }
7863 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007864}
7865
7866static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007867 0, /*nb_add*/
7868 0, /*nb_subtract*/
7869 0, /*nb_multiply*/
7870 0, /*nb_divide*/
7871 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007872};
7873
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007875 (lenfunc) unicode_length, /* sq_length */
7876 PyUnicode_Concat, /* sq_concat */
7877 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7878 (ssizeargfunc) unicode_getitem, /* sq_item */
7879 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7880 0, /* sq_ass_item */
7881 0, /* sq_ass_slice */
7882 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883};
7884
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007885static PyObject*
7886unicode_subscript(PyUnicodeObject* self, PyObject* item)
7887{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007888 if (PyIndex_Check(item)) {
7889 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007890 if (i == -1 && PyErr_Occurred())
7891 return NULL;
7892 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007893 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007894 return unicode_getitem(self, i);
7895 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007896 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007897 Py_UNICODE* source_buf;
7898 Py_UNICODE* result_buf;
7899 PyObject* result;
7900
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007901 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007902 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007903 return NULL;
7904 }
7905
7906 if (slicelength <= 0) {
7907 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007908 } else if (start == 0 && step == 1 && slicelength == self->length &&
7909 PyUnicode_CheckExact(self)) {
7910 Py_INCREF(self);
7911 return (PyObject *)self;
7912 } else if (step == 1) {
7913 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007914 } else {
7915 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007916 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7917 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007918
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007919 if (result_buf == NULL)
7920 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007921
7922 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7923 result_buf[i] = source_buf[cur];
7924 }
Tim Petersced69f82003-09-16 20:30:58 +00007925
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007926 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007927 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007928 return result;
7929 }
7930 } else {
7931 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7932 return NULL;
7933 }
7934}
7935
7936static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007937 (lenfunc)unicode_length, /* mp_length */
7938 (binaryfunc)unicode_subscript, /* mp_subscript */
7939 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007940};
7941
Martin v. Löwis18e16552006-02-15 17:27:45 +00007942static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007944 Py_ssize_t index,
7945 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946{
7947 if (index != 0) {
7948 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007949 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 return -1;
7951 }
7952 *ptr = (void *) self->str;
7953 return PyUnicode_GET_DATA_SIZE(self);
7954}
7955
Martin v. Löwis18e16552006-02-15 17:27:45 +00007956static Py_ssize_t
7957unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007958 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959{
7960 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007961 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 return -1;
7963}
7964
7965static int
7966unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007967 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968{
7969 if (lenp)
7970 *lenp = PyUnicode_GET_DATA_SIZE(self);
7971 return 1;
7972}
7973
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007974static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007976 Py_ssize_t index,
7977 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978{
7979 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (index != 0) {
7982 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007983 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 return -1;
7985 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007986 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007988 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007989 *ptr = (void *) PyString_AS_STRING(str);
7990 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991}
7992
7993/* Helpers for PyUnicode_Format() */
7994
7995static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007996getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007998 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008000 (*p_argidx)++;
8001 if (arglen < 0)
8002 return args;
8003 else
8004 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 }
8006 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008007 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 return NULL;
8009}
8010
/* Conversion flag bits collected by PyUnicode_Format() while it parses a
   format specifier. */
#define F_LJUST (1 << 0)    /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
Martin v. Löwis18e16552006-02-15 17:27:45 +00008017static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008018strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008020 register Py_ssize_t i;
8021 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008023 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 return len;
8026}
8027
Neal Norwitzfc76d632006-01-10 06:03:13 +00008028static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008029longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8030{
Tim Peters15231542006-02-16 01:08:01 +00008031 Py_ssize_t result;
8032
Neal Norwitzfc76d632006-01-10 06:03:13 +00008033 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008034 result = strtounicode(buffer, (char *)buffer);
8035 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008036}
8037
Guido van Rossum078151d2002-08-11 04:24:12 +00008038/* XXX To save some code duplication, formatfloat/long/int could have been
8039 shared with stringobject.c, converting from 8-bit to Unicode after the
8040 formatting is done. */
8041
Mark Dickinson18cfada2009-11-23 18:46:41 +00008042/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8043
8044static PyObject *
8045formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008047 char *p;
8048 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008050
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 x = PyFloat_AsDouble(v);
8052 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008053 return NULL;
8054
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008056 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008057
Mark Dickinson18cfada2009-11-23 18:46:41 +00008058 p = PyOS_double_to_string(x, type, prec,
8059 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8060 if (p == NULL)
8061 return NULL;
8062 result = PyUnicode_FromStringAndSize(p, strlen(p));
8063 PyMem_Free(p);
8064 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065}
8066
Tim Peters38fd5b62000-09-21 05:43:11 +00008067static PyObject*
8068formatlong(PyObject *val, int flags, int prec, int type)
8069{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008070 char *buf;
8071 int i, len;
8072 PyObject *str; /* temporary string object. */
8073 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008074
Benjamin Peterson857ce152009-01-31 16:29:18 +00008075 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8076 if (!str)
8077 return NULL;
8078 result = _PyUnicode_New(len);
8079 if (!result) {
8080 Py_DECREF(str);
8081 return NULL;
8082 }
8083 for (i = 0; i < len; i++)
8084 result->str[i] = buf[i];
8085 result->str[len] = 0;
8086 Py_DECREF(str);
8087 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008088}
8089
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090static int
8091formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008092 size_t buflen,
8093 int flags,
8094 int prec,
8095 int type,
8096 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008098 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008099 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8100 * + 1 + 1
8101 * = 24
8102 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008103 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008104 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 long x;
8106
8107 x = PyInt_AsLong(v);
8108 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008109 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008110 if (x < 0 && type == 'u') {
8111 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008112 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008113 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8114 sign = "-";
8115 else
8116 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008118 prec = 1;
8119
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008120 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8121 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008122 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008123 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008124 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008125 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008126 return -1;
8127 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008128
8129 if ((flags & F_ALT) &&
8130 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008131 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008132 * of issues that cause pain:
8133 * - when 0 is being converted, the C standard leaves off
8134 * the '0x' or '0X', which is inconsistent with other
8135 * %#x/%#X conversions and inconsistent with Python's
8136 * hex() function
8137 * - there are platforms that violate the standard and
8138 * convert 0 with the '0x' or '0X'
8139 * (Metrowerks, Compaq Tru64)
8140 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008141 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008142 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008143 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008144 * We can achieve the desired consistency by inserting our
8145 * own '0x' or '0X' prefix, and substituting %x/%X in place
8146 * of %#x/%#X.
8147 *
8148 * Note that this is the same approach as used in
8149 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008150 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008151 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8152 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008153 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008154 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008155 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8156 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008157 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008158 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008159 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008160 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008161 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008162 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163}
8164
8165static int
8166formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008167 size_t buflen,
8168 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169{
Ezio Melotti32125152010-02-25 17:36:04 +00008170 PyObject *unistr;
8171 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008172 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008173 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008174 if (PyUnicode_GET_SIZE(v) != 1)
8175 goto onError;
8176 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008179 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008180 if (PyString_GET_SIZE(v) != 1)
8181 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008182 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8183 with a UnicodeDecodeError if 'char' is not decodable with the
8184 default encoding (usually ASCII, but it might be something else) */
8185 str = PyString_AS_STRING(v);
8186 if ((unsigned char)str[0] > 0x7F) {
8187 /* the char is not ASCII; try to decode the string using the
8188 default encoding and return -1 to let the UnicodeDecodeError
8189 be raised if the string can't be decoded */
8190 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8191 if (unistr == NULL)
8192 return -1;
8193 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8194 Py_DECREF(unistr);
8195 }
8196 else
8197 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199
8200 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008201 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008203 x = PyInt_AsLong(v);
8204 if (x == -1 && PyErr_Occurred())
8205 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008206#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008207 if (x < 0 || x > 0x10ffff) {
8208 PyErr_SetString(PyExc_OverflowError,
8209 "%c arg not in range(0x110000) "
8210 "(wide Python build)");
8211 return -1;
8212 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008213#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008214 if (x < 0 || x > 0xffff) {
8215 PyErr_SetString(PyExc_OverflowError,
8216 "%c arg not in range(0x10000) "
8217 "(narrow Python build)");
8218 return -1;
8219 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008220#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008221 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 }
8223 buf[1] = '\0';
8224 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008225
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008226 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008227 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008228 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008229 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230}
8231
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008243 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244{
8245 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008246 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 int args_owned = 0;
8248 PyUnicodeObject *result = NULL;
8249 PyObject *dict = NULL;
8250 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008251
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008253 PyErr_BadInternalCall();
8254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 }
8256 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008257 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 fmt = PyUnicode_AS_UNICODE(uformat);
8260 fmtcnt = PyUnicode_GET_SIZE(uformat);
8261
8262 reslen = rescnt = fmtcnt + 100;
8263 result = _PyUnicode_New(reslen);
8264 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 res = PyUnicode_AS_UNICODE(result);
8267
8268 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008269 arglen = PyTuple_Size(args);
8270 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
8272 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008273 arglen = -1;
8274 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 }
Christian Heimese93237d2007-12-19 02:37:44 +00008276 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008277 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008278 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279
8280 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008281 if (*fmt != '%') {
8282 if (--rescnt < 0) {
8283 rescnt = fmtcnt + 100;
8284 reslen += rescnt;
8285 if (_PyUnicode_Resize(&result, reslen) < 0)
8286 goto onError;
8287 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8288 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008289 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008290 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008291 }
8292 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008293 /* Got a format specifier */
8294 int flags = 0;
8295 Py_ssize_t width = -1;
8296 int prec = -1;
8297 Py_UNICODE c = '\0';
8298 Py_UNICODE fill;
8299 int isnumok;
8300 PyObject *v = NULL;
8301 PyObject *temp = NULL;
8302 Py_UNICODE *pbuf;
8303 Py_UNICODE sign;
8304 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008305 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008306
8307 fmt++;
8308 if (*fmt == '(') {
8309 Py_UNICODE *keystart;
8310 Py_ssize_t keylen;
8311 PyObject *key;
8312 int pcount = 1;
8313
8314 if (dict == NULL) {
8315 PyErr_SetString(PyExc_TypeError,
8316 "format requires a mapping");
8317 goto onError;
8318 }
8319 ++fmt;
8320 --fmtcnt;
8321 keystart = fmt;
8322 /* Skip over balanced parentheses */
8323 while (pcount > 0 && --fmtcnt >= 0) {
8324 if (*fmt == ')')
8325 --pcount;
8326 else if (*fmt == '(')
8327 ++pcount;
8328 fmt++;
8329 }
8330 keylen = fmt - keystart - 1;
8331 if (fmtcnt < 0 || pcount > 0) {
8332 PyErr_SetString(PyExc_ValueError,
8333 "incomplete format key");
8334 goto onError;
8335 }
8336#if 0
8337 /* keys are converted to strings using UTF-8 and
8338 then looked up since Python uses strings to hold
8339 variables names etc. in its namespaces and we
8340 wouldn't want to break common idioms. */
8341 key = PyUnicode_EncodeUTF8(keystart,
8342 keylen,
8343 NULL);
8344#else
8345 key = PyUnicode_FromUnicode(keystart, keylen);
8346#endif
8347 if (key == NULL)
8348 goto onError;
8349 if (args_owned) {
8350 Py_DECREF(args);
8351 args_owned = 0;
8352 }
8353 args = PyObject_GetItem(dict, key);
8354 Py_DECREF(key);
8355 if (args == NULL) {
8356 goto onError;
8357 }
8358 args_owned = 1;
8359 arglen = -1;
8360 argidx = -2;
8361 }
8362 while (--fmtcnt >= 0) {
8363 switch (c = *fmt++) {
8364 case '-': flags |= F_LJUST; continue;
8365 case '+': flags |= F_SIGN; continue;
8366 case ' ': flags |= F_BLANK; continue;
8367 case '#': flags |= F_ALT; continue;
8368 case '0': flags |= F_ZERO; continue;
8369 }
8370 break;
8371 }
8372 if (c == '*') {
8373 v = getnextarg(args, arglen, &argidx);
8374 if (v == NULL)
8375 goto onError;
8376 if (!PyInt_Check(v)) {
8377 PyErr_SetString(PyExc_TypeError,
8378 "* wants int");
8379 goto onError;
8380 }
8381 width = PyInt_AsLong(v);
8382 if (width < 0) {
8383 flags |= F_LJUST;
8384 width = -width;
8385 }
8386 if (--fmtcnt >= 0)
8387 c = *fmt++;
8388 }
8389 else if (c >= '0' && c <= '9') {
8390 width = c - '0';
8391 while (--fmtcnt >= 0) {
8392 c = *fmt++;
8393 if (c < '0' || c > '9')
8394 break;
8395 if ((width*10) / 10 != width) {
8396 PyErr_SetString(PyExc_ValueError,
8397 "width too big");
8398 goto onError;
8399 }
8400 width = width*10 + (c - '0');
8401 }
8402 }
8403 if (c == '.') {
8404 prec = 0;
8405 if (--fmtcnt >= 0)
8406 c = *fmt++;
8407 if (c == '*') {
8408 v = getnextarg(args, arglen, &argidx);
8409 if (v == NULL)
8410 goto onError;
8411 if (!PyInt_Check(v)) {
8412 PyErr_SetString(PyExc_TypeError,
8413 "* wants int");
8414 goto onError;
8415 }
8416 prec = PyInt_AsLong(v);
8417 if (prec < 0)
8418 prec = 0;
8419 if (--fmtcnt >= 0)
8420 c = *fmt++;
8421 }
8422 else if (c >= '0' && c <= '9') {
8423 prec = c - '0';
8424 while (--fmtcnt >= 0) {
8425 c = Py_CHARMASK(*fmt++);
8426 if (c < '0' || c > '9')
8427 break;
8428 if ((prec*10) / 10 != prec) {
8429 PyErr_SetString(PyExc_ValueError,
8430 "prec too big");
8431 goto onError;
8432 }
8433 prec = prec*10 + (c - '0');
8434 }
8435 }
8436 } /* prec */
8437 if (fmtcnt >= 0) {
8438 if (c == 'h' || c == 'l' || c == 'L') {
8439 if (--fmtcnt >= 0)
8440 c = *fmt++;
8441 }
8442 }
8443 if (fmtcnt < 0) {
8444 PyErr_SetString(PyExc_ValueError,
8445 "incomplete format");
8446 goto onError;
8447 }
8448 if (c != '%') {
8449 v = getnextarg(args, arglen, &argidx);
8450 if (v == NULL)
8451 goto onError;
8452 }
8453 sign = 0;
8454 fill = ' ';
8455 switch (c) {
8456
8457 case '%':
8458 pbuf = formatbuf;
8459 /* presume that buffer length is at least 1 */
8460 pbuf[0] = '%';
8461 len = 1;
8462 break;
8463
8464 case 's':
8465 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008466 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008467 temp = v;
8468 Py_INCREF(temp);
8469 }
8470 else {
8471 PyObject *unicode;
8472 if (c == 's')
8473 temp = PyObject_Unicode(v);
8474 else
8475 temp = PyObject_Repr(v);
8476 if (temp == NULL)
8477 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008478 if (PyUnicode_Check(temp))
8479 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008480 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008481 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008482 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8483 PyString_GET_SIZE(temp),
8484 NULL,
8485 "strict");
8486 Py_DECREF(temp);
8487 temp = unicode;
8488 if (temp == NULL)
8489 goto onError;
8490 }
8491 else {
8492 Py_DECREF(temp);
8493 PyErr_SetString(PyExc_TypeError,
8494 "%s argument has non-string str()");
8495 goto onError;
8496 }
8497 }
8498 pbuf = PyUnicode_AS_UNICODE(temp);
8499 len = PyUnicode_GET_SIZE(temp);
8500 if (prec >= 0 && len > prec)
8501 len = prec;
8502 break;
8503
8504 case 'i':
8505 case 'd':
8506 case 'u':
8507 case 'o':
8508 case 'x':
8509 case 'X':
8510 if (c == 'i')
8511 c = 'd';
8512 isnumok = 0;
8513 if (PyNumber_Check(v)) {
8514 PyObject *iobj=NULL;
8515
8516 if (PyInt_Check(v) || (PyLong_Check(v))) {
8517 iobj = v;
8518 Py_INCREF(iobj);
8519 }
8520 else {
8521 iobj = PyNumber_Int(v);
8522 if (iobj==NULL) iobj = PyNumber_Long(v);
8523 }
8524 if (iobj!=NULL) {
8525 if (PyInt_Check(iobj)) {
8526 isnumok = 1;
8527 pbuf = formatbuf;
8528 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8529 flags, prec, c, iobj);
8530 Py_DECREF(iobj);
8531 if (len < 0)
8532 goto onError;
8533 sign = 1;
8534 }
8535 else if (PyLong_Check(iobj)) {
8536 isnumok = 1;
8537 temp = formatlong(iobj, flags, prec, c);
8538 Py_DECREF(iobj);
8539 if (!temp)
8540 goto onError;
8541 pbuf = PyUnicode_AS_UNICODE(temp);
8542 len = PyUnicode_GET_SIZE(temp);
8543 sign = 1;
8544 }
8545 else {
8546 Py_DECREF(iobj);
8547 }
8548 }
8549 }
8550 if (!isnumok) {
8551 PyErr_Format(PyExc_TypeError,
8552 "%%%c format: a number is required, "
8553 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8554 goto onError;
8555 }
8556 if (flags & F_ZERO)
8557 fill = '0';
8558 break;
8559
8560 case 'e':
8561 case 'E':
8562 case 'f':
8563 case 'F':
8564 case 'g':
8565 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008566 temp = formatfloat(v, flags, prec, c);
8567 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008568 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008569 pbuf = PyUnicode_AS_UNICODE(temp);
8570 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008571 sign = 1;
8572 if (flags & F_ZERO)
8573 fill = '0';
8574 break;
8575
8576 case 'c':
8577 pbuf = formatbuf;
8578 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8579 if (len < 0)
8580 goto onError;
8581 break;
8582
8583 default:
8584 PyErr_Format(PyExc_ValueError,
8585 "unsupported format character '%c' (0x%x) "
8586 "at index %zd",
8587 (31<=c && c<=126) ? (char)c : '?',
8588 (int)c,
8589 (Py_ssize_t)(fmt - 1 -
8590 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008591 goto onError;
8592 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008593 if (sign) {
8594 if (*pbuf == '-' || *pbuf == '+') {
8595 sign = *pbuf++;
8596 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008597 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008598 else if (flags & F_SIGN)
8599 sign = '+';
8600 else if (flags & F_BLANK)
8601 sign = ' ';
8602 else
8603 sign = 0;
8604 }
8605 if (width < len)
8606 width = len;
8607 if (rescnt - (sign != 0) < width) {
8608 reslen -= rescnt;
8609 rescnt = width + fmtcnt + 100;
8610 reslen += rescnt;
8611 if (reslen < 0) {
8612 Py_XDECREF(temp);
8613 PyErr_NoMemory();
8614 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008615 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008616 if (_PyUnicode_Resize(&result, reslen) < 0) {
8617 Py_XDECREF(temp);
8618 goto onError;
8619 }
8620 res = PyUnicode_AS_UNICODE(result)
8621 + reslen - rescnt;
8622 }
8623 if (sign) {
8624 if (fill != ' ')
8625 *res++ = sign;
8626 rescnt--;
8627 if (width > len)
8628 width--;
8629 }
8630 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8631 assert(pbuf[0] == '0');
8632 assert(pbuf[1] == c);
8633 if (fill != ' ') {
8634 *res++ = *pbuf++;
8635 *res++ = *pbuf++;
8636 }
8637 rescnt -= 2;
8638 width -= 2;
8639 if (width < 0)
8640 width = 0;
8641 len -= 2;
8642 }
8643 if (width > len && !(flags & F_LJUST)) {
8644 do {
8645 --rescnt;
8646 *res++ = fill;
8647 } while (--width > len);
8648 }
8649 if (fill == ' ') {
8650 if (sign)
8651 *res++ = sign;
8652 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8653 assert(pbuf[0] == '0');
8654 assert(pbuf[1] == c);
8655 *res++ = *pbuf++;
8656 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008657 }
8658 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008659 Py_UNICODE_COPY(res, pbuf, len);
8660 res += len;
8661 rescnt -= len;
8662 while (--width >= len) {
8663 --rescnt;
8664 *res++ = ' ';
8665 }
8666 if (dict && (argidx < arglen) && c != '%') {
8667 PyErr_SetString(PyExc_TypeError,
8668 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008669 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008670 goto onError;
8671 }
8672 Py_XDECREF(temp);
8673 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 } /* until end */
8675 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008676 PyErr_SetString(PyExc_TypeError,
8677 "not all arguments converted during string formatting");
8678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 }
8680
Thomas Woutersa96affe2006-03-12 00:29:36 +00008681 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008682 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008684 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 }
8686 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 return (PyObject *)result;
8688
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008689 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 Py_XDECREF(result);
8691 Py_DECREF(uformat);
8692 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008693 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 }
8695 return NULL;
8696}
8697
8698static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 (readbufferproc) unicode_buffer_getreadbuf,
8700 (writebufferproc) unicode_buffer_getwritebuf,
8701 (segcountproc) unicode_buffer_getsegcount,
8702 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703};
8704
Jeremy Hylton938ace62002-07-17 16:30:39 +00008705static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008706unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8707
Tim Peters6d6c1a32001-08-02 04:15:00 +00008708static PyObject *
8709unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8710{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008711 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008712 static char *kwlist[] = {"string", "encoding", "errors", 0};
8713 char *encoding = NULL;
8714 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008715
Benjamin Peterson857ce152009-01-31 16:29:18 +00008716 if (type != &PyUnicode_Type)
8717 return unicode_subtype_new(type, args, kwds);
8718 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008719 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008720 return NULL;
8721 if (x == NULL)
8722 return (PyObject *)_PyUnicode_New(0);
8723 if (encoding == NULL && errors == NULL)
8724 return PyObject_Unicode(x);
8725 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008726 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008727}
8728
Guido van Rossume023fe02001-08-30 03:12:59 +00008729static PyObject *
8730unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8731{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008732 PyUnicodeObject *tmp, *pnew;
8733 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008734
Benjamin Peterson857ce152009-01-31 16:29:18 +00008735 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8736 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8737 if (tmp == NULL)
8738 return NULL;
8739 assert(PyUnicode_Check(tmp));
8740 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8741 if (pnew == NULL) {
8742 Py_DECREF(tmp);
8743 return NULL;
8744 }
8745 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8746 if (pnew->str == NULL) {
8747 _Py_ForgetReference((PyObject *)pnew);
8748 PyObject_Del(pnew);
8749 Py_DECREF(tmp);
8750 return PyErr_NoMemory();
8751 }
8752 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8753 pnew->length = n;
8754 pnew->hash = tmp->hash;
8755 Py_DECREF(tmp);
8756 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008757}
8758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008759PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008760 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008761\n\
8762Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008763encoding defaults to the current default string encoding.\n\
8764errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008765
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008767 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008768 "unicode", /* tp_name */
8769 sizeof(PyUnicodeObject), /* tp_size */
8770 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008772 (destructor)unicode_dealloc, /* tp_dealloc */
8773 0, /* tp_print */
8774 0, /* tp_getattr */
8775 0, /* tp_setattr */
8776 0, /* tp_compare */
8777 unicode_repr, /* tp_repr */
8778 &unicode_as_number, /* tp_as_number */
8779 &unicode_as_sequence, /* tp_as_sequence */
8780 &unicode_as_mapping, /* tp_as_mapping */
8781 (hashfunc) unicode_hash, /* tp_hash*/
8782 0, /* tp_call*/
8783 (reprfunc) unicode_str, /* tp_str */
8784 PyObject_GenericGetAttr, /* tp_getattro */
8785 0, /* tp_setattro */
8786 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008787 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008788 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008789 unicode_doc, /* tp_doc */
8790 0, /* tp_traverse */
8791 0, /* tp_clear */
8792 PyUnicode_RichCompare, /* tp_richcompare */
8793 0, /* tp_weaklistoffset */
8794 0, /* tp_iter */
8795 0, /* tp_iternext */
8796 unicode_methods, /* tp_methods */
8797 0, /* tp_members */
8798 0, /* tp_getset */
8799 &PyBaseString_Type, /* tp_base */
8800 0, /* tp_dict */
8801 0, /* tp_descr_get */
8802 0, /* tp_descr_set */
8803 0, /* tp_dictoffset */
8804 0, /* tp_init */
8805 0, /* tp_alloc */
8806 unicode_new, /* tp_new */
8807 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808};
8809
8810/* Initialize the Unicode implementation */
8811
Thomas Wouters78890102000-07-22 19:25:51 +00008812void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008814 int i;
8815
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008816 /* XXX - move this array to unicodectype.c ? */
8817 Py_UNICODE linebreak[] = {
8818 0x000A, /* LINE FEED */
8819 0x000D, /* CARRIAGE RETURN */
8820 0x001C, /* FILE SEPARATOR */
8821 0x001D, /* GROUP SEPARATOR */
8822 0x001E, /* RECORD SEPARATOR */
8823 0x0085, /* NEXT LINE */
8824 0x2028, /* LINE SEPARATOR */
8825 0x2029, /* PARAGRAPH SEPARATOR */
8826 };
8827
Fred Drakee4315f52000-05-09 19:53:39 +00008828 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008829 free_list = NULL;
8830 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008832 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008833 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008834
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008835 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008836 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008837 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008838 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008839 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008840
8841 /* initialize the linebreak bloom filter */
8842 bloom_linebreak = make_bloom_mask(
8843 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8844 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008845
8846 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847}
8848
8849/* Finalize the Unicode implementation */
8850
Christian Heimes3b718a72008-02-14 12:47:33 +00008851int
8852PyUnicode_ClearFreeList(void)
8853{
8854 int freelist_size = numfree;
8855 PyUnicodeObject *u;
8856
8857 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008858 PyUnicodeObject *v = u;
8859 u = *(PyUnicodeObject **)u;
8860 if (v->str)
8861 PyObject_DEL(v->str);
8862 Py_XDECREF(v->defenc);
8863 PyObject_Del(v);
8864 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008865 }
8866 free_list = NULL;
8867 assert(numfree == 0);
8868 return freelist_size;
8869}
8870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871void
Thomas Wouters78890102000-07-22 19:25:51 +00008872_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008874 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008876 Py_XDECREF(unicode_empty);
8877 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008878
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008879 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008880 if (unicode_latin1[i]) {
8881 Py_DECREF(unicode_latin1[i]);
8882 unicode_latin1[i] = NULL;
8883 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008884 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008885 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008887
Anthony Baxterac6bd462006-04-13 02:06:09 +00008888#ifdef __cplusplus
8889}
8890#endif