blob: 7c7e06269aa8799e607df198855f18e2304fc773 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point (0..127); an entry is 1 when the character is
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Fast detection of line break characters in the ASCII range.
   Indexed by code point (0..127); an entry is 1 when the character is a
   line break and 0 otherwise.  The table is only ever read, so it is
   declared const to keep it out of writable data and to catch accidental
   stores at compile time. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by
   `ch` modulo BLOOM_WIDTH.  Every macro parameter is parenthesized so
   that compound expressions can be passed safely; `mask` must be an
   lvalue for BLOOM_ADD. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* True if `chr` is a member of `set` (of length `setlen`).  The bloom
   mask is consulted first so that the linear scan in unicode_member()
   only runs when the mask bit for `chr` is set.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators; note that `chr` is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000297 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000300 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Andrew Dalkee0df7622006-05-27 11:04:36 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
Neal Norwitze7d8be82008-07-31 17:17:14 +0000315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
318 }
319
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000329 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
332 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000333 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000334 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000337 }
338 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 }
340 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000341 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 if (unicode == NULL)
344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 }
348
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000349 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000350 PyErr_NoMemory();
351 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000352 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000353 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
359 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000360 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000362 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000378 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000380 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
385 }
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
389 }
390 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 }
395 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400}
401
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000402static
403int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404{
405 register PyUnicodeObject *v;
406
407 /* Argument checks */
408 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 PyErr_BadInternalCall();
410 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000412 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000414 PyErr_BadInternalCall();
415 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000421 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 }
432
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
436}
437
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000438int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
439{
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000456 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000457
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
468 }
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
471 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
Tim Petersced69f82003-09-16 20:30:58 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
477
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000480 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481
482 return (PyObject *)unicode;
483}
484
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000485PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
486{
487 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000488
Benjamin Peterson857ce152009-01-31 16:29:18 +0000489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000491 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 return NULL;
493 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000494
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
500
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000505 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000506
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
517 }
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
520 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000521
522 return PyUnicode_DecodeUTF8(u, size, NULL);
523 }
524
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
528
529 return (PyObject *)unicode;
530}
531
532PyObject *PyUnicode_FromString(const char *u)
533{
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
538 }
539
540 return PyUnicode_FromStringAndSize(u, size);
541}
542
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543#ifdef HAVE_WCHAR_H
544
Mark Dickinson6b265f12009-03-18 16:07:26 +0000545#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546# define CONVERT_WCHAR_TO_SURROGATES
547#endif
548
549#ifdef CONVERT_WCHAR_TO_SURROGATES
550
551/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
553
554PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
556{
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
561
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
573 }
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
578
579 /* Copy the wchar_t data into the new object */
580 {
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
589 }
590 else
591 *u++ = *w++;
592 }
593 }
594 return (PyObject *)unicode;
595}
596
597#else
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000600 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601{
602 PyUnicodeObject *unicode;
603
604 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 PyErr_BadInternalCall();
606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 }
608
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614#ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000616#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624#endif
625
626 return (PyObject *)unicode;
627}
628
Mark Dickinson6b265f12009-03-18 16:07:26 +0000629#endif /* CONVERT_WCHAR_TO_SURROGATES */
630
631#undef CONVERT_WCHAR_TO_SURROGATES
632
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000633static void
634makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
635{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000636 *fmt++ = '%';
637 if (width) {
638 if (zeropad)
639 *fmt++ = '0';
640 fmt += sprintf(fmt, "%d", width);
641 }
642 if (precision)
643 fmt += sprintf(fmt, ".%d", precision);
644 if (longflag)
645 *fmt++ = 'l';
646 else if (size_tflag) {
647 char *f = PY_FORMAT_SIZE_T;
648 while (*f)
649 *fmt++ = *f++;
650 }
651 *fmt++ = c;
652 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000653}
654
655#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
656
657PyObject *
658PyUnicode_FromFormatV(const char *format, va_list vargs)
659{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000680
681#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000682 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000683#else
684#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000687 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000688#endif
689#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000697 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
703 if (*f == 's')
704 ++callcount;
705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000706 }
707 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
714 }
715 callresult = callresults;
716 }
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
729 */
730 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
755 {
756 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000757 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000764 break;
765 }
766 case 'U':
767 {
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
772 }
773 case 'V':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
784 }
785 case 'S':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
797 }
798 case 'R':
799 {
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
810 }
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
817 */
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
829 }
830 } else
831 n++;
832 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000833 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
839 }
840 realbuffer = abuffer;
841 }
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000851
Benjamin Peterson857ce152009-01-31 16:29:18 +0000852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000854
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
870 }
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
876 }
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
881 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000882
Benjamin Peterson857ce152009-01-31 16:29:18 +0000883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
918 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000928 break;
929 }
930 case 'U':
931 {
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
937 }
938 case 'V':
939 {
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
948 }
949 break;
950 }
951 case 'S':
952 case 'R':
953 {
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
968 }
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
978 }
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
987 }
988 } else
989 *s++ = *f;
990 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000992 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1005 }
1006 PyObject_Free(callresults);
1007 }
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011}
1012
1013#undef appendstring
1014
1015PyObject *
1016PyUnicode_FromFormat(const char *format, ...)
1017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 PyObject* ret;
1019 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001020
1021#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001025#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001029}
1030
Martin v. Löwis18e16552006-02-15 17:27:45 +00001031Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001032 wchar_t *w,
1033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyErr_BadInternalCall();
1037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039
1040 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044#ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046#else
1047 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 }
1054#endif
1055
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060}
1061
1062#endif
1063
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064PyObject *PyUnicode_FromOrdinal(int ordinal)
1065{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001066 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067
1068#ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074 }
1075#else
1076 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#endif
1083
Hye-Shik Chang40574832004-04-06 07:24:51 +00001084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001086}
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_FromObject(register PyObject *obj)
1089{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001092 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 Py_INCREF(obj);
1094 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 }
1096 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103}
1104
1105PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 const char *encoding,
1107 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001110 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 PyErr_BadInternalCall();
1115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118#if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001123
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001126
1127 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 return PyObject_Unicode(obj);
1135 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136#else
1137 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#endif
1143
1144 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001145 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 }
Christian Heimes3497f942008-05-26 12:29:14 +00001149 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 }
Tim Petersced69f82003-09-16 20:30:58 +00001165
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001173
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 return v;
1175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184{
1185 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186
1187 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001189
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001211 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245 return NULL;
1246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252{
1253 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1261}
1262
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1266{
1267 PyObject *v;
1268
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1272 }
1273
1274 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001283 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284 return NULL;
1285}
1286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
Fred Drakee4315f52000-05-09 19:53:39 +00001297
Tim Petersced69f82003-09-16 20:30:58 +00001298 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001300
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001307#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001319 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001321 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001322 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 Py_DECREF(v);
1324 goto onError;
1325 }
1326 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 return NULL;
1330}
1331
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001333 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001334{
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_AS_UNICODE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return NULL;
1355}
1356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358{
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1362 }
1363 return PyUnicode_GET_SIZE(unicode);
1364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 return -1;
1367}
1368
Thomas Wouters78890102000-07-22 19:25:51 +00001369const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001370{
1371 return unicode_default_encoding;
1372}
1373
1374int PyUnicode_SetDefaultEncoding(const char *encoding)
1375{
1376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001377
Fred Drakee4315f52000-05-09 19:53:39 +00001378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 encoding,
1386 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001387 return 0;
1388
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return -1;
1391}
1392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393/* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001395 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 and adjust various state variables.
1397 return 0 on success, -1 on error
1398*/
1399
1400static
1401int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 int res = -1;
1417
1418 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 }
1423
1424 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001425 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 }
1430 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 }
1438
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 }
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001450 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 }
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_XDECREF(restuple);
1479 return res;
1480}
1481
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482/* --- UTF-7 Codec -------------------------------------------------------- */
1483
Antoine Pitrou653dece2009-05-04 18:32:32 +00001484/* See RFC2152 for details. We encode conservatively and decode liberally. */
1485
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * Explicit ASCII range tests are used instead of isalnum(): isalnum()
 * depends on the current locale, and its behaviour is undefined for
 * arguments that are neither representable as unsigned char nor equal
 * to EOF (e.g. a wide code point). */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the 7-bit character code.  It is only ever
 * read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1547
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code to classify; only 1..127 can be direct
 *   directO  - non-zero if Set O characters may be emitted literally
 *   directWS - non-zero if whitespace may be emitted literally
 *
 * Every argument is parenthesised in the expansion so that a compound
 * flag expression (e.g. a || b) binds the way the caller wrote it.
 * c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001561 Py_ssize_t size,
1562 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565}
1566
Antoine Pitrou653dece2009-05-04 18:32:32 +00001567/* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
1573
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001574PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
1587 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001605 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 e = s + size;
1607
1608 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610
Antoine Pitrou653dece2009-05-04 18:32:32 +00001611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625#ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628#else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631#endif
1632 surrogate = 0;
1633 }
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1638 }
1639 }
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1643 }
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1647 }
1648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       The table is only ever read, so it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001900 Py_ssize_t startinpos;
1901 Py_ssize_t endinpos;
1902 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 const char *e;
1904 PyUnicodeObject *unicode;
1905 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 PyObject *errorHandler = NULL;
1908 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
1910 /* Note: size will always be longer than the resulting Unicode
1911 character count */
1912 unicode = _PyUnicode_New(size);
1913 if (!unicode)
1914 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001915 if (size == 0) {
1916 if (consumed)
1917 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920
1921 /* Unpack UTF-8 encoded data */
1922 p = unicode->str;
1923 e = s + size;
1924
1925 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
1928 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001929 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 s++;
1931 continue;
1932 }
1933
1934 n = utf8_code_length[ch];
1935
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001937 if (consumed)
1938 break;
1939 else {
1940 errmsg = "unexpected end of data";
1941 startinpos = s-starts;
1942 endinpos = size;
1943 goto utf8Error;
1944 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946
1947 switch (n) {
1948
1949 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001950 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001951 startinpos = s-starts;
1952 endinpos = startinpos+1;
1953 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954
1955 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+1;
1959 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960
1961 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001962 if ((s[1] & 0xc0) != 0x80) {
1963 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001964 startinpos = s-starts;
1965 endinpos = startinpos+2;
1966 goto utf8Error;
1967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001970 startinpos = s-starts;
1971 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001972 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001973 goto utf8Error;
1974 }
1975 else
1976 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 break;
1978
1979 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001980 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001981 (s[2] & 0xc0) != 0x80) {
1982 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 startinpos = s-starts;
1984 endinpos = startinpos+3;
1985 goto utf8Error;
1986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001988 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 /* Note: UTF-8 encodings of surrogates are considered
1990 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001992 XXX For wide builds (UCS-4) we should probably try
1993 to recombine the surrogates into a single code
1994 unit.
1995 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001996 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+3;
1999 goto utf8Error;
2000 }
2001 else
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002008 (s[3] & 0xc0) != 0x80) {
2009 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002010 startinpos = s-starts;
2011 endinpos = startinpos+4;
2012 goto utf8Error;
2013 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002016 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002017 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002018 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002019 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002020 UTF-16 */
2021 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002023 startinpos = s-starts;
2024 endinpos = startinpos+4;
2025 goto utf8Error;
2026 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
2042
2043 default:
2044 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002045 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 startinpos = s-starts;
2047 endinpos = startinpos+n;
2048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 }
2050 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002051 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002052
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002053 utf8Error:
2054 outpos = p-PyUnicode_AS_UNICODE(unicode);
2055 if (unicode_decode_call_errorhandler(
2056 errors, &errorHandler,
2057 "utf8", errmsg,
2058 starts, size, &startinpos, &endinpos, &exc, &s,
2059 &unicode, &outpos, &p))
2060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 }
Walter Dörwald69652032004-09-07 20:24:22 +00002062 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002063 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
2065 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002066 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 goto onError;
2068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 Py_XDECREF(errorHandler);
2070 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 return (PyObject *)unicode;
2072
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002073 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 Py_XDECREF(errorHandler);
2075 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 Py_DECREF(unicode);
2077 return NULL;
2078}
2079
Tim Peters602f7402002-04-27 18:03:26 +00002080/* Allocation strategy: if the string is short, convert into a stack buffer
2081 and allocate exactly as much space needed at the end. Else allocate the
2082 maximum possible needed (4 result bytes per Unicode character), and return
2083 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002084*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002085PyObject *
2086PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002087 Py_ssize_t size,
2088 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089{
Tim Peters602f7402002-04-27 18:03:26 +00002090#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002091
Martin v. Löwis18e16552006-02-15 17:27:45 +00002092 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002093 PyObject *v; /* result string object */
2094 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002095 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002096 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002097 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002098
Tim Peters602f7402002-04-27 18:03:26 +00002099 assert(s != NULL);
2100 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101
Tim Peters602f7402002-04-27 18:03:26 +00002102 if (size <= MAX_SHORT_UNICHARS) {
2103 /* Write into the stack buffer; nallocated can't overflow.
2104 * At the end, we'll allocate exactly as much heap space as it
2105 * turns out we need.
2106 */
2107 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2108 v = NULL; /* will allocate after we're done */
2109 p = stackbuf;
2110 }
2111 else {
2112 /* Overallocate on the heap, and give the excess back at the end. */
2113 nallocated = size * 4;
2114 if (nallocated / 4 != size) /* overflow! */
2115 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002116 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002117 if (v == NULL)
2118 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002119 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002120 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002121
Tim Peters602f7402002-04-27 18:03:26 +00002122 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002123 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002124
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002125 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002126 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002128
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002130 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002131 *p++ = (char)(0xc0 | (ch >> 6));
2132 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002134 else {
Tim Peters602f7402002-04-27 18:03:26 +00002135 /* Encode UCS2 Unicode ordinals */
2136 if (ch < 0x10000) {
2137 /* Special case: check for high surrogate */
2138 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2139 Py_UCS4 ch2 = s[i];
2140 /* Check for low surrogate and combine the two to
2141 form a UCS4 value */
2142 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002143 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002144 i++;
2145 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002146 }
Tim Peters602f7402002-04-27 18:03:26 +00002147 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002148 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002149 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002150 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2151 *p++ = (char)(0x80 | (ch & 0x3f));
2152 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002153 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002154 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002155 /* Encode UCS4 Unicode ordinals */
2156 *p++ = (char)(0xf0 | (ch >> 18));
2157 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2158 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2159 *p++ = (char)(0x80 | (ch & 0x3f));
2160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002162
Tim Peters602f7402002-04-27 18:03:26 +00002163 if (v == NULL) {
2164 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002165 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002166 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002167 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
2169 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002170 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002171 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002172 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002173 if (_PyString_Resize(&v, nneeded))
2174 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002177
Tim Peters602f7402002-04-27 18:03:26 +00002178#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179}
2180
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2182{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 if (!PyUnicode_Check(unicode)) {
2184 PyErr_BadArgument();
2185 return NULL;
2186 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002187 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002188 PyUnicode_GET_SIZE(unicode),
2189 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190}
2191
Walter Dörwald6e390802007-08-17 16:41:28 +00002192/* --- UTF-32 Codec ------------------------------------------------------- */
2193
2194PyObject *
2195PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002196 Py_ssize_t size,
2197 const char *errors,
2198 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002199{
2200 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2201}
2202
2203PyObject *
2204PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002205 Py_ssize_t size,
2206 const char *errors,
2207 int *byteorder,
2208 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002209{
2210 const char *starts = s;
2211 Py_ssize_t startinpos;
2212 Py_ssize_t endinpos;
2213 Py_ssize_t outpos;
2214 PyUnicodeObject *unicode;
2215 Py_UNICODE *p;
2216#ifndef Py_UNICODE_WIDE
2217 int i, pairs;
2218#else
2219 const int pairs = 0;
2220#endif
2221 const unsigned char *q, *e;
2222 int bo = 0; /* assume native ordering by default */
2223 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002224 /* Offsets from q for retrieving bytes in the right order. */
2225#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2226 int iorder[] = {0, 1, 2, 3};
2227#else
2228 int iorder[] = {3, 2, 1, 0};
2229#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002230 PyObject *errorHandler = NULL;
2231 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002232 /* On narrow builds we split characters outside the BMP into two
2233 codepoints => count how much extra space we need. */
2234#ifndef Py_UNICODE_WIDE
2235 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002236 if (((Py_UCS4 *)s)[i] >= 0x10000)
2237 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002238#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002239
2240 /* This might be one to much, because of a BOM */
2241 unicode = _PyUnicode_New((size+3)/4+pairs);
2242 if (!unicode)
2243 return NULL;
2244 if (size == 0)
2245 return (PyObject *)unicode;
2246
2247 /* Unpack UTF-32 encoded data */
2248 p = unicode->str;
2249 q = (unsigned char *)s;
2250 e = q + size;
2251
2252 if (byteorder)
2253 bo = *byteorder;
2254
2255 /* Check for BOM marks (U+FEFF) in the input and adjust current
2256 byte order setting accordingly. In native mode, the leading BOM
2257 mark is skipped, in all other modes, it is copied to the output
2258 stream as-is (giving a ZWNBSP character). */
2259 if (bo == 0) {
2260 if (size >= 4) {
2261 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002262 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002263#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002264 if (bom == 0x0000FEFF) {
2265 q += 4;
2266 bo = -1;
2267 }
2268 else if (bom == 0xFFFE0000) {
2269 q += 4;
2270 bo = 1;
2271 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002272#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002273 if (bom == 0x0000FEFF) {
2274 q += 4;
2275 bo = 1;
2276 }
2277 else if (bom == 0xFFFE0000) {
2278 q += 4;
2279 bo = -1;
2280 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002281#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002282 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002283 }
2284
2285 if (bo == -1) {
2286 /* force LE */
2287 iorder[0] = 0;
2288 iorder[1] = 1;
2289 iorder[2] = 2;
2290 iorder[3] = 3;
2291 }
2292 else if (bo == 1) {
2293 /* force BE */
2294 iorder[0] = 3;
2295 iorder[1] = 2;
2296 iorder[2] = 1;
2297 iorder[3] = 0;
2298 }
2299
2300 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002301 Py_UCS4 ch;
2302 /* remaining bytes at the end? (size should be divisible by 4) */
2303 if (e-q<4) {
2304 if (consumed)
2305 break;
2306 errmsg = "truncated data";
2307 startinpos = ((const char *)q)-starts;
2308 endinpos = ((const char *)e)-starts;
2309 goto utf32Error;
2310 /* The remaining input chars are ignored if the callback
2311 chooses to skip the input */
2312 }
2313 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2314 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002316 if (ch >= 0x110000)
2317 {
2318 errmsg = "codepoint not in range(0x110000)";
2319 startinpos = ((const char *)q)-starts;
2320 endinpos = startinpos+4;
2321 goto utf32Error;
2322 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002323#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002324 if (ch >= 0x10000)
2325 {
2326 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2327 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2328 }
2329 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002330#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002331 *p++ = ch;
2332 q += 4;
2333 continue;
2334 utf32Error:
2335 outpos = p-PyUnicode_AS_UNICODE(unicode);
2336 if (unicode_decode_call_errorhandler(
2337 errors, &errorHandler,
2338 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002339 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002340 &unicode, &outpos, &p))
2341 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002342 }
2343
2344 if (byteorder)
2345 *byteorder = bo;
2346
2347 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002348 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002349
2350 /* Adjust length */
2351 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2352 goto onError;
2353
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return (PyObject *)unicode;
2357
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002358 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002359 Py_DECREF(unicode);
2360 Py_XDECREF(errorHandler);
2361 Py_XDECREF(exc);
2362 return NULL;
2363}
2364
2365PyObject *
2366PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002367 Py_ssize_t size,
2368 const char *errors,
2369 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002370{
2371 PyObject *v;
2372 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002373 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002374#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002375 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002376#else
2377 const int pairs = 0;
2378#endif
2379 /* Offsets from p for storing byte pairs in the right order. */
2380#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2381 int iorder[] = {0, 1, 2, 3};
2382#else
2383 int iorder[] = {3, 2, 1, 0};
2384#endif
2385
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002386#define STORECHAR(CH) \
2387 do { \
2388 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2389 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2390 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2391 p[iorder[0]] = (CH) & 0xff; \
2392 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002393 } while(0)
2394
2395 /* In narrow builds we can output surrogate pairs as one codepoint,
2396 so we need less space. */
2397#ifndef Py_UNICODE_WIDE
2398 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2400 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2401 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002402#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 nsize = (size - pairs + (byteorder == 0));
2404 bytesize = nsize * 4;
2405 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002406 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002407 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (v == NULL)
2409 return NULL;
2410
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002411 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002412 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002413 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002414 if (size == 0)
2415 return v;
2416
2417 if (byteorder == -1) {
2418 /* force LE */
2419 iorder[0] = 0;
2420 iorder[1] = 1;
2421 iorder[2] = 2;
2422 iorder[3] = 3;
2423 }
2424 else if (byteorder == 1) {
2425 /* force BE */
2426 iorder[0] = 3;
2427 iorder[1] = 2;
2428 iorder[2] = 1;
2429 iorder[3] = 0;
2430 }
2431
2432 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002433 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002434#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002435 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2436 Py_UCS4 ch2 = *s;
2437 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2438 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2439 s++;
2440 size--;
2441 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002442 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002443#endif
2444 STORECHAR(ch);
2445 }
2446 return v;
2447#undef STORECHAR
2448}
2449
2450PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2451{
2452 if (!PyUnicode_Check(unicode)) {
2453 PyErr_BadArgument();
2454 return NULL;
2455 }
2456 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002457 PyUnicode_GET_SIZE(unicode),
2458 NULL,
2459 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002460}
2461
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462/* --- UTF-16 Codec ------------------------------------------------------- */
2463
Tim Peters772747b2001-08-09 22:21:55 +00002464PyObject *
2465PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002466 Py_ssize_t size,
2467 const char *errors,
2468 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469{
Walter Dörwald69652032004-09-07 20:24:22 +00002470 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2471}
2472
2473PyObject *
2474PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002475 Py_ssize_t size,
2476 const char *errors,
2477 int *byteorder,
2478 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002481 Py_ssize_t startinpos;
2482 Py_ssize_t endinpos;
2483 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 PyUnicodeObject *unicode;
2485 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002486 const unsigned char *q, *e;
2487 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002489 /* Offsets from q for retrieving byte pairs in the right order. */
2490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2491 int ihi = 1, ilo = 0;
2492#else
2493 int ihi = 0, ilo = 1;
2494#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 PyObject *errorHandler = NULL;
2496 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497
2498 /* Note: size will always be longer than the resulting Unicode
2499 character count */
2500 unicode = _PyUnicode_New(size);
2501 if (!unicode)
2502 return NULL;
2503 if (size == 0)
2504 return (PyObject *)unicode;
2505
2506 /* Unpack UTF-16 encoded data */
2507 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002508 q = (unsigned char *)s;
2509 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510
2511 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002512 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002514 /* Check for BOM marks (U+FEFF) in the input and adjust current
2515 byte order setting accordingly. In native mode, the leading BOM
2516 mark is skipped, in all other modes, it is copied to the output
2517 stream as-is (giving a ZWNBSP character). */
2518 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002519 if (size >= 2) {
2520 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002521#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002522 if (bom == 0xFEFF) {
2523 q += 2;
2524 bo = -1;
2525 }
2526 else if (bom == 0xFFFE) {
2527 q += 2;
2528 bo = 1;
2529 }
Tim Petersced69f82003-09-16 20:30:58 +00002530#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002531 if (bom == 0xFEFF) {
2532 q += 2;
2533 bo = 1;
2534 }
2535 else if (bom == 0xFFFE) {
2536 q += 2;
2537 bo = -1;
2538 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002539#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002540 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542
Tim Peters772747b2001-08-09 22:21:55 +00002543 if (bo == -1) {
2544 /* force LE */
2545 ihi = 1;
2546 ilo = 0;
2547 }
2548 else if (bo == 1) {
2549 /* force BE */
2550 ihi = 0;
2551 ilo = 1;
2552 }
2553
2554 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002555 Py_UNICODE ch;
2556 /* remaining bytes at the end? (size should be even) */
2557 if (e-q<2) {
2558 if (consumed)
2559 break;
2560 errmsg = "truncated data";
2561 startinpos = ((const char *)q)-starts;
2562 endinpos = ((const char *)e)-starts;
2563 goto utf16Error;
2564 /* The remaining input chars are ignored if the callback
2565 chooses to skip the input */
2566 }
2567 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002568
Benjamin Peterson857ce152009-01-31 16:29:18 +00002569 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002570
2571 if (ch < 0xD800 || ch > 0xDFFF) {
2572 *p++ = ch;
2573 continue;
2574 }
2575
2576 /* UTF-16 code pair: */
2577 if (q >= e) {
2578 errmsg = "unexpected end of data";
2579 startinpos = (((const char *)q)-2)-starts;
2580 endinpos = ((const char *)e)-starts;
2581 goto utf16Error;
2582 }
2583 if (0xD800 <= ch && ch <= 0xDBFF) {
2584 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2585 q += 2;
2586 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002587#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002588 *p++ = ch;
2589 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002591 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002592#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002593 continue;
2594 }
2595 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002596 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002597 startinpos = (((const char *)q)-4)-starts;
2598 endinpos = startinpos+2;
2599 goto utf16Error;
2600 }
2601
Benjamin Peterson857ce152009-01-31 16:29:18 +00002602 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002603 errmsg = "illegal encoding";
2604 startinpos = (((const char *)q)-2)-starts;
2605 endinpos = startinpos+2;
2606 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002607
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002608 utf16Error:
2609 outpos = p-PyUnicode_AS_UNICODE(unicode);
2610 if (unicode_decode_call_errorhandler(
2611 errors, &errorHandler,
2612 "utf16", errmsg,
2613 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2614 &unicode, &outpos, &p))
2615 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 }
2617
2618 if (byteorder)
2619 *byteorder = bo;
2620
Walter Dörwald69652032004-09-07 20:24:22 +00002621 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002622 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002623
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002625 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 goto onError;
2627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 Py_XDECREF(errorHandler);
2629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 return (PyObject *)unicode;
2631
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002632 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 Py_XDECREF(errorHandler);
2635 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 return NULL;
2637}
2638
Tim Peters772747b2001-08-09 22:21:55 +00002639PyObject *
2640PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002641 Py_ssize_t size,
2642 const char *errors,
2643 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644{
2645 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002646 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002647 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002648#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002649 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#else
2651 const int pairs = 0;
2652#endif
Tim Peters772747b2001-08-09 22:21:55 +00002653 /* Offsets from p for storing byte pairs in the right order. */
2654#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2655 int ihi = 1, ilo = 0;
2656#else
2657 int ihi = 0, ilo = 1;
2658#endif
2659
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002660#define STORECHAR(CH) \
2661 do { \
2662 p[ihi] = ((CH) >> 8) & 0xff; \
2663 p[ilo] = (CH) & 0xff; \
2664 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002665 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002667#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002668 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 if (s[i] >= 0x10000)
2670 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002671#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002672 /* 2 * (size + pairs + (byteorder == 0)) */
2673 if (size > PY_SSIZE_T_MAX ||
2674 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002675 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002676 nsize = size + pairs + (byteorder == 0);
2677 bytesize = nsize * 2;
2678 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002680 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 if (v == NULL)
2682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002684 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002686 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002687 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002688 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002689
2690 if (byteorder == -1) {
2691 /* force LE */
2692 ihi = 1;
2693 ilo = 0;
2694 }
2695 else if (byteorder == 1) {
2696 /* force BE */
2697 ihi = 0;
2698 ilo = 1;
2699 }
2700
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002701 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002702 Py_UNICODE ch = *s++;
2703 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002704#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002705 if (ch >= 0x10000) {
2706 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2707 ch = 0xD800 | ((ch-0x10000) >> 10);
2708 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002709#endif
Tim Peters772747b2001-08-09 22:21:55 +00002710 STORECHAR(ch);
2711 if (ch2)
2712 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002715#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716}
2717
2718PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2719{
2720 if (!PyUnicode_Check(unicode)) {
2721 PyErr_BadArgument();
2722 return NULL;
2723 }
2724 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002725 PyUnicode_GET_SIZE(unicode),
2726 NULL,
2727 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728}
2729
2730/* --- Unicode Escape Codec ----------------------------------------------- */
2731
Fredrik Lundh06d12682001-01-24 07:59:11 +00002732static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002735 Py_ssize_t size,
2736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002739 Py_ssize_t startinpos;
2740 Py_ssize_t endinpos;
2741 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746 char* message;
2747 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 PyObject *errorHandler = NULL;
2749 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002750
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 /* Escaped strings will always be longer than the resulting
2752 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 length after conversion to the true value.
2754 (but if the error callback returns a long replacement string
2755 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 v = _PyUnicode_New(size);
2757 if (v == NULL)
2758 goto onError;
2759 if (size == 0)
2760 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002764
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 while (s < end) {
2766 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002767 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769
2770 /* Non-escape characters are interpreted as Unicode ordinals */
2771 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002772 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 continue;
2774 }
2775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 /* \ - Escapes */
2778 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002779 c = *s++;
2780 if (s > end)
2781 c = '\0'; /* Invalid after \ */
2782 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002784 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 case '\n': break;
2786 case '\\': *p++ = '\\'; break;
2787 case '\'': *p++ = '\''; break;
2788 case '\"': *p++ = '\"'; break;
2789 case 'b': *p++ = '\b'; break;
2790 case 'f': *p++ = '\014'; break; /* FF */
2791 case 't': *p++ = '\t'; break;
2792 case 'n': *p++ = '\n'; break;
2793 case 'r': *p++ = '\r'; break;
2794 case 'v': *p++ = '\013'; break; /* VT */
2795 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2796
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002797 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 case '0': case '1': case '2': case '3':
2799 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002801 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002803 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002804 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002806 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 break;
2808
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002809 /* hex escapes */
2810 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812 digits = 2;
2813 message = "truncated \\xXX escape";
2814 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002816 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 digits = 4;
2819 message = "truncated \\uXXXX escape";
2820 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002822 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002823 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824 digits = 8;
2825 message = "truncated \\UXXXXXXXX escape";
2826 hexescape:
2827 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 outpos = p-PyUnicode_AS_UNICODE(v);
2829 if (s+digits>end) {
2830 endinpos = size;
2831 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002832 errors, &errorHandler,
2833 "unicodeescape", "end of string in escape sequence",
2834 starts, size, &startinpos, &endinpos, &exc, &s,
2835 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 goto onError;
2837 goto nextByte;
2838 }
2839 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002840 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002841 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 endinpos = (s+i+1)-starts;
2843 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002844 errors, &errorHandler,
2845 "unicodeescape", message,
2846 starts, size, &startinpos, &endinpos, &exc, &s,
2847 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002848 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002850 }
2851 chr = (chr<<4) & ~0xF;
2852 if (c >= '0' && c <= '9')
2853 chr += c - '0';
2854 else if (c >= 'a' && c <= 'f')
2855 chr += 10 + c - 'a';
2856 else
2857 chr += 10 + c - 'A';
2858 }
2859 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002860 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 /* _decoding_error will have already written into the
2862 target buffer. */
2863 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002864 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002865 /* when we get here, chr is a 32-bit unicode character */
2866 if (chr <= 0xffff)
2867 /* UCS-2 character */
2868 *p++ = (Py_UNICODE) chr;
2869 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002870 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002871 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002872#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002873 *p++ = chr;
2874#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002875 chr -= 0x10000L;
2876 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002877 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002878#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002879 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 endinpos = s-starts;
2881 outpos = p-PyUnicode_AS_UNICODE(v);
2882 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002883 errors, &errorHandler,
2884 "unicodeescape", "illegal Unicode character",
2885 starts, size, &startinpos, &endinpos, &exc, &s,
2886 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002887 goto onError;
2888 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 break;
2890
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002891 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 case 'N':
2893 message = "malformed \\N character escape";
2894 if (ucnhash_CAPI == NULL) {
2895 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002896 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002897 if (ucnhash_CAPI == NULL)
2898 goto ucnhashError;
2899 }
2900 if (*s == '{') {
2901 const char *start = s+1;
2902 /* look for the closing brace */
2903 while (*s != '}' && s < end)
2904 s++;
2905 if (s > start && s < end && *s == '}') {
2906 /* found a name. look it up in the unicode database */
2907 message = "unknown Unicode character name";
2908 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002909 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002910 goto store;
2911 }
2912 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 endinpos = s-starts;
2914 outpos = p-PyUnicode_AS_UNICODE(v);
2915 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002916 errors, &errorHandler,
2917 "unicodeescape", message,
2918 starts, size, &startinpos, &endinpos, &exc, &s,
2919 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002920 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002921 break;
2922
2923 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002924 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 message = "\\ at end of string";
2926 s--;
2927 endinpos = s-starts;
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002930 errors, &errorHandler,
2931 "unicodeescape", message,
2932 starts, size, &startinpos, &endinpos, &exc, &s,
2933 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002934 goto onError;
2935 }
2936 else {
2937 *p++ = '\\';
2938 *p++ = (unsigned char)s[-1];
2939 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002940 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002942 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002945 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002946 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002947 Py_XDECREF(errorHandler);
2948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002950
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002951 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002952 PyErr_SetString(
2953 PyExc_UnicodeError,
2954 "\\N escapes not supported (can't load unicodedata module)"
2955 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002959 return NULL;
2960
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002961 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 Py_XDECREF(errorHandler);
2964 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 return NULL;
2966}
2967
2968/* Return a Unicode-Escape string version of the Unicode object.
2969
2970 If quotes is true, the string is enclosed in u"" or u'' quotes as
2971 appropriate.
2972
2973*/
2974
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002975Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002976 Py_ssize_t size,
2977 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002978{
2979 /* like wcschr, but doesn't stop at NULL characters */
2980
2981 while (size-- > 0) {
2982 if (*s == ch)
2983 return s;
2984 s++;
2985 }
2986
2987 return NULL;
2988}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990static
2991PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002992 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 int quotes)
2994{
2995 PyObject *repr;
2996 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002998 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002999#ifdef Py_UNICODE_WIDE
3000 const Py_ssize_t expandsize = 10;
3001#else
3002 const Py_ssize_t expandsize = 6;
3003#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004
Neal Norwitz17753ec2006-08-21 22:21:19 +00003005 /* XXX(nnorwitz): rather than over-allocating, it would be
3006 better to choose a different scheme. Perhaps scan the
3007 first N-chars of the string and allocate based on that size.
3008 */
3009 /* Initial allocation is based on the longest-possible unichr
3010 escape.
3011
3012 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3013 unichr, so in this case it's the longest unichr escape. In
3014 narrow (UTF-16) builds this is five chars per source unichr
3015 since there are two unichrs in the surrogate pair, so in narrow
3016 (UTF-16) builds it's not the longest unichr escape.
3017
3018 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3019 so in the narrow (UTF-16) build case it's the longest unichr
3020 escape.
3021 */
3022
Neal Norwitze7d8be82008-07-31 17:17:14 +00003023 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003024 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003025
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003026 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003027 2
3028 + expandsize*size
3029 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (repr == NULL)
3031 return NULL;
3032
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003033 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034
3035 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003037 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 !findchar(s, size, '"')) ? '"' : '\'';
3039 }
3040 while (size-- > 0) {
3041 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003042
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003043 /* Escape quotes and backslashes */
3044 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003045 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 *p++ = '\\';
3047 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003048 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003049 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003050
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003051#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003052 /* Map 21-bit characters to '\U00xxxxxx' */
3053 else if (ch >= 0x10000) {
3054 *p++ = '\\';
3055 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003056 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3058 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3059 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3060 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3061 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3062 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003063 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003064 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003065 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003066#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003067 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3068 else if (ch >= 0xD800 && ch < 0xDC00) {
3069 Py_UNICODE ch2;
3070 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003072 ch2 = *s++;
3073 size--;
3074 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3075 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3076 *p++ = '\\';
3077 *p++ = 'U';
3078 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3080 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3081 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3082 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3083 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3084 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3085 *p++ = hexdigit[ucs & 0x0000000F];
3086 continue;
3087 }
3088 /* Fall through: isolated surrogates are copied as-is */
3089 s--;
3090 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003091 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003092#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003093
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003095 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 *p++ = '\\';
3097 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003098 *p++ = hexdigit[(ch >> 12) & 0x000F];
3099 *p++ = hexdigit[(ch >> 8) & 0x000F];
3100 *p++ = hexdigit[(ch >> 4) & 0x000F];
3101 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003103
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003104 /* Map special whitespace to '\t', \n', '\r' */
3105 else if (ch == '\t') {
3106 *p++ = '\\';
3107 *p++ = 't';
3108 }
3109 else if (ch == '\n') {
3110 *p++ = '\\';
3111 *p++ = 'n';
3112 }
3113 else if (ch == '\r') {
3114 *p++ = '\\';
3115 *p++ = 'r';
3116 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003117
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003118 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003119 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003121 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003122 *p++ = hexdigit[(ch >> 4) & 0x000F];
3123 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003125
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 /* Copy everything else as-is */
3127 else
3128 *p++ = (char) ch;
3129 }
3130 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003131 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132
3133 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003134 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 return repr;
3137}
3138
3139PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003140 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141{
3142 return unicodeescape_string(s, size, 0);
3143}
3144
3145PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3146{
3147 if (!PyUnicode_Check(unicode)) {
3148 PyErr_BadArgument();
3149 return NULL;
3150 }
3151 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153}
3154
3155/* --- Raw Unicode Escape Codec ------------------------------------------- */
3156
3157PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003158 Py_ssize_t size,
3159 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003162 Py_ssize_t startinpos;
3163 Py_ssize_t endinpos;
3164 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 const char *end;
3168 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 PyObject *errorHandler = NULL;
3170 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003171
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 /* Escaped strings will always be longer than the resulting
3173 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 length after conversion to the true value. (But decoding error
3175 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 v = _PyUnicode_New(size);
3177 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003180 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 end = s + size;
3183 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 unsigned char c;
3185 Py_UCS4 x;
3186 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003187 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003189 /* Non-escape characters are interpreted as Unicode ordinals */
3190 if (*s != '\\') {
3191 *p++ = (unsigned char)*s++;
3192 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003193 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003194 startinpos = s-starts;
3195
3196 /* \u-escapes are only interpreted iff the number of leading
3197 backslashes if odd */
3198 bs = s;
3199 for (;s < end;) {
3200 if (*s != '\\')
3201 break;
3202 *p++ = (unsigned char)*s++;
3203 }
3204 if (((s - bs) & 1) == 0 ||
3205 s >= end ||
3206 (*s != 'u' && *s != 'U')) {
3207 continue;
3208 }
3209 p--;
3210 count = *s=='u' ? 4 : 8;
3211 s++;
3212
3213 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3214 outpos = p-PyUnicode_AS_UNICODE(v);
3215 for (x = 0, i = 0; i < count; ++i, ++s) {
3216 c = (unsigned char)*s;
3217 if (!isxdigit(c)) {
3218 endinpos = s-starts;
3219 if (unicode_decode_call_errorhandler(
3220 errors, &errorHandler,
3221 "rawunicodeescape", "truncated \\uXXXX",
3222 starts, size, &startinpos, &endinpos, &exc, &s,
3223 &v, &outpos, &p))
3224 goto onError;
3225 goto nextByte;
3226 }
3227 x = (x<<4) & ~0xF;
3228 if (c >= '0' && c <= '9')
3229 x += c - '0';
3230 else if (c >= 'a' && c <= 'f')
3231 x += 10 + c - 'a';
3232 else
3233 x += 10 + c - 'A';
3234 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 /* UCS-2 character */
3237 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003238 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 /* UCS-4 character. Either store directly, or as
3240 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003242 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003243#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 x -= 0x10000L;
3245 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3246 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003247#endif
3248 } else {
3249 endinpos = s-starts;
3250 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 if (unicode_decode_call_errorhandler(
3252 errors, &errorHandler,
3253 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003254 starts, size, &startinpos, &endinpos, &exc, &s,
3255 &v, &outpos, &p))
3256 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003257 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 nextByte:
3259 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003261 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003266
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_XDECREF(errorHandler);
3270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 return NULL;
3272}
3273
3274PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
3277 PyObject *repr;
3278 char *p;
3279 char *q;
3280
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003281 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003282#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003283 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003286#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003287
Neal Norwitze7d8be82008-07-31 17:17:14 +00003288 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003290
Neal Norwitze7d8be82008-07-31 17:17:14 +00003291 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 if (repr == NULL)
3293 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003294 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003297 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 while (size-- > 0) {
3299 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003301 /* Map 32-bit characters to '\Uxxxxxxxx' */
3302 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003303 *p++ = '\\';
3304 *p++ = 'U';
3305 *p++ = hexdigit[(ch >> 28) & 0xf];
3306 *p++ = hexdigit[(ch >> 24) & 0xf];
3307 *p++ = hexdigit[(ch >> 20) & 0xf];
3308 *p++ = hexdigit[(ch >> 16) & 0xf];
3309 *p++ = hexdigit[(ch >> 12) & 0xf];
3310 *p++ = hexdigit[(ch >> 8) & 0xf];
3311 *p++ = hexdigit[(ch >> 4) & 0xf];
3312 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003313 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003314 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003315#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3317 if (ch >= 0xD800 && ch < 0xDC00) {
3318 Py_UNICODE ch2;
3319 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003320
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003321 ch2 = *s++;
3322 size--;
3323 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3324 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3325 *p++ = '\\';
3326 *p++ = 'U';
3327 *p++ = hexdigit[(ucs >> 28) & 0xf];
3328 *p++ = hexdigit[(ucs >> 24) & 0xf];
3329 *p++ = hexdigit[(ucs >> 20) & 0xf];
3330 *p++ = hexdigit[(ucs >> 16) & 0xf];
3331 *p++ = hexdigit[(ucs >> 12) & 0xf];
3332 *p++ = hexdigit[(ucs >> 8) & 0xf];
3333 *p++ = hexdigit[(ucs >> 4) & 0xf];
3334 *p++ = hexdigit[ucs & 0xf];
3335 continue;
3336 }
3337 /* Fall through: isolated surrogates are copied as-is */
3338 s--;
3339 size++;
3340 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003341#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003342 /* Map 16-bit characters to '\uxxxx' */
3343 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 *p++ = '\\';
3345 *p++ = 'u';
3346 *p++ = hexdigit[(ch >> 12) & 0xf];
3347 *p++ = hexdigit[(ch >> 8) & 0xf];
3348 *p++ = hexdigit[(ch >> 4) & 0xf];
3349 *p++ = hexdigit[ch & 15];
3350 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003351 /* Copy everything else as-is */
3352 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 *p++ = (char) ch;
3354 }
3355 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003356 if (_PyString_Resize(&repr, p - q))
3357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 return repr;
3359}
3360
3361PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3362{
3363 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 PyErr_BadArgument();
3365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
3367 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369}
3370
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003371/* --- Unicode Internal Codec ------------------------------------------- */
3372
3373PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003374 Py_ssize_t size,
3375 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003376{
3377 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003378 Py_ssize_t startinpos;
3379 Py_ssize_t endinpos;
3380 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003381 PyUnicodeObject *v;
3382 Py_UNICODE *p;
3383 const char *end;
3384 const char *reason;
3385 PyObject *errorHandler = NULL;
3386 PyObject *exc = NULL;
3387
Neal Norwitzd43069c2006-01-08 01:12:10 +00003388#ifdef Py_UNICODE_WIDE
3389 Py_UNICODE unimax = PyUnicode_GetMax();
3390#endif
3391
Armin Rigo7ccbca92006-10-04 12:17:45 +00003392 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003393 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3394 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003395 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003397 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003398 p = PyUnicode_AS_UNICODE(v);
3399 end = s + size;
3400
3401 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003402 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 /* We have to sanity check the raw data, otherwise doom looms for
3404 some malformed UCS-4 data. */
3405 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003407 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003408#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003409 end-s < Py_UNICODE_SIZE
3410 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003411 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003412 startinpos = s - starts;
3413 if (end-s < Py_UNICODE_SIZE) {
3414 endinpos = end-starts;
3415 reason = "truncated input";
3416 }
3417 else {
3418 endinpos = s - starts + Py_UNICODE_SIZE;
3419 reason = "illegal code point (> 0x10FFFF)";
3420 }
3421 outpos = p - PyUnicode_AS_UNICODE(v);
3422 if (unicode_decode_call_errorhandler(
3423 errors, &errorHandler,
3424 "unicode_internal", reason,
3425 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003426 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003427 goto onError;
3428 }
3429 }
3430 else {
3431 p++;
3432 s += Py_UNICODE_SIZE;
3433 }
3434 }
3435
Martin v. Löwis412fb672006-04-13 06:34:32 +00003436 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 goto onError;
3438 Py_XDECREF(errorHandler);
3439 Py_XDECREF(exc);
3440 return (PyObject *)v;
3441
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003443 Py_XDECREF(v);
3444 Py_XDECREF(errorHandler);
3445 Py_XDECREF(exc);
3446 return NULL;
3447}
3448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449/* --- Latin-1 Codec ------------------------------------------------------ */
3450
3451PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003452 Py_ssize_t size,
3453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
3455 PyUnicodeObject *v;
3456 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003457
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003459 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003460 Py_UNICODE r = *(unsigned char*)s;
3461 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003462 }
3463
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 v = _PyUnicode_New(size);
3465 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003468 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 p = PyUnicode_AS_UNICODE(v);
3470 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003471 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003473
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 Py_XDECREF(v);
3476 return NULL;
3477}
3478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479/* create or adjust a UnicodeEncodeError */
3480static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 const char *encoding,
3482 const Py_UNICODE *unicode, Py_ssize_t size,
3483 Py_ssize_t startpos, Py_ssize_t endpos,
3484 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003487 *exceptionObject = PyUnicodeEncodeError_Create(
3488 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
3490 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003491 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3492 goto onError;
3493 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3494 goto onError;
3495 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3496 goto onError;
3497 return;
3498 onError:
3499 Py_DECREF(*exceptionObject);
3500 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 }
3502}
3503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504/* raises a UnicodeEncodeError */
3505static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 const char *encoding,
3507 const Py_UNICODE *unicode, Py_ssize_t size,
3508 Py_ssize_t startpos, Py_ssize_t endpos,
3509 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510{
3511 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003514 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515}
3516
3517/* error handling callback helper:
3518 build arguments, call the callback and check the arguments,
3519 put the result into newpos and return the replacement string, which
3520 has to be freed by the caller */
3521static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003522 PyObject **errorHandler,
3523 const char *encoding, const char *reason,
3524 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3525 Py_ssize_t startpos, Py_ssize_t endpos,
3526 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529
3530 PyObject *restuple;
3531 PyObject *resunicode;
3532
3533 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
3538
3539 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543
3544 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003549 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 Py_DECREF(restuple);
3551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 }
3553 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 &resunicode, newpos)) {
3555 Py_DECREF(restuple);
3556 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 }
3558 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003559 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003560 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003561 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3562 Py_DECREF(restuple);
3563 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003564 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_INCREF(resunicode);
3566 Py_DECREF(restuple);
3567 return resunicode;
3568}
3569
3570static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003571 Py_ssize_t size,
3572 const char *errors,
3573 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574{
3575 /* output object */
3576 PyObject *res;
3577 /* pointers to the beginning and end+1 of input */
3578 const Py_UNICODE *startp = p;
3579 const Py_UNICODE *endp = p + size;
3580 /* pointer to the beginning of the unencodable characters */
3581 /* const Py_UNICODE *badp = NULL; */
3582 /* pointer into the output */
3583 char *str;
3584 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 Py_ssize_t respos = 0;
3586 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003587 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3588 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 PyObject *errorHandler = NULL;
3590 PyObject *exc = NULL;
3591 /* the following variable is used for caching string comparisons
3592 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3593 int known_errorHandler = -1;
3594
3595 /* allocate enough for a simple encoding without
3596 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003597 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 if (res == NULL)
3599 goto onError;
3600 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003602 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 ressize = size;
3604
3605 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003606 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003608 /* can we encode this? */
3609 if (c<limit) {
3610 /* no overflow check, because we know that the space is enough */
3611 *str++ = (char)c;
3612 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003613 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003614 else {
3615 Py_ssize_t unicodepos = p-startp;
3616 Py_ssize_t requiredsize;
3617 PyObject *repunicode;
3618 Py_ssize_t repsize;
3619 Py_ssize_t newpos;
3620 Py_ssize_t respos;
3621 Py_UNICODE *uni2;
3622 /* startpos for collecting unencodable chars */
3623 const Py_UNICODE *collstart = p;
3624 const Py_UNICODE *collend = p;
3625 /* find all unecodable characters */
3626 while ((collend < endp) && ((*collend)>=limit))
3627 ++collend;
3628 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3629 if (known_errorHandler==-1) {
3630 if ((errors==NULL) || (!strcmp(errors, "strict")))
3631 known_errorHandler = 1;
3632 else if (!strcmp(errors, "replace"))
3633 known_errorHandler = 2;
3634 else if (!strcmp(errors, "ignore"))
3635 known_errorHandler = 3;
3636 else if (!strcmp(errors, "xmlcharrefreplace"))
3637 known_errorHandler = 4;
3638 else
3639 known_errorHandler = 0;
3640 }
3641 switch (known_errorHandler) {
3642 case 1: /* strict */
3643 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3644 goto onError;
3645 case 2: /* replace */
3646 while (collstart++<collend)
3647 *str++ = '?'; /* fall through */
3648 case 3: /* ignore */
3649 p = collend;
3650 break;
3651 case 4: /* xmlcharrefreplace */
3652 respos = str-PyString_AS_STRING(res);
3653 /* determine replacement size (temporarily (mis)uses p) */
3654 for (p = collstart, repsize = 0; p < collend; ++p) {
3655 if (*p<10)
3656 repsize += 2+1+1;
3657 else if (*p<100)
3658 repsize += 2+2+1;
3659 else if (*p<1000)
3660 repsize += 2+3+1;
3661 else if (*p<10000)
3662 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003663#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003664 else
3665 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003666#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 else if (*p<100000)
3668 repsize += 2+5+1;
3669 else if (*p<1000000)
3670 repsize += 2+6+1;
3671 else
3672 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003673#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003674 }
3675 requiredsize = respos+repsize+(endp-collend);
3676 if (requiredsize > ressize) {
3677 if (requiredsize<2*ressize)
3678 requiredsize = 2*ressize;
3679 if (_PyString_Resize(&res, requiredsize))
3680 goto onError;
3681 str = PyString_AS_STRING(res) + respos;
3682 ressize = requiredsize;
3683 }
3684 /* generate replacement (temporarily (mis)uses p) */
3685 for (p = collstart; p < collend; ++p) {
3686 str += sprintf(str, "&#%d;", (int)*p);
3687 }
3688 p = collend;
3689 break;
3690 default:
3691 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3692 encoding, reason, startp, size, &exc,
3693 collstart-startp, collend-startp, &newpos);
3694 if (repunicode == NULL)
3695 goto onError;
3696 /* need more space? (at least enough for what we have+the
3697 replacement+the rest of the string, so we won't have to
3698 check space for encodable characters) */
3699 respos = str-PyString_AS_STRING(res);
3700 repsize = PyUnicode_GET_SIZE(repunicode);
3701 requiredsize = respos+repsize+(endp-collend);
3702 if (requiredsize > ressize) {
3703 if (requiredsize<2*ressize)
3704 requiredsize = 2*ressize;
3705 if (_PyString_Resize(&res, requiredsize)) {
3706 Py_DECREF(repunicode);
3707 goto onError;
3708 }
3709 str = PyString_AS_STRING(res) + respos;
3710 ressize = requiredsize;
3711 }
3712 /* check if there is anything unencodable in the replacement
3713 and copy it to the output */
3714 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3715 c = *uni2;
3716 if (c >= limit) {
3717 raise_encode_exception(&exc, encoding, startp, size,
3718 unicodepos, unicodepos+1, reason);
3719 Py_DECREF(repunicode);
3720 goto onError;
3721 }
3722 *str = (char)c;
3723 }
3724 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003725 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003726 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003727 }
3728 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003730 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003732 /* If this falls res will be NULL */
3733 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc);
3736 return res;
3737
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003738 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 Py_XDECREF(res);
3740 Py_XDECREF(errorHandler);
3741 Py_XDECREF(exc);
3742 return NULL;
3743}
3744
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003746 Py_ssize_t size,
3747 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750}
3751
3752PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3753{
3754 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003755 PyErr_BadArgument();
3756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 }
3758 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 PyUnicode_GET_SIZE(unicode),
3760 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761}
3762
3763/* --- 7-bit ASCII Codec -------------------------------------------------- */
3764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003766 Py_ssize_t size,
3767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 PyUnicodeObject *v;
3771 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003772 Py_ssize_t startinpos;
3773 Py_ssize_t endinpos;
3774 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 const char *e;
3776 PyObject *errorHandler = NULL;
3777 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003780 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 Py_UNICODE r = *(unsigned char*)s;
3782 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003783 }
Tim Petersced69f82003-09-16 20:30:58 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 v = _PyUnicode_New(size);
3786 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 e = s + size;
3792 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003793 register unsigned char c = (unsigned char)*s;
3794 if (c < 128) {
3795 *p++ = c;
3796 ++s;
3797 }
3798 else {
3799 startinpos = s-starts;
3800 endinpos = startinpos + 1;
3801 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3802 if (unicode_decode_call_errorhandler(
3803 errors, &errorHandler,
3804 "ascii", "ordinal not in range(128)",
3805 starts, size, &startinpos, &endinpos, &exc, &s,
3806 &v, &outpos, &p))
3807 goto onError;
3808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003810 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3812 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 Py_XDECREF(errorHandler);
3814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003817 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 Py_XDECREF(errorHandler);
3820 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 return NULL;
3822}
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003825 Py_ssize_t size,
3826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829}
3830
3831PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3832{
3833 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 PyErr_BadArgument();
3835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 }
3837 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 PyUnicode_GET_SIZE(unicode),
3839 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840}
3841
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003842#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003843
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003844/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003845
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003846#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003847#define NEED_RETRY
3848#endif
3849
3850/* XXX This code is limited to "true" double-byte encodings, as
3851 a) it assumes an incomplete character consists of a single byte, and
3852 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003853 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003854
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no earlier character (it returns the same
   position), when the previous character does not itself start with a
   lead byte, or when the previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before) == 2;
}
3865
3866/*
3867 * Decode MBCS string into unicode object. If 'final' is set, converts
3868 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3869 */
3870static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003871 const char *s, /* MBCS string */
3872 int size, /* sizeof MBCS string */
3873 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003874{
3875 Py_UNICODE *p;
3876 Py_ssize_t n = 0;
3877 int usize = 0;
3878
3879 assert(size >= 0);
3880
3881 /* Skip trailing lead-byte unless 'final' is set */
3882 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003884
3885 /* First get the size of the result */
3886 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3888 if (usize == 0) {
3889 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3890 return -1;
3891 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 }
3893
3894 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003895 /* Create unicode object */
3896 *v = _PyUnicode_New(usize);
3897 if (*v == NULL)
3898 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003899 }
3900 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003901 /* Extend unicode object */
3902 n = PyUnicode_GET_SIZE(*v);
3903 if (_PyUnicode_Resize(v, n + usize) < 0)
3904 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905 }
3906
3907 /* Do the conversion */
3908 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003909 p = PyUnicode_AS_UNICODE(*v) + n;
3910 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3911 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3912 return -1;
3913 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914 }
3915
3916 return size;
3917}
3918
3919PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003920 Py_ssize_t size,
3921 const char *errors,
3922 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923{
3924 PyUnicodeObject *v = NULL;
3925 int done;
3926
3927 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003928 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003929
3930#ifdef NEED_RETRY
3931 retry:
3932 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003933 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003934 else
3935#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937
3938 if (done < 0) {
3939 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941 }
3942
3943 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003944 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945
3946#ifdef NEED_RETRY
3947 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 s += done;
3949 size -= done;
3950 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003951 }
3952#endif
3953
3954 return (PyObject *)v;
3955}
3956
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003957PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003958 Py_ssize_t size,
3959 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003960{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003961 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3962}
3963
3964/*
3965 * Convert unicode into string object (MBCS).
3966 * Returns 0 if succeed, -1 otherwise.
3967 */
3968static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 const Py_UNICODE *p, /* unicode */
3970 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971{
3972 int mbcssize = 0;
3973 Py_ssize_t n = 0;
3974
3975 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976
3977 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003978 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3980 if (mbcssize == 0) {
3981 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3982 return -1;
3983 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003984 }
3985
Martin v. Löwisd8251432006-06-14 05:21:04 +00003986 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 /* Create string object */
3988 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3989 if (*repr == NULL)
3990 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003991 }
3992 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 /* Extend string object */
3994 n = PyString_Size(*repr);
3995 if (_PyString_Resize(repr, n + mbcssize) < 0)
3996 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003997 }
3998
3999 /* Do the conversion */
4000 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 char *s = PyString_AS_STRING(*repr) + n;
4002 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4003 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4004 return -1;
4005 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004006 }
4007
4008 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004009}
4010
4011PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 Py_ssize_t size,
4013 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004014{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 PyObject *repr = NULL;
4016 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004017
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004019 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004021 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022 else
4023#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004024 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004025
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 Py_XDECREF(repr);
4028 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004029 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030
4031#ifdef NEED_RETRY
4032 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004033 p += INT_MAX;
4034 size -= INT_MAX;
4035 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004036 }
4037#endif
4038
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004039 return repr;
4040}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004041
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004042PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4043{
4044 if (!PyUnicode_Check(unicode)) {
4045 PyErr_BadArgument();
4046 return NULL;
4047 }
4048 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004049 PyUnicode_GET_SIZE(unicode),
4050 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004051}
4052
Martin v. Löwisd8251432006-06-14 05:21:04 +00004053#undef NEED_RETRY
4054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004055#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057/* --- Character Mapping Codec -------------------------------------------- */
4058
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004060 Py_ssize_t size,
4061 PyObject *mapping,
4062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t startinpos;
4066 Py_ssize_t endinpos;
4067 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 PyUnicodeObject *v;
4070 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 PyObject *errorHandler = NULL;
4073 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004074 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004075 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 /* Default to Latin-1 */
4078 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080
4081 v = _PyUnicode_New(size);
4082 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004088 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 mapstring = PyUnicode_AS_UNICODE(mapping);
4090 maplen = PyUnicode_GET_SIZE(mapping);
4091 while (s < e) {
4092 unsigned char ch = *s;
4093 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004095 if (ch < maplen)
4096 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 if (x == 0xfffe) {
4099 /* undefined mapping */
4100 outpos = p-PyUnicode_AS_UNICODE(v);
4101 startinpos = s-starts;
4102 endinpos = startinpos+1;
4103 if (unicode_decode_call_errorhandler(
4104 errors, &errorHandler,
4105 "charmap", "character maps to <undefined>",
4106 starts, size, &startinpos, &endinpos, &exc, &s,
4107 &v, &outpos, &p)) {
4108 goto onError;
4109 }
4110 continue;
4111 }
4112 *p++ = x;
4113 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004114 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004115 }
4116 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 while (s < e) {
4118 unsigned char ch = *s;
4119 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004120
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004121 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4122 w = PyInt_FromLong((long)ch);
4123 if (w == NULL)
4124 goto onError;
4125 x = PyObject_GetItem(mapping, w);
4126 Py_DECREF(w);
4127 if (x == NULL) {
4128 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4129 /* No mapping found means: mapping is undefined. */
4130 PyErr_Clear();
4131 x = Py_None;
4132 Py_INCREF(x);
4133 } else
4134 goto onError;
4135 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004136
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004137 /* Apply mapping */
4138 if (PyInt_Check(x)) {
4139 long value = PyInt_AS_LONG(x);
4140 if (value < 0 || value > 65535) {
4141 PyErr_SetString(PyExc_TypeError,
4142 "character mapping must be in range(65536)");
4143 Py_DECREF(x);
4144 goto onError;
4145 }
4146 *p++ = (Py_UNICODE)value;
4147 }
4148 else if (x == Py_None) {
4149 /* undefined mapping */
4150 outpos = p-PyUnicode_AS_UNICODE(v);
4151 startinpos = s-starts;
4152 endinpos = startinpos+1;
4153 if (unicode_decode_call_errorhandler(
4154 errors, &errorHandler,
4155 "charmap", "character maps to <undefined>",
4156 starts, size, &startinpos, &endinpos, &exc, &s,
4157 &v, &outpos, &p)) {
4158 Py_DECREF(x);
4159 goto onError;
4160 }
4161 Py_DECREF(x);
4162 continue;
4163 }
4164 else if (PyUnicode_Check(x)) {
4165 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004166
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004167 if (targetsize == 1)
4168 /* 1-1 mapping */
4169 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004170
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004171 else if (targetsize > 1) {
4172 /* 1-n mapping */
4173 if (targetsize > extrachars) {
4174 /* resize first */
4175 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4176 Py_ssize_t needed = (targetsize - extrachars) + \
4177 (targetsize << 2);
4178 extrachars += needed;
4179 /* XXX overflow detection missing */
4180 if (_PyUnicode_Resize(&v,
4181 PyUnicode_GET_SIZE(v) + needed) < 0) {
4182 Py_DECREF(x);
4183 goto onError;
4184 }
4185 p = PyUnicode_AS_UNICODE(v) + oldpos;
4186 }
4187 Py_UNICODE_COPY(p,
4188 PyUnicode_AS_UNICODE(x),
4189 targetsize);
4190 p += targetsize;
4191 extrachars -= targetsize;
4192 }
4193 /* 1-0 mapping: skip the character */
4194 }
4195 else {
4196 /* wrong return value */
4197 PyErr_SetString(PyExc_TypeError,
4198 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004199 Py_DECREF(x);
4200 goto onError;
4201 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004202 Py_DECREF(x);
4203 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 }
4206 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004207 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 Py_XDECREF(errorHandler);
4210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004212
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004213 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 Py_XDECREF(errorHandler);
4215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 Py_XDECREF(v);
4217 return NULL;
4218}
4219
Martin v. Löwis3f767792006-06-04 19:36:28 +00004220/* Charmap encoding: the lookup table */
4221
4222struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004223 PyObject_HEAD
4224 unsigned char level1[32];
4225 int count2, count3;
4226 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004227};
4228
4229static PyObject*
4230encoding_map_size(PyObject *obj, PyObject* args)
4231{
4232 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004233 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004234 128*map->count3);
4235}
4236
4237static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004238 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004239 PyDoc_STR("Return the size (in bytes) of this object") },
4240 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004241};
4242
4243static void
4244encoding_map_dealloc(PyObject* o)
4245{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004246 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004247}
4248
4249static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004250 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004251 "EncodingMap", /*tp_name*/
4252 sizeof(struct encoding_map), /*tp_basicsize*/
4253 0, /*tp_itemsize*/
4254 /* methods */
4255 encoding_map_dealloc, /*tp_dealloc*/
4256 0, /*tp_print*/
4257 0, /*tp_getattr*/
4258 0, /*tp_setattr*/
4259 0, /*tp_compare*/
4260 0, /*tp_repr*/
4261 0, /*tp_as_number*/
4262 0, /*tp_as_sequence*/
4263 0, /*tp_as_mapping*/
4264 0, /*tp_hash*/
4265 0, /*tp_call*/
4266 0, /*tp_str*/
4267 0, /*tp_getattro*/
4268 0, /*tp_setattro*/
4269 0, /*tp_as_buffer*/
4270 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4271 0, /*tp_doc*/
4272 0, /*tp_traverse*/
4273 0, /*tp_clear*/
4274 0, /*tp_richcompare*/
4275 0, /*tp_weaklistoffset*/
4276 0, /*tp_iter*/
4277 0, /*tp_iternext*/
4278 encoding_map_methods, /*tp_methods*/
4279 0, /*tp_members*/
4280 0, /*tp_getset*/
4281 0, /*tp_base*/
4282 0, /*tp_dict*/
4283 0, /*tp_descr_get*/
4284 0, /*tp_descr_set*/
4285 0, /*tp_dictoffset*/
4286 0, /*tp_init*/
4287 0, /*tp_alloc*/
4288 0, /*tp_new*/
4289 0, /*tp_free*/
4290 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004291};
4292
4293PyObject*
4294PyUnicode_BuildEncodingMap(PyObject* string)
4295{
4296 Py_UNICODE *decode;
4297 PyObject *result;
4298 struct encoding_map *mresult;
4299 int i;
4300 int need_dict = 0;
4301 unsigned char level1[32];
4302 unsigned char level2[512];
4303 unsigned char *mlevel1, *mlevel2, *mlevel3;
4304 int count2 = 0, count3 = 0;
4305
4306 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4307 PyErr_BadArgument();
4308 return NULL;
4309 }
4310 decode = PyUnicode_AS_UNICODE(string);
4311 memset(level1, 0xFF, sizeof level1);
4312 memset(level2, 0xFF, sizeof level2);
4313
4314 /* If there isn't a one-to-one mapping of NULL to \0,
4315 or if there are non-BMP characters, we need to use
4316 a mapping dictionary. */
4317 if (decode[0] != 0)
4318 need_dict = 1;
4319 for (i = 1; i < 256; i++) {
4320 int l1, l2;
4321 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004322#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004323 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004324#endif
4325 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004326 need_dict = 1;
4327 break;
4328 }
4329 if (decode[i] == 0xFFFE)
4330 /* unmapped character */
4331 continue;
4332 l1 = decode[i] >> 11;
4333 l2 = decode[i] >> 7;
4334 if (level1[l1] == 0xFF)
4335 level1[l1] = count2++;
4336 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004337 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004338 }
4339
4340 if (count2 >= 0xFF || count3 >= 0xFF)
4341 need_dict = 1;
4342
4343 if (need_dict) {
4344 PyObject *result = PyDict_New();
4345 PyObject *key, *value;
4346 if (!result)
4347 return NULL;
4348 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004349 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004350 key = PyInt_FromLong(decode[i]);
4351 value = PyInt_FromLong(i);
4352 if (!key || !value)
4353 goto failed1;
4354 if (PyDict_SetItem(result, key, value) == -1)
4355 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004356 Py_DECREF(key);
4357 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004358 }
4359 return result;
4360 failed1:
4361 Py_XDECREF(key);
4362 Py_XDECREF(value);
4363 Py_DECREF(result);
4364 return NULL;
4365 }
4366
4367 /* Create a three-level trie */
4368 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4369 16*count2 + 128*count3 - 1);
4370 if (!result)
4371 return PyErr_NoMemory();
4372 PyObject_Init(result, &EncodingMapType);
4373 mresult = (struct encoding_map*)result;
4374 mresult->count2 = count2;
4375 mresult->count3 = count3;
4376 mlevel1 = mresult->level1;
4377 mlevel2 = mresult->level23;
4378 mlevel3 = mresult->level23 + 16*count2;
4379 memcpy(mlevel1, level1, 32);
4380 memset(mlevel2, 0xFF, 16*count2);
4381 memset(mlevel3, 0, 128*count3);
4382 count3 = 0;
4383 for (i = 1; i < 256; i++) {
4384 int o1, o2, o3, i2, i3;
4385 if (decode[i] == 0xFFFE)
4386 /* unmapped character */
4387 continue;
4388 o1 = decode[i]>>11;
4389 o2 = (decode[i]>>7) & 0xF;
4390 i2 = 16*mlevel1[o1] + o2;
4391 if (mlevel2[i2] == 0xFF)
4392 mlevel2[i2] = count3++;
4393 o3 = decode[i] & 0x7F;
4394 i3 = 128*mlevel2[i2] + o3;
4395 mlevel3[i3] = i;
4396 }
4397 return result;
4398}
4399
4400static int
4401encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4402{
4403 struct encoding_map *map = (struct encoding_map*)mapping;
4404 int l1 = c>>11;
4405 int l2 = (c>>7) & 0xF;
4406 int l3 = c & 0x7F;
4407 int i;
4408
4409#ifdef Py_UNICODE_WIDE
4410 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004411 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004412 }
4413#endif
4414 if (c == 0)
4415 return 0;
4416 /* level 1*/
4417 i = map->level1[l1];
4418 if (i == 0xFF) {
4419 return -1;
4420 }
4421 /* level 2*/
4422 i = map->level23[16*i+l2];
4423 if (i == 0xFF) {
4424 return -1;
4425 }
4426 /* level 3 */
4427 i = map->level23[16*map->count2 + 128*i + l3];
4428 if (i == 0) {
4429 return -1;
4430 }
4431 return i;
4432}
4433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434/* Lookup the character ch in the mapping. If the character
4435 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004436 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 PyObject *w = PyInt_FromLong((long)c);
4440 PyObject *x;
4441
4442 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 x = PyObject_GetItem(mapping, w);
4445 Py_DECREF(w);
4446 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004447 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4448 /* No mapping found means: mapping is undefined. */
4449 PyErr_Clear();
4450 x = Py_None;
4451 Py_INCREF(x);
4452 return x;
4453 } else
4454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004456 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004457 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004459 long value = PyInt_AS_LONG(x);
4460 if (value < 0 || value > 255) {
4461 PyErr_SetString(PyExc_TypeError,
4462 "character mapping must be in range(256)");
4463 Py_DECREF(x);
4464 return NULL;
4465 }
4466 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004468 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004469 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004471 /* wrong return value */
4472 PyErr_SetString(PyExc_TypeError,
4473 "character mapping must return integer, None or str");
4474 Py_DECREF(x);
4475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
4477}
4478
Martin v. Löwis3f767792006-06-04 19:36:28 +00004479static int
4480charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4481{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004482 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4483 /* exponentially overallocate to minimize reallocations */
4484 if (requiredsize < 2*outsize)
4485 requiredsize = 2*outsize;
4486 if (_PyString_Resize(outobj, requiredsize)) {
4487 return 0;
4488 }
4489 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004490}
4491
/* Outcome of writing one character to the charmap encoder's output. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred (lookup or resize failed) */
} charmapencode_result;
/* Look up the character c in the mapping, put the result in the output
   string *outobj at position *outpos and advance *outpos.  The output
   string is reallocated if not enough space is available.  Returns
   enc_SUCCESS if the character was written, enc_FAILED if the mapping
   is undefined for c (in which case no character was written), or
   enc_EXCEPTION if the lookup or a reallocation raised an error. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004502charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004503 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505 PyObject *rep;
4506 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004507 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508
Christian Heimese93237d2007-12-19 02:37:44 +00004509 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004510 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004511 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512 if (res == -1)
4513 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004514 if (outsize<requiredsize)
4515 if (!charmapencode_resize(outobj, outpos, requiredsize))
4516 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004517 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 outstart[(*outpos)++] = (char)res;
4519 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004520 }
4521
4522 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004524 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004525 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004526 Py_DECREF(rep);
4527 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004528 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 if (PyInt_Check(rep)) {
4530 Py_ssize_t requiredsize = *outpos+1;
4531 if (outsize<requiredsize)
4532 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4533 Py_DECREF(rep);
4534 return enc_EXCEPTION;
4535 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004536 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004537 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004538 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004539 else {
4540 const char *repchars = PyString_AS_STRING(rep);
4541 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4542 Py_ssize_t requiredsize = *outpos+repsize;
4543 if (outsize<requiredsize)
4544 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4545 Py_DECREF(rep);
4546 return enc_EXCEPTION;
4547 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004548 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004549 memcpy(outstart + *outpos, repchars, repsize);
4550 *outpos += repsize;
4551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 }
Georg Brandl9f167602006-06-04 21:46:16 +00004553 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004554 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555}
4556
4557/* handle an error in PyUnicode_EncodeCharmap
4558 Return 0 on success, -1 on error */
4559static
4560int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004563 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565{
4566 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t repsize;
4568 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 Py_UNICODE *uni2;
4570 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004571 Py_ssize_t collstartpos = *inpos;
4572 Py_ssize_t collendpos = *inpos+1;
4573 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 char *encoding = "charmap";
4575 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004576 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 /* find all unencodable characters */
4579 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004580 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004581 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004582 int res = encoding_map_lookup(p[collendpos], mapping);
4583 if (res != -1)
4584 break;
4585 ++collendpos;
4586 continue;
4587 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004588
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004589 rep = charmapencode_lookup(p[collendpos], mapping);
4590 if (rep==NULL)
4591 return -1;
4592 else if (rep!=Py_None) {
4593 Py_DECREF(rep);
4594 break;
4595 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004596 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004597 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 }
4599 /* cache callback name lookup
4600 * (if not done yet, i.e. it's the first error) */
4601 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004602 if ((errors==NULL) || (!strcmp(errors, "strict")))
4603 *known_errorHandler = 1;
4604 else if (!strcmp(errors, "replace"))
4605 *known_errorHandler = 2;
4606 else if (!strcmp(errors, "ignore"))
4607 *known_errorHandler = 3;
4608 else if (!strcmp(errors, "xmlcharrefreplace"))
4609 *known_errorHandler = 4;
4610 else
4611 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 }
4613 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004614 case 1: /* strict */
4615 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4616 return -1;
4617 case 2: /* replace */
4618 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004619 x = charmapencode_output('?', mapping, res, respos);
4620 if (x==enc_EXCEPTION) {
4621 return -1;
4622 }
4623 else if (x==enc_FAILED) {
4624 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4625 return -1;
4626 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004627 }
4628 /* fall through */
4629 case 3: /* ignore */
4630 *inpos = collendpos;
4631 break;
4632 case 4: /* xmlcharrefreplace */
4633 /* generate replacement (temporarily (mis)uses p) */
4634 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004635 char buffer[2+29+1+1];
4636 char *cp;
4637 sprintf(buffer, "&#%d;", (int)p[collpos]);
4638 for (cp = buffer; *cp; ++cp) {
4639 x = charmapencode_output(*cp, mapping, res, respos);
4640 if (x==enc_EXCEPTION)
4641 return -1;
4642 else if (x==enc_FAILED) {
4643 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4644 return -1;
4645 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004646 }
4647 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004648 *inpos = collendpos;
4649 break;
4650 default:
4651 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004652 encoding, reason, p, size, exceptionObject,
4653 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004654 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004655 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004656 /* generate replacement */
4657 repsize = PyUnicode_GET_SIZE(repunicode);
4658 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004659 x = charmapencode_output(*uni2, mapping, res, respos);
4660 if (x==enc_EXCEPTION) {
4661 return -1;
4662 }
4663 else if (x==enc_FAILED) {
4664 Py_DECREF(repunicode);
4665 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4666 return -1;
4667 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004668 }
4669 *inpos = newpos;
4670 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671 }
4672 return 0;
4673}
4674
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004676 Py_ssize_t size,
4677 PyObject *mapping,
4678 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 /* output object */
4681 PyObject *res = NULL;
4682 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004683 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004685 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 PyObject *errorHandler = NULL;
4687 PyObject *exc = NULL;
4688 /* the following variable is used for caching string comparisons
4689 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4690 * 3=ignore, 4=xmlcharrefreplace */
4691 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
4693 /* Default to Latin-1 */
4694 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004695 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 /* allocate enough for a simple encoding without
4698 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004699 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 if (res == NULL)
4701 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004702 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004703 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004706 /* try to encode it */
4707 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4708 if (x==enc_EXCEPTION) /* error */
4709 goto onError;
4710 if (x==enc_FAILED) { /* unencodable character */
4711 if (charmap_encoding_error(p, size, &inpos, mapping,
4712 &exc,
4713 &known_errorHandler, &errorHandler, errors,
4714 &res, &respos)) {
4715 goto onError;
4716 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004717 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004718 else
4719 /* done with this character => adjust input position */
4720 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004724 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 if (_PyString_Resize(&res, respos))
4726 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 }
4728 Py_XDECREF(exc);
4729 Py_XDECREF(errorHandler);
4730 return res;
4731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(res);
4734 Py_XDECREF(exc);
4735 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 return NULL;
4737}
4738
4739PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741{
4742 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 PyErr_BadArgument();
4744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 }
4746 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 PyUnicode_GET_SIZE(unicode),
4748 mapping,
4749 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750}
4751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752/* create or adjust a UnicodeTranslateError */
4753static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004754 const Py_UNICODE *unicode, Py_ssize_t size,
4755 Py_ssize_t startpos, Py_ssize_t endpos,
4756 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004759 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004760 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 }
4762 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004763 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4764 goto onError;
4765 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4766 goto onError;
4767 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4768 goto onError;
4769 return;
4770 onError:
4771 Py_DECREF(*exceptionObject);
4772 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 }
4774}
4775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776/* raises a UnicodeTranslateError */
4777static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004778 const Py_UNICODE *unicode, Py_ssize_t size,
4779 Py_ssize_t startpos, Py_ssize_t endpos,
4780 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781{
4782 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004783 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004785 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786}
4787
4788/* error handling callback helper:
4789 build arguments, call the callback and check the arguments,
4790 put the result into newpos and return the replacement string, which
4791 has to be freed by the caller */
4792static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004793 PyObject **errorHandler,
4794 const char *reason,
4795 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4796 Py_ssize_t startpos, Py_ssize_t endpos,
4797 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004799 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800
Martin v. Löwis412fb672006-04-13 06:34:32 +00004801 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 PyObject *restuple;
4803 PyObject *resunicode;
4804
4805 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 }
4810
4811 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815
4816 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004817 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004821 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 Py_DECREF(restuple);
4823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
4825 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004826 &resunicode, &i_newpos)) {
4827 Py_DECREF(restuple);
4828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004830 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 else
4833 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004834 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004835 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4836 Py_DECREF(restuple);
4837 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 Py_INCREF(resunicode);
4840 Py_DECREF(restuple);
4841 return resunicode;
4842}
4843
4844/* Lookup the character ch in the mapping and put the result in result,
4845 which must be decrefed by the caller.
4846 Return 0 on success, -1 on error */
4847static
4848int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4849{
4850 PyObject *w = PyInt_FromLong((long)c);
4851 PyObject *x;
4852
4853 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 x = PyObject_GetItem(mapping, w);
4856 Py_DECREF(w);
4857 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004858 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4859 /* No mapping found means: use 1:1 mapping. */
4860 PyErr_Clear();
4861 *result = NULL;
4862 return 0;
4863 } else
4864 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 }
4866 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004867 *result = x;
4868 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 }
4870 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004871 long value = PyInt_AS_LONG(x);
4872 long max = PyUnicode_GetMax();
4873 if (value < 0 || value > max) {
4874 PyErr_Format(PyExc_TypeError,
4875 "character mapping must be in range(0x%lx)", max+1);
4876 Py_DECREF(x);
4877 return -1;
4878 }
4879 *result = x;
4880 return 0;
4881 }
4882 else if (PyUnicode_Check(x)) {
4883 *result = x;
4884 return 0;
4885 }
4886 else {
4887 /* wrong return value */
4888 PyErr_SetString(PyExc_TypeError,
4889 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004890 Py_DECREF(x);
4891 return -1;
4892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893}
4894/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 if not reallocate and adjust various state variables.
4896 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897static
Walter Dörwald4894c302003-10-24 14:25:28 +00004898int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004901 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004902 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004903 /* remember old output position */
4904 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4905 /* exponentially overallocate to minimize reallocations */
4906 if (requiredsize < 2 * oldsize)
4907 requiredsize = 2 * oldsize;
4908 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4909 return -1;
4910 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 }
4912 return 0;
4913}
4914/* lookup the character, put the result in the output string and adjust
4915 various state variables. Return a new reference to the object that
4916 was put in the output buffer in *result, or Py_None, if the mapping was
4917 undefined (in which case no character was written).
4918 The called must decref result.
4919 Return 0 on success, -1 on error. */
4920static
Walter Dörwald4894c302003-10-24 14:25:28 +00004921int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4923 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924{
Walter Dörwald4894c302003-10-24 14:25:28 +00004925 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004926 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 /* not found => default to 1:1 mapping */
4929 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 }
4931 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004932 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004934 /* no overflow check, because we know that the space is enough */
4935 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 }
4937 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004938 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4939 if (repsize==1) {
4940 /* no overflow check, because we know that the space is enough */
4941 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4942 }
4943 else if (repsize!=0) {
4944 /* more than one character */
4945 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4946 (insize - (curinp-startinp)) +
4947 repsize - 1;
4948 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4949 return -1;
4950 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4951 *outp += repsize;
4952 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 }
4954 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 return 0;
4957}
4958
4959PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004960 Py_ssize_t size,
4961 PyObject *mapping,
4962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 /* output object */
4965 PyObject *res = NULL;
4966 /* pointers to the beginning and end+1 of input */
4967 const Py_UNICODE *startp = p;
4968 const Py_UNICODE *endp = p + size;
4969 /* pointer into the output */
4970 Py_UNICODE *str;
4971 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 char *reason = "character maps to <undefined>";
4974 PyObject *errorHandler = NULL;
4975 PyObject *exc = NULL;
4976 /* the following variable is used for caching string comparisons
4977 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4978 * 3=ignore, 4=xmlcharrefreplace */
4979 int known_errorHandler = -1;
4980
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 PyErr_BadArgument();
4983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985
4986 /* allocate enough for a simple 1:1 translation without
4987 replacements, if we need more, we'll resize */
4988 res = PyUnicode_FromUnicode(NULL, size);
4989 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 /* try to encode it */
4997 PyObject *x = NULL;
4998 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4999 Py_XDECREF(x);
5000 goto onError;
5001 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005002 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005003 if (x!=Py_None) /* it worked => adjust input pointer */
5004 ++p;
5005 else { /* untranslatable character */
5006 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5007 Py_ssize_t repsize;
5008 Py_ssize_t newpos;
5009 Py_UNICODE *uni2;
5010 /* startpos for collecting untranslatable chars */
5011 const Py_UNICODE *collstart = p;
5012 const Py_UNICODE *collend = p+1;
5013 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005015 /* find all untranslatable characters */
5016 while (collend < endp) {
5017 if (charmaptranslate_lookup(*collend, mapping, &x))
5018 goto onError;
5019 Py_XDECREF(x);
5020 if (x!=Py_None)
5021 break;
5022 ++collend;
5023 }
5024 /* cache callback name lookup
5025 * (if not done yet, i.e. it's the first error) */
5026 if (known_errorHandler==-1) {
5027 if ((errors==NULL) || (!strcmp(errors, "strict")))
5028 known_errorHandler = 1;
5029 else if (!strcmp(errors, "replace"))
5030 known_errorHandler = 2;
5031 else if (!strcmp(errors, "ignore"))
5032 known_errorHandler = 3;
5033 else if (!strcmp(errors, "xmlcharrefreplace"))
5034 known_errorHandler = 4;
5035 else
5036 known_errorHandler = 0;
5037 }
5038 switch (known_errorHandler) {
5039 case 1: /* strict */
5040 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005041 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005042 case 2: /* replace */
5043 /* No need to check for space, this is a 1:1 replacement */
5044 for (coll = collstart; coll<collend; ++coll)
5045 *str++ = '?';
5046 /* fall through */
5047 case 3: /* ignore */
5048 p = collend;
5049 break;
5050 case 4: /* xmlcharrefreplace */
5051 /* generate replacement (temporarily (mis)uses p) */
5052 for (p = collstart; p < collend; ++p) {
5053 char buffer[2+29+1+1];
5054 char *cp;
5055 sprintf(buffer, "&#%d;", (int)*p);
5056 if (charmaptranslate_makespace(&res, &str,
5057 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5058 goto onError;
5059 for (cp = buffer; *cp; ++cp)
5060 *str++ = *cp;
5061 }
5062 p = collend;
5063 break;
5064 default:
5065 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5066 reason, startp, size, &exc,
5067 collstart-startp, collend-startp, &newpos);
5068 if (repunicode == NULL)
5069 goto onError;
5070 /* generate replacement */
5071 repsize = PyUnicode_GET_SIZE(repunicode);
5072 if (charmaptranslate_makespace(&res, &str,
5073 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5074 Py_DECREF(repunicode);
5075 goto onError;
5076 }
5077 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5078 *str++ = *uni2;
5079 p = startp + newpos;
5080 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005081 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005082 }
5083 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 /* Resize if we allocated to much */
5085 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005086 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005087 if (PyUnicode_Resize(&res, respos) < 0)
5088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 }
5090 Py_XDECREF(exc);
5091 Py_XDECREF(errorHandler);
5092 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005094 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005095 Py_XDECREF(res);
5096 Py_XDECREF(exc);
5097 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return NULL;
5099}
5100
5101PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005102 PyObject *mapping,
5103 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104{
5105 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 str = PyUnicode_FromObject(str);
5108 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005111 PyUnicode_GET_SIZE(str),
5112 mapping,
5113 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 Py_DECREF(str);
5115 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005116
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005117 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 Py_XDECREF(str);
5119 return NULL;
5120}
Tim Petersced69f82003-09-16 20:30:58 +00005121
Guido van Rossum9e896b32000-04-05 20:11:21 +00005122/* --- Decimal Encoder ---------------------------------------------------- */
5123
5124int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005125 Py_ssize_t length,
5126 char *output,
5127 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005128{
5129 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 PyObject *errorHandler = NULL;
5131 PyObject *exc = NULL;
5132 const char *encoding = "decimal";
5133 const char *reason = "invalid decimal Unicode string";
5134 /* the following variable is used for caching string comparisons
5135 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5136 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005137
5138 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 PyErr_BadArgument();
5140 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005141 }
5142
5143 p = s;
5144 end = s + length;
5145 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005146 register Py_UNICODE ch = *p;
5147 int decimal;
5148 PyObject *repunicode;
5149 Py_ssize_t repsize;
5150 Py_ssize_t newpos;
5151 Py_UNICODE *uni2;
5152 Py_UNICODE *collstart;
5153 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005154
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005155 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005156 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005157 ++p;
5158 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005159 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005160 decimal = Py_UNICODE_TODECIMAL(ch);
5161 if (decimal >= 0) {
5162 *output++ = '0' + decimal;
5163 ++p;
5164 continue;
5165 }
5166 if (0 < ch && ch < 256) {
5167 *output++ = (char)ch;
5168 ++p;
5169 continue;
5170 }
5171 /* All other characters are considered unencodable */
5172 collstart = p;
5173 collend = p+1;
5174 while (collend < end) {
5175 if ((0 < *collend && *collend < 256) ||
5176 !Py_UNICODE_ISSPACE(*collend) ||
5177 Py_UNICODE_TODECIMAL(*collend))
5178 break;
5179 }
5180 /* cache callback name lookup
5181 * (if not done yet, i.e. it's the first error) */
5182 if (known_errorHandler==-1) {
5183 if ((errors==NULL) || (!strcmp(errors, "strict")))
5184 known_errorHandler = 1;
5185 else if (!strcmp(errors, "replace"))
5186 known_errorHandler = 2;
5187 else if (!strcmp(errors, "ignore"))
5188 known_errorHandler = 3;
5189 else if (!strcmp(errors, "xmlcharrefreplace"))
5190 known_errorHandler = 4;
5191 else
5192 known_errorHandler = 0;
5193 }
5194 switch (known_errorHandler) {
5195 case 1: /* strict */
5196 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5197 goto onError;
5198 case 2: /* replace */
5199 for (p = collstart; p < collend; ++p)
5200 *output++ = '?';
5201 /* fall through */
5202 case 3: /* ignore */
5203 p = collend;
5204 break;
5205 case 4: /* xmlcharrefreplace */
5206 /* generate replacement (temporarily (mis)uses p) */
5207 for (p = collstart; p < collend; ++p)
5208 output += sprintf(output, "&#%d;", (int)*p);
5209 p = collend;
5210 break;
5211 default:
5212 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5213 encoding, reason, s, length, &exc,
5214 collstart-s, collend-s, &newpos);
5215 if (repunicode == NULL)
5216 goto onError;
5217 /* generate replacement */
5218 repsize = PyUnicode_GET_SIZE(repunicode);
5219 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5220 Py_UNICODE ch = *uni2;
5221 if (Py_UNICODE_ISSPACE(ch))
5222 *output++ = ' ';
5223 else {
5224 decimal = Py_UNICODE_TODECIMAL(ch);
5225 if (decimal >= 0)
5226 *output++ = '0' + decimal;
5227 else if (0 < ch && ch < 256)
5228 *output++ = (char)ch;
5229 else {
5230 Py_DECREF(repunicode);
5231 raise_encode_exception(&exc, encoding,
5232 s, length, collstart-s, collend-s, reason);
5233 goto onError;
5234 }
5235 }
5236 }
5237 p = s + newpos;
5238 Py_DECREF(repunicode);
5239 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005240 }
5241 /* 0-terminate the output string */
5242 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 Py_XDECREF(exc);
5244 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005245 return 0;
5246
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005247 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 Py_XDECREF(exc);
5249 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005250 return -1;
5251}
5252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253/* --- Helpers ------------------------------------------------------------ */
5254
Eric Smitha9f7d622008-02-17 19:46:49 +00005255#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005256#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005257
5258#include "stringlib/count.h"
5259#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005260#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005261#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005262
/* Helper macro to fix up start/end slice values in place.

   start and end must be modifiable lvalues; len is the sequence length.
   end is clipped to len; a negative end or start is taken relative to
   len and then clamped at 0.  start is NOT clipped to len, so after the
   adjustment start may still be greater than end (or than len) -- callers
   have to treat that as an empty span.

   The arguments are evaluated more than once, so they must be free of
   side effects.  The body is wrapped in do { } while (0) so the macro
   expands to exactly one statement and is safe in unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005277
Martin v. Löwis18e16552006-02-15 17:27:45 +00005278Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005279 PyObject *substr,
5280 Py_ssize_t start,
5281 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005283 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005284 PyUnicodeObject* str_obj;
5285 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005286
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005287 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5288 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005289 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005290 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5291 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005292 Py_DECREF(str_obj);
5293 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 }
Tim Petersced69f82003-09-16 20:30:58 +00005295
Antoine Pitrou64672132010-01-13 07:55:48 +00005296 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005297 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005298 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5299 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005300 );
5301
5302 Py_DECREF(sub_obj);
5303 Py_DECREF(str_obj);
5304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 return result;
5306}
5307
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005309 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005310 Py_ssize_t start,
5311 Py_ssize_t end,
5312 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005314 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005315
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005316 str = PyUnicode_FromObject(str);
5317 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005318 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005319 sub = PyUnicode_FromObject(sub);
5320 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005321 Py_DECREF(str);
5322 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 }
Tim Petersced69f82003-09-16 20:30:58 +00005324
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005325 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005326 result = stringlib_find_slice(
5327 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5328 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5329 start, end
5330 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005331 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005332 result = stringlib_rfind_slice(
5333 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5334 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5335 start, end
5336 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005337
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005338 Py_DECREF(str);
5339 Py_DECREF(sub);
5340
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 return result;
5342}
5343
Tim Petersced69f82003-09-16 20:30:58 +00005344static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005346 PyUnicodeObject *substring,
5347 Py_ssize_t start,
5348 Py_ssize_t end,
5349 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 if (substring->length == 0)
5352 return 1;
5353
Antoine Pitrou64672132010-01-13 07:55:48 +00005354 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 end -= substring->length;
5356 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005357 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
5359 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005360 if (Py_UNICODE_MATCH(self, end, substring))
5361 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 } else {
5363 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 }
5366
5367 return 0;
5368}
5369
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005371 PyObject *substr,
5372 Py_ssize_t start,
5373 Py_ssize_t end,
5374 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005377
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 str = PyUnicode_FromObject(str);
5379 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 substr = PyUnicode_FromObject(substr);
5382 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 Py_DECREF(str);
5384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005388 (PyUnicodeObject *)substr,
5389 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 Py_DECREF(str);
5391 Py_DECREF(substr);
5392 return result;
5393}
5394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395/* Apply fixfct filter to the Unicode object self and return a
5396 reference to the modified object */
5397
Tim Petersced69f82003-09-16 20:30:58 +00005398static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005400 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
5402
5403 PyUnicodeObject *u;
5404
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005405 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005407 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005408
5409 Py_UNICODE_COPY(u->str, self->str, self->length);
5410
Tim Peters7a29bd52001-09-12 03:03:31 +00005411 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005412 /* fixfct should return TRUE if it modified the buffer. If
5413 FALSE, return a reference to the original buffer instead
5414 (to save space, not time) */
5415 Py_INCREF(self);
5416 Py_DECREF(u);
5417 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 }
5419 return (PyObject*) u;
5420}
5421
Tim Petersced69f82003-09-16 20:30:58 +00005422static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423int fixupper(PyUnicodeObject *self)
5424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 Py_UNICODE *s = self->str;
5427 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005428
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005430 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005431
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005432 ch = Py_UNICODE_TOUPPER(*s);
5433 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005435 *s = ch;
5436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 s++;
5438 }
5439
5440 return status;
5441}
5442
Tim Petersced69f82003-09-16 20:30:58 +00005443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444int fixlower(PyUnicodeObject *self)
5445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 Py_UNICODE *s = self->str;
5448 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005453 ch = Py_UNICODE_TOLOWER(*s);
5454 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005456 *s = ch;
5457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 s++;
5459 }
5460
5461 return status;
5462}
5463
Tim Petersced69f82003-09-16 20:30:58 +00005464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465int fixswapcase(PyUnicodeObject *self)
5466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 Py_UNICODE *s = self->str;
5469 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005470
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 while (len-- > 0) {
5472 if (Py_UNICODE_ISUPPER(*s)) {
5473 *s = Py_UNICODE_TOLOWER(*s);
5474 status = 1;
5475 } else if (Py_UNICODE_ISLOWER(*s)) {
5476 *s = Py_UNICODE_TOUPPER(*s);
5477 status = 1;
5478 }
5479 s++;
5480 }
5481
5482 return status;
5483}
5484
Tim Petersced69f82003-09-16 20:30:58 +00005485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486int fixcapitalize(PyUnicodeObject *self)
5487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005489 Py_UNICODE *s = self->str;
5490 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005491
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005492 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005493 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005494 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005495 *s = Py_UNICODE_TOUPPER(*s);
5496 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005498 s++;
5499 while (--len > 0) {
5500 if (Py_UNICODE_ISUPPER(*s)) {
5501 *s = Py_UNICODE_TOLOWER(*s);
5502 status = 1;
5503 }
5504 s++;
5505 }
5506 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507}
5508
5509static
5510int fixtitle(PyUnicodeObject *self)
5511{
5512 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5513 register Py_UNICODE *e;
5514 int previous_is_cased;
5515
5516 /* Shortcut for single character strings */
5517 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005518 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5519 if (*p != ch) {
5520 *p = ch;
5521 return 1;
5522 }
5523 else
5524 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Tim Petersced69f82003-09-16 20:30:58 +00005526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 e = p + PyUnicode_GET_SIZE(self);
5528 previous_is_cased = 0;
5529 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005530 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005531
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005532 if (previous_is_cased)
5533 *p = Py_UNICODE_TOLOWER(ch);
5534 else
5535 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005536
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005537 if (Py_UNICODE_ISLOWER(ch) ||
5538 Py_UNICODE_ISUPPER(ch) ||
5539 Py_UNICODE_ISTITLE(ch))
5540 previous_is_cased = 1;
5541 else
5542 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 }
5544 return 1;
5545}
5546
Tim Peters8ce9f162004-08-27 01:49:32 +00005547PyObject *
5548PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
Tim Peters8ce9f162004-08-27 01:49:32 +00005550 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005551 const Py_UNICODE blank = ' ';
5552 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005553 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005554 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005555 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5556 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5558 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005559 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005560 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005561 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Tim Peters05eba1f2004-08-27 21:32:02 +00005563 fseq = PySequence_Fast(seq, "");
5564 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005565 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005566 }
5567
Tim Peters91879ab2004-08-27 22:35:44 +00005568 /* Grrrr. A codec may be invoked to convert str objects to
5569 * Unicode, and so it's possible to call back into Python code
5570 * during PyUnicode_FromObject(), and so it's possible for a sick
5571 * codec to change the size of fseq (if seq is a list). Therefore
5572 * we have to keep refetching the size -- can't assume seqlen
5573 * is invariant.
5574 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 seqlen = PySequence_Fast_GET_SIZE(fseq);
5576 /* If empty sequence, return u"". */
5577 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005578 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5579 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005580 }
5581 /* If singleton sequence with an exact Unicode, return that. */
5582 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005583 item = PySequence_Fast_GET_ITEM(fseq, 0);
5584 if (PyUnicode_CheckExact(item)) {
5585 Py_INCREF(item);
5586 res = (PyUnicodeObject *)item;
5587 goto Done;
5588 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005589 }
5590
Tim Peters05eba1f2004-08-27 21:32:02 +00005591 /* At least two items to join, or one that isn't exact Unicode. */
5592 if (seqlen > 1) {
5593 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005594 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005595 sep = &blank;
5596 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005597 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005598 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005599 internal_separator = PyUnicode_FromObject(separator);
5600 if (internal_separator == NULL)
5601 goto onError;
5602 sep = PyUnicode_AS_UNICODE(internal_separator);
5603 seplen = PyUnicode_GET_SIZE(internal_separator);
5604 /* In case PyUnicode_FromObject() mutated seq. */
5605 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 }
5607 }
5608
5609 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005610 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005611 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005612 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005613 res_p = PyUnicode_AS_UNICODE(res);
5614 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005615
Tim Peters05eba1f2004-08-27 21:32:02 +00005616 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005617 Py_ssize_t itemlen;
5618 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005619
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005620 item = PySequence_Fast_GET_ITEM(fseq, i);
5621 /* Convert item to Unicode. */
5622 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5623 PyErr_Format(PyExc_TypeError,
5624 "sequence item %zd: expected string or Unicode,"
5625 " %.80s found",
5626 i, Py_TYPE(item)->tp_name);
5627 goto onError;
5628 }
5629 item = PyUnicode_FromObject(item);
5630 if (item == NULL)
5631 goto onError;
5632 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005633
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 /* In case PyUnicode_FromObject() mutated seq. */
5635 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005636
Tim Peters8ce9f162004-08-27 01:49:32 +00005637 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005638 itemlen = PyUnicode_GET_SIZE(item);
5639 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005640 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005641 goto Overflow;
5642 if (i < seqlen - 1) {
5643 new_res_used += seplen;
5644 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005645 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005646 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005647 if (new_res_used > res_alloc) {
5648 /* double allocated size until it's big enough */
5649 do {
5650 res_alloc += res_alloc;
5651 if (res_alloc <= 0)
5652 goto Overflow;
5653 } while (new_res_used > res_alloc);
5654 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5655 Py_DECREF(item);
5656 goto onError;
5657 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005658 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005659 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005660
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005661 /* Copy item, and maybe the separator. */
5662 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5663 res_p += itemlen;
5664 if (i < seqlen - 1) {
5665 Py_UNICODE_COPY(res_p, sep, seplen);
5666 res_p += seplen;
5667 }
5668 Py_DECREF(item);
5669 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005670 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005671
Tim Peters05eba1f2004-08-27 21:32:02 +00005672 /* Shrink res to match the used area; this probably can't fail,
5673 * but it's cheap to check.
5674 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005675 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005676 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005677
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005678 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005680 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return (PyObject *)res;
5682
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005683 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005684 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005685 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 Py_DECREF(item);
5687 /* fall through */
5688
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005689 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005690 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005691 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005692 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return NULL;
5694}
5695
Tim Petersced69f82003-09-16 20:30:58 +00005696static
5697PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005698 Py_ssize_t left,
5699 Py_ssize_t right,
5700 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
5702 PyUnicodeObject *u;
5703
5704 if (left < 0)
5705 left = 0;
5706 if (right < 0)
5707 right = 0;
5708
Tim Peters7a29bd52001-09-12 03:03:31 +00005709 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 Py_INCREF(self);
5711 return self;
5712 }
5713
Neal Norwitze7d8be82008-07-31 17:17:14 +00005714 if (left > PY_SSIZE_T_MAX - self->length ||
5715 right > PY_SSIZE_T_MAX - (left + self->length)) {
5716 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5717 return NULL;
5718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 u = _PyUnicode_New(left + self->length + right);
5720 if (u) {
5721 if (left)
5722 Py_UNICODE_FILL(u->str, fill, left);
5723 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5724 if (right)
5725 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5726 }
5727
5728 return u;
5729}
5730
Antoine Pitrou64672132010-01-13 07:55:48 +00005731PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
5735 string = PyUnicode_FromObject(string);
5736 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Antoine Pitrou64672132010-01-13 07:55:48 +00005739 list = stringlib_splitlines(
5740 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5741 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
5743 Py_DECREF(string);
5744 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745}
5746
Tim Petersced69f82003-09-16 20:30:58 +00005747static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005749 PyUnicodeObject *substring,
5750 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005753 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005756 return stringlib_split_whitespace(
5757 (PyObject*) self, self->str, self->length, maxcount
5758 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Antoine Pitrou64672132010-01-13 07:55:48 +00005760 return stringlib_split(
5761 (PyObject*) self, self->str, self->length,
5762 substring->str, substring->length,
5763 maxcount
5764 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
Tim Petersced69f82003-09-16 20:30:58 +00005767static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005769 PyUnicodeObject *substring,
5770 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005773 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005776 return stringlib_rsplit_whitespace(
5777 (PyObject*) self, self->str, self->length, maxcount
5778 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779
Antoine Pitrou64672132010-01-13 07:55:48 +00005780 return stringlib_rsplit(
5781 (PyObject*) self, self->str, self->length,
5782 substring->str, substring->length,
5783 maxcount
5784 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785}
5786
5787static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005789 PyUnicodeObject *str1,
5790 PyUnicodeObject *str2,
5791 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792{
5793 PyUnicodeObject *u;
5794
5795 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005796 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005797 else if (maxcount == 0 || self->length == 0)
5798 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Fredrik Lundh347ee272006-05-24 16:35:18 +00005800 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005801 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005802 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005803 if (str1->length == 0)
5804 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005805 if (str1->length == 1) {
5806 /* replace characters */
5807 Py_UNICODE u1, u2;
5808 if (!findchar(self->str, self->length, str1->str[0]))
5809 goto nothing;
5810 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5811 if (!u)
5812 return NULL;
5813 Py_UNICODE_COPY(u->str, self->str, self->length);
5814 u1 = str1->str[0];
5815 u2 = str2->str[0];
5816 for (i = 0; i < u->length; i++)
5817 if (u->str[i] == u1) {
5818 if (--maxcount < 0)
5819 break;
5820 u->str[i] = u2;
5821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005823 i = stringlib_find(
5824 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005826 if (i < 0)
5827 goto nothing;
5828 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5829 if (!u)
5830 return NULL;
5831 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005832
5833 /* change everything in-place, starting with this one */
5834 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5835 i += str1->length;
5836
5837 while ( --maxcount > 0) {
5838 i = stringlib_find(self->str+i, self->length-i,
5839 str1->str, str1->length,
5840 i);
5841 if (i == -1)
5842 break;
5843 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5844 i += str1->length;
5845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005848
Brett Cannona7f13ee2010-05-04 01:16:51 +00005849 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005850 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 Py_UNICODE *p;
5852
5853 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005854 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5855 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005856 if (n == 0)
5857 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005858 /* new_size = self->length + n * (str2->length - str1->length)); */
5859 delta = (str2->length - str1->length);
5860 if (delta == 0) {
5861 new_size = self->length;
5862 } else {
5863 product = n * (str2->length - str1->length);
5864 if ((product / (str2->length - str1->length)) != n) {
5865 PyErr_SetString(PyExc_OverflowError,
5866 "replace string is too long");
5867 return NULL;
5868 }
5869 new_size = self->length + product;
5870 if (new_size < 0) {
5871 PyErr_SetString(PyExc_OverflowError,
5872 "replace string is too long");
5873 return NULL;
5874 }
5875 }
5876 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005877 if (!u)
5878 return NULL;
5879 i = 0;
5880 p = u->str;
5881 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005882 while (n-- > 0) {
5883 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005884 j = stringlib_find(self->str+i, self->length-i,
5885 str1->str, str1->length,
5886 i);
5887 if (j == -1)
5888 break;
5889 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005890 /* copy unchanged part [i:j] */
5891 Py_UNICODE_COPY(p, self->str+i, j-i);
5892 p += j - i;
5893 }
5894 /* copy substitution string */
5895 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005896 Py_UNICODE_COPY(p, str2->str, str2->length);
5897 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005898 }
5899 i = j + str1->length;
5900 }
5901 if (i < self->length)
5902 /* copy tail [i:] */
5903 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005904 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005905 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005906 while (n > 0) {
5907 Py_UNICODE_COPY(p, str2->str, str2->length);
5908 p += str2->length;
5909 if (--n <= 0)
5910 break;
5911 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005913 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 }
5915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005918 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005919 /* nothing to replace; return original string (when possible) */
5920 if (PyUnicode_CheckExact(self)) {
5921 Py_INCREF(self);
5922 return (PyObject *) self;
5923 }
5924 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925}
5926
5927/* --- Unicode Object Methods --------------------------------------------- */
5928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005929PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005930 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931\n\
5932Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005933characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934
5935static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005936unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 return fixup(self, fixtitle);
5939}
5940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005941PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005942 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943\n\
5944Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005945have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946
5947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005948unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 return fixup(self, fixcapitalize);
5951}
5952
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords().

   Splits self into a list of words, replaces every list entry with its
   fixcapitalize'd version, and joins the list back together.  Returns
   NULL if the split, any capitalization, or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the reference the list held on the old word */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5990
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005991/* Argument converter. Coerces to a single unicode character */
5992
5993static int
5994convert_uc(PyObject *obj, void *addr)
5995{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005996 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5997 PyObject *uniobj;
5998 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005999
Benjamin Peterson857ce152009-01-31 16:29:18 +00006000 uniobj = PyUnicode_FromObject(obj);
6001 if (uniobj == NULL) {
6002 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006003 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006004 return 0;
6005 }
6006 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6007 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006008 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006009 Py_DECREF(uniobj);
6010 return 0;
6011 }
6012 unistr = PyUnicode_AS_UNICODE(uniobj);
6013 *fillcharloc = unistr[0];
6014 Py_DECREF(uniobj);
6015 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006016}
6017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006019 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006021Return S centered in a Unicode string of length width. Padding is\n\
6022done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
6024static PyObject *
6025unicode_center(PyUnicodeObject *self, PyObject *args)
6026{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006027 Py_ssize_t marg, left;
6028 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006029 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
Thomas Woutersde017742006-02-16 19:34:37 +00006031 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 return NULL;
6033
Tim Peters7a29bd52001-09-12 03:03:31 +00006034 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 Py_INCREF(self);
6036 return (PyObject*) self;
6037 }
6038
6039 marg = width - self->length;
6040 left = marg / 2 + (marg & width & 1);
6041
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006042 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043}
6044
Marc-André Lemburge5034372000-08-08 08:04:29 +00006045#if 0
6046
6047/* This code should go into some future Unicode collation support
6048 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006049 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006050
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006051/* speedy UTF-16 code point order comparison */
6052/* gleaned from: */
6053/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6054
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006055static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006056{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006057 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006058 0, 0, 0, 0, 0, 0, 0, 0,
6059 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006060 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006061};
6062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063static int
6064unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6065{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006066 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 Py_UNICODE *s1 = str1->str;
6069 Py_UNICODE *s2 = str2->str;
6070
6071 len1 = str1->length;
6072 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006075 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006076
6077 c1 = *s1++;
6078 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006079
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006080 if (c1 > (1<<11) * 26)
6081 c1 += utf16Fixup[c1>>11];
6082 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006085
6086 if (c1 != c2)
6087 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006088
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006089 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
6091
6092 return (len1 < len2) ? -1 : (len1 != len2);
6093}
6094
Marc-André Lemburge5034372000-08-08 08:04:29 +00006095#else
6096
6097static int
6098unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6099{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006100 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006101
6102 Py_UNICODE *s1 = str1->str;
6103 Py_UNICODE *s2 = str2->str;
6104
6105 len1 = str1->length;
6106 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006107
Marc-André Lemburge5034372000-08-08 08:04:29 +00006108 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006109 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006110
Fredrik Lundh45714e92001-06-26 16:39:36 +00006111 c1 = *s1++;
6112 c2 = *s2++;
6113
6114 if (c1 != c2)
6115 return (c1 < c2) ? -1 : 1;
6116
Marc-André Lemburge5034372000-08-08 08:04:29 +00006117 len1--; len2--;
6118 }
6119
6120 return (len1 < len2) ? -1 : (len1 != len2);
6121}
6122
6123#endif
6124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006126 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
6128 PyUnicodeObject *u = NULL, *v = NULL;
6129 int result;
6130
6131 /* Coerce the two arguments */
6132 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6133 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006134 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6136 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Thomas Wouters7e474022000-07-16 12:04:32 +00006139 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006141 Py_DECREF(u);
6142 Py_DECREF(v);
6143 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 }
6145
6146 result = unicode_compare(u, v);
6147
6148 Py_DECREF(u);
6149 Py_DECREF(v);
6150 return result;
6151
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006152 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 Py_XDECREF(u);
6154 Py_XDECREF(v);
6155 return -1;
6156}
6157
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006158PyObject *PyUnicode_RichCompare(PyObject *left,
6159 PyObject *right,
6160 int op)
6161{
6162 int result;
6163
6164 result = PyUnicode_Compare(left, right);
6165 if (result == -1 && PyErr_Occurred())
6166 goto onError;
6167
6168 /* Convert the return value to a Boolean */
6169 switch (op) {
6170 case Py_EQ:
6171 result = (result == 0);
6172 break;
6173 case Py_NE:
6174 result = (result != 0);
6175 break;
6176 case Py_LE:
6177 result = (result <= 0);
6178 break;
6179 case Py_GE:
6180 result = (result >= 0);
6181 break;
6182 case Py_LT:
6183 result = (result == -1);
6184 break;
6185 case Py_GT:
6186 result = (result == 1);
6187 break;
6188 }
6189 return PyBool_FromLong(result);
6190
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006191 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006192
6193 /* Standard case
6194
6195 Type errors mean that PyUnicode_FromObject() could not convert
6196 one of the arguments (usually the right hand side) to Unicode,
6197 ie. we can't handle the comparison request. However, it is
6198 possible that the other object knows a comparison method, which
6199 is why we return Py_NotImplemented to give the other object a
6200 chance.
6201
6202 */
6203 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6204 PyErr_Clear();
6205 Py_INCREF(Py_NotImplemented);
6206 return Py_NotImplemented;
6207 }
6208 if (op != Py_EQ && op != Py_NE)
6209 return NULL;
6210
6211 /* Equality comparison.
6212
6213 This is a special case: we silence any PyExc_UnicodeDecodeError
6214 and instead turn it into a PyErr_UnicodeWarning.
6215
6216 */
6217 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6218 return NULL;
6219 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006220 if (PyErr_Warn(PyExc_UnicodeWarning,
6221 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006222 "Unicode equal comparison "
6223 "failed to convert both arguments to Unicode - "
6224 "interpreting them as being unequal" :
6225 "Unicode unequal comparison "
6226 "failed to convert both arguments to Unicode - "
6227 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006228 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006229 return NULL;
6230 result = (op == Py_NE);
6231 return PyBool_FromLong(result);
6232}
6233
Guido van Rossum403d68b2000-03-13 15:55:09 +00006234int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006235 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006236{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006237 PyObject *str, *sub;
6238 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006239
6240 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006241 sub = PyUnicode_FromObject(element);
6242 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006243 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006244 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006245
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006246 str = PyUnicode_FromObject(container);
6247 if (!str) {
6248 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006249 return -1;
6250 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006251
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006252 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006253
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006254 Py_DECREF(str);
6255 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006256
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006257 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006258}
6259
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260/* Concat to string or Unicode object giving a new Unicode object. */
6261
6262PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006263 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264{
6265 PyUnicodeObject *u = NULL, *v = NULL, *w;
6266
6267 /* Coerce the two arguments */
6268 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6269 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6272 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006273 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274
6275 /* Shortcuts */
6276 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006277 Py_DECREF(v);
6278 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 }
6280 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006281 Py_DECREF(u);
6282 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 }
6284
6285 /* Concat the two Unicode strings */
6286 w = _PyUnicode_New(u->length + v->length);
6287 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006288 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 Py_UNICODE_COPY(w->str, u->str, u->length);
6290 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6291
6292 Py_DECREF(u);
6293 Py_DECREF(v);
6294 return (PyObject *)w;
6295
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006296 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 Py_XDECREF(u);
6298 Py_XDECREF(v);
6299 return NULL;
6300}
6301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006302PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006303 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006305Return the number of non-overlapping occurrences of substring sub in\n\
6306Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006307interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
6309static PyObject *
6310unicode_count(PyUnicodeObject *self, PyObject *args)
6311{
6312 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006313 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006314 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 PyObject *result;
6316
Guido van Rossumb8872e62000-05-09 14:14:27 +00006317 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006318 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 return NULL;
6320
6321 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006322 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006324 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006325
Antoine Pitrou64672132010-01-13 07:55:48 +00006326 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006327 result = PyInt_FromSsize_t(
6328 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006329 substring->str, substring->length,
6330 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006331 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 return result;
6336}
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006339 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006341Encodes S using the codec registered for encoding. encoding defaults\n\
6342to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006343handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6345'xmlcharrefreplace' as well as any other name registered with\n\
6346codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
6348static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006349unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006351 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 char *encoding = NULL;
6353 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006354 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006355
Benjamin Peterson332d7212009-09-18 21:14:55 +00006356 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6357 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006359 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006360 if (v == NULL)
6361 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006362 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006363 PyErr_Format(PyExc_TypeError,
6364 "encoder did not return a string/unicode object "
6365 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006366 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006367 Py_DECREF(v);
6368 return NULL;
6369 }
6370 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006371
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006372 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006373 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006374}
6375
6376PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006377 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006378\n\
6379Decodes S using the codec registered for encoding. encoding defaults\n\
6380to the default encoding. errors may be given to set a different error\n\
6381handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6382a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6383as well as any other name registerd with codecs.register_error that is\n\
6384able to handle UnicodeDecodeErrors.");
6385
6386static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006387unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006388{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006389 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006390 char *encoding = NULL;
6391 char *errors = NULL;
6392 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006393
Benjamin Peterson332d7212009-09-18 21:14:55 +00006394 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6395 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396 return NULL;
6397 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006398 if (v == NULL)
6399 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006400 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006401 PyErr_Format(PyExc_TypeError,
6402 "decoder did not return a string/unicode object "
6403 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006404 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006405 Py_DECREF(v);
6406 return NULL;
6407 }
6408 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006409
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006410 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412}
6413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006415 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416\n\
6417Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006418If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
6420static PyObject*
6421unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6422{
6423 Py_UNICODE *e;
6424 Py_UNICODE *p;
6425 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006426 Py_UNICODE *qe;
6427 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 PyUnicodeObject *u;
6429 int tabsize = 8;
6430
6431 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
Thomas Wouters7e474022000-07-16 12:04:32 +00006434 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006435 i = 0; /* chars up to and including most recent \n or \r */
6436 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6437 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 for (p = self->str; p < e; p++)
6439 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006440 if (tabsize > 0) {
6441 incr = tabsize - (j % tabsize); /* cannot overflow */
6442 if (j > PY_SSIZE_T_MAX - incr)
6443 goto overflow1;
6444 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006445 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006448 if (j > PY_SSIZE_T_MAX - 1)
6449 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 j++;
6451 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006452 if (i > PY_SSIZE_T_MAX - j)
6453 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006455 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 }
6457 }
6458
Guido van Rossum5bdff602008-03-11 21:18:06 +00006459 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006460 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 /* Second pass: create output string and fill it */
6463 u = _PyUnicode_New(i + j);
6464 if (!u)
6465 return NULL;
6466
Guido van Rossum5bdff602008-03-11 21:18:06 +00006467 j = 0; /* same as in first pass */
6468 q = u->str; /* next output char */
6469 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
6471 for (p = self->str; p < e; p++)
6472 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006473 if (tabsize > 0) {
6474 i = tabsize - (j % tabsize);
6475 j += i;
6476 while (i--) {
6477 if (q >= qe)
6478 goto overflow2;
6479 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006480 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006481 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006482 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006483 else {
6484 if (q >= qe)
6485 goto overflow2;
6486 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006487 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 if (*p == '\n' || *p == '\r')
6489 j = 0;
6490 }
6491
6492 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006493
6494 overflow2:
6495 Py_DECREF(u);
6496 overflow1:
6497 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006502 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503\n\
6504Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006505such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506arguments start and end are interpreted as in slice notation.\n\
6507\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006508Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509
6510static PyObject *
6511unicode_find(PyUnicodeObject *self, PyObject *args)
6512{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006513 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006514 Py_ssize_t start;
6515 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006516 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Facundo Batista57d56692007-11-16 18:04:14 +00006518 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006521 result = stringlib_find_slice(
6522 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6523 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6524 start, end
6525 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
6527 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006528
6529 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
6532static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006533unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534{
6535 if (index < 0 || index >= self->length) {
6536 PyErr_SetString(PyExc_IndexError, "string index out of range");
6537 return NULL;
6538 }
6539
6540 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6541}
6542
6543static long
6544unicode_hash(PyUnicodeObject *self)
6545{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006546 /* Since Unicode objects compare equal to their ASCII string
6547 counterparts, they should use the individual character values
6548 as basis for their hash value. This is needed to assure that
6549 strings and Unicode objects behave in the same way as
6550 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
Martin v. Löwis18e16552006-02-15 17:27:45 +00006552 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006553 register Py_UNICODE *p;
6554 register long x;
6555
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006557 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006558 len = PyUnicode_GET_SIZE(self);
6559 p = PyUnicode_AS_UNICODE(self);
6560 x = *p << 7;
6561 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006562 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006563 x ^= PyUnicode_GET_SIZE(self);
6564 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006566 self->hash = x;
6567 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568}
6569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006570PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006571 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006573Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575static PyObject *
6576unicode_index(PyUnicodeObject *self, PyObject *args)
6577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006578 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006579 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006580 Py_ssize_t start;
6581 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
Facundo Batista57d56692007-11-16 18:04:14 +00006583 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006586 result = stringlib_find_slice(
6587 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6588 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6589 start, end
6590 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591
6592 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006593
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 if (result < 0) {
6595 PyErr_SetString(PyExc_ValueError, "substring not found");
6596 return NULL;
6597 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006598
Martin v. Löwis18e16552006-02-15 17:27:45 +00006599 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600}
6601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006603 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006605Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
6608static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006609unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
6611 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6612 register const Py_UNICODE *e;
6613 int cased;
6614
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 /* Shortcut for single character strings */
6616 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006617 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006619 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006620 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006621 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006622
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 e = p + PyUnicode_GET_SIZE(self);
6624 cased = 0;
6625 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006626 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006627
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006628 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6629 return PyBool_FromLong(0);
6630 else if (!cased && Py_UNICODE_ISLOWER(ch))
6631 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006633 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634}
6635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006637 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006639Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
6642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006643unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
6645 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6646 register const Py_UNICODE *e;
6647 int cased;
6648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 /* Shortcut for single character strings */
6650 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006651 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006653 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006654 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 e = p + PyUnicode_GET_SIZE(self);
6658 cased = 0;
6659 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006660 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006661
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006662 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6663 return PyBool_FromLong(0);
6664 else if (!cased && Py_UNICODE_ISUPPER(ch))
6665 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006667 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006671 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006673Return True if S is a titlecased string and there is at least one\n\
6674character in S, i.e. upper- and titlecase characters may only\n\
6675follow uncased characters and lowercase characters only cased ones.\n\
6676Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
6678static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006679unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680{
6681 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6682 register const Py_UNICODE *e;
6683 int cased, previous_is_cased;
6684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 /* Shortcut for single character strings */
6686 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006687 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6688 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006690 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006691 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006692 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006693
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 e = p + PyUnicode_GET_SIZE(self);
6695 cased = 0;
6696 previous_is_cased = 0;
6697 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006698 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006699
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006700 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6701 if (previous_is_cased)
6702 return PyBool_FromLong(0);
6703 previous_is_cased = 1;
6704 cased = 1;
6705 }
6706 else if (Py_UNICODE_ISLOWER(ch)) {
6707 if (!previous_is_cased)
6708 return PyBool_FromLong(0);
6709 previous_is_cased = 1;
6710 cased = 1;
6711 }
6712 else
6713 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006715 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716}
6717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006718PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006719 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006721Return True if all characters in S are whitespace\n\
6722and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
6724static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006725unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726{
6727 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6728 register const Py_UNICODE *e;
6729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 /* Shortcut for single character strings */
6731 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006732 Py_UNICODE_ISSPACE(*p))
6733 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006735 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006736 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006737 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006738
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 e = p + PyUnicode_GET_SIZE(self);
6740 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006741 if (!Py_UNICODE_ISSPACE(*p))
6742 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006744 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745}
6746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006747PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006748 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006749\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006750Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006751and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006752
6753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006754unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006755{
6756 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6757 register const Py_UNICODE *e;
6758
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006759 /* Shortcut for single character strings */
6760 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006761 Py_UNICODE_ISALPHA(*p))
6762 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006763
6764 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006765 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006766 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767
6768 e = p + PyUnicode_GET_SIZE(self);
6769 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006770 if (!Py_UNICODE_ISALPHA(*p))
6771 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006772 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006773 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006774}
6775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006776PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006777 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006779Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006780and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006781
6782static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006783unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006784{
6785 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6786 register const Py_UNICODE *e;
6787
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006788 /* Shortcut for single character strings */
6789 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006790 Py_UNICODE_ISALNUM(*p))
6791 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792
6793 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006794 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006795 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796
6797 e = p + PyUnicode_GET_SIZE(self);
6798 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 if (!Py_UNICODE_ISALNUM(*p))
6800 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006801 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006803}
6804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006805PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006806 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
6811static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006812unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813{
6814 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6815 register const Py_UNICODE *e;
6816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 /* Shortcut for single character strings */
6818 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006819 Py_UNICODE_ISDECIMAL(*p))
6820 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006822 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006823 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006824 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006825
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 e = p + PyUnicode_GET_SIZE(self);
6827 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006828 if (!Py_UNICODE_ISDECIMAL(*p))
6829 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006831 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832}
6833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006834PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006835 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006837Return True if all characters in S are digits\n\
6838and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
6840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006841unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842{
6843 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6844 register const Py_UNICODE *e;
6845
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 /* Shortcut for single character strings */
6847 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006848 Py_UNICODE_ISDIGIT(*p))
6849 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006851 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006852 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006853 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006854
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 e = p + PyUnicode_GET_SIZE(self);
6856 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006857 if (!Py_UNICODE_ISDIGIT(*p))
6858 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861}
6862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006863PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006864 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006867False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868
6869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006870unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871{
6872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6873 register const Py_UNICODE *e;
6874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 /* Shortcut for single character strings */
6876 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006877 Py_UNICODE_ISNUMERIC(*p))
6878 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006880 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006881 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006882 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006883
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 e = p + PyUnicode_GET_SIZE(self);
6885 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006886 if (!Py_UNICODE_ISNUMERIC(*p))
6887 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006889 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890}
6891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006892PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006893 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894\n\
6895Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006896iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
6898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006899unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006901 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902}
6903
Martin v. Löwis18e16552006-02-15 17:27:45 +00006904static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905unicode_length(PyUnicodeObject *self)
6906{
6907 return self->length;
6908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006911 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006913Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006914done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
6916static PyObject *
6917unicode_ljust(PyUnicodeObject *self, PyObject *args)
6918{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006919 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006920 Py_UNICODE fillchar = ' ';
6921
Martin v. Löwis412fb672006-04-13 06:34:32 +00006922 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 return NULL;
6924
Tim Peters7a29bd52001-09-12 03:03:31 +00006925 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 Py_INCREF(self);
6927 return (PyObject*) self;
6928 }
6929
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006930 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006933PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006934 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006936Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
6938static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006939unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 return fixup(self, fixlower);
6942}
6943
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6952
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006953/* externally visible for str.strip(unicode) */
6954PyObject *
6955_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6956{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006957 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6958 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6959 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6960 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6961 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006962
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006963 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006964
Benjamin Peterson857ce152009-01-31 16:29:18 +00006965 i = 0;
6966 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006967 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6968 i++;
6969 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006970 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006971
Benjamin Peterson857ce152009-01-31 16:29:18 +00006972 j = len;
6973 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006974 do {
6975 j--;
6976 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6977 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006978 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006979
Benjamin Peterson857ce152009-01-31 16:29:18 +00006980 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 Py_INCREF(self);
6982 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006983 }
6984 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006985 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006986}
6987
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006992 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6993 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006994
Benjamin Peterson857ce152009-01-31 16:29:18 +00006995 i = 0;
6996 if (striptype != RIGHTSTRIP) {
6997 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6998 i++;
6999 }
7000 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007001
Benjamin Peterson857ce152009-01-31 16:29:18 +00007002 j = len;
7003 if (striptype != LEFTSTRIP) {
7004 do {
7005 j--;
7006 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7007 j++;
7008 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007009
Benjamin Peterson857ce152009-01-31 16:29:18 +00007010 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7011 Py_INCREF(self);
7012 return (PyObject*)self;
7013 }
7014 else
7015 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007018
7019static PyObject *
7020do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7021{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007022 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023
Benjamin Peterson857ce152009-01-31 16:29:18 +00007024 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7025 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007026
Benjamin Peterson857ce152009-01-31 16:29:18 +00007027 if (sep != NULL && sep != Py_None) {
7028 if (PyUnicode_Check(sep))
7029 return _PyUnicode_XStrip(self, striptype, sep);
7030 else if (PyString_Check(sep)) {
7031 PyObject *res;
7032 sep = PyUnicode_FromObject(sep);
7033 if (sep==NULL)
7034 return NULL;
7035 res = _PyUnicode_XStrip(self, striptype, sep);
7036 Py_DECREF(sep);
7037 return res;
7038 }
7039 else {
7040 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007041 "%s arg must be None, unicode or str",
7042 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007043 return NULL;
7044 }
7045 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046
Benjamin Peterson857ce152009-01-31 16:29:18 +00007047 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007048}
7049
7050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007051PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007052 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053\n\
7054Return a copy of the string S with leading and trailing\n\
7055whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007056If chars is given and not None, remove characters in chars instead.\n\
7057If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058
7059static PyObject *
7060unicode_strip(PyUnicodeObject *self, PyObject *args)
7061{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007062 if (PyTuple_GET_SIZE(args) == 0)
7063 return do_strip(self, BOTHSTRIP); /* Common case */
7064 else
7065 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066}
7067
7068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007069PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007070 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071\n\
7072Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007073If chars is given and not None, remove characters in chars instead.\n\
7074If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007075
7076static PyObject *
7077unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7078{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007079 if (PyTuple_GET_SIZE(args) == 0)
7080 return do_strip(self, LEFTSTRIP); /* Common case */
7081 else
7082 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083}
7084
7085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007087 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007088\n\
7089Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007090If chars is given and not None, remove characters in chars instead.\n\
7091If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092
7093static PyObject *
7094unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7095{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007096 if (PyTuple_GET_SIZE(args) == 0)
7097 return do_strip(self, RIGHTSTRIP); /* Common case */
7098 else
7099 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007100}
7101
7102
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007104unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105{
7106 PyUnicodeObject *u;
7107 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007109 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
7111 if (len < 0)
7112 len = 0;
7113
Tim Peters7a29bd52001-09-12 03:03:31 +00007114 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115 /* no repeat, return original string */
7116 Py_INCREF(str);
7117 return (PyObject*) str;
7118 }
Tim Peters8f422462000-09-09 06:13:41 +00007119
7120 /* ensure # of chars needed doesn't overflow int and # of bytes
7121 * needed doesn't overflow size_t
7122 */
7123 nchars = len * str->length;
7124 if (len && nchars / len != str->length) {
7125 PyErr_SetString(PyExc_OverflowError,
7126 "repeated string is too long");
7127 return NULL;
7128 }
7129 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7130 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7131 PyErr_SetString(PyExc_OverflowError,
7132 "repeated string is too long");
7133 return NULL;
7134 }
7135 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 if (!u)
7137 return NULL;
7138
7139 p = u->str;
7140
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007141 if (str->length == 1 && len > 0) {
7142 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007143 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007144 Py_ssize_t done = 0; /* number of characters copied this far */
7145 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007146 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007147 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007148 }
7149 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007150 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007151 Py_UNICODE_COPY(p+done, p, n);
7152 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007153 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155
7156 return (PyObject*) u;
7157}
7158
7159PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007160 PyObject *subobj,
7161 PyObject *replobj,
7162 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163{
7164 PyObject *self;
7165 PyObject *str1;
7166 PyObject *str2;
7167 PyObject *result;
7168
7169 self = PyUnicode_FromObject(obj);
7170 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 str1 = PyUnicode_FromObject(subobj);
7173 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007174 Py_DECREF(self);
7175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
7177 str2 = PyUnicode_FromObject(replobj);
7178 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007179 Py_DECREF(self);
7180 Py_DECREF(str1);
7181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 }
Tim Petersced69f82003-09-16 20:30:58 +00007183 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007184 (PyUnicodeObject *)str1,
7185 (PyUnicodeObject *)str2,
7186 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 Py_DECREF(self);
7188 Py_DECREF(str1);
7189 Py_DECREF(str2);
7190 return result;
7191}
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007194 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195\n\
7196Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007197old replaced by new. If the optional argument count is\n\
7198given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
7200static PyObject*
7201unicode_replace(PyUnicodeObject *self, PyObject *args)
7202{
7203 PyUnicodeObject *str1;
7204 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 PyObject *result;
7207
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 return NULL;
7210 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7211 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007214 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007215 Py_DECREF(str1);
7216 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
7219 result = replace(self, str1, str2, maxcount);
7220
7221 Py_DECREF(str1);
7222 Py_DECREF(str2);
7223 return result;
7224}
7225
7226static
7227PyObject *unicode_repr(PyObject *unicode)
7228{
7229 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007230 PyUnicode_GET_SIZE(unicode),
7231 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232}
7233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007235 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236\n\
7237Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007238such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239arguments start and end are interpreted as in slice notation.\n\
7240\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007241Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242
7243static PyObject *
7244unicode_rfind(PyUnicodeObject *self, PyObject *args)
7245{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007246 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007247 Py_ssize_t start;
7248 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007249 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250
Facundo Batista57d56692007-11-16 18:04:14 +00007251 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007254 result = stringlib_rfind_slice(
7255 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7256 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7257 start, end
7258 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259
7260 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007261
7262 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263}
7264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007265PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007266 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007268Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269
7270static PyObject *
7271unicode_rindex(PyUnicodeObject *self, PyObject *args)
7272{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007273 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007274 Py_ssize_t start;
7275 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007276 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
Facundo Batista57d56692007-11-16 18:04:14 +00007278 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007281 result = stringlib_rfind_slice(
7282 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7283 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7284 start, end
7285 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286
7287 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007288
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 if (result < 0) {
7290 PyErr_SetString(PyExc_ValueError, "substring not found");
7291 return NULL;
7292 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007293 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294}
7295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007297 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007299Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007300done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
7302static PyObject *
7303unicode_rjust(PyUnicodeObject *self, PyObject *args)
7304{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007305 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007306 Py_UNICODE fillchar = ' ';
7307
Martin v. Löwis412fb672006-04-13 06:34:32 +00007308 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 return NULL;
7310
Tim Peters7a29bd52001-09-12 03:03:31 +00007311 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 Py_INCREF(self);
7313 return (PyObject*) self;
7314 }
7315
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007316 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317}
7318
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007320unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321{
7322 /* standard clamping */
7323 if (start < 0)
7324 start = 0;
7325 if (end < 0)
7326 end = 0;
7327 if (end > self->length)
7328 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007329 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 /* full slice, return original string */
7331 Py_INCREF(self);
7332 return (PyObject*) self;
7333 }
7334 if (start > end)
7335 start = end;
7336 /* copy slice */
7337 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007338 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339}
7340
7341PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007342 PyObject *sep,
7343 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344{
7345 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 s = PyUnicode_FromObject(s);
7348 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007349 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007350 if (sep != NULL) {
7351 sep = PyUnicode_FromObject(sep);
7352 if (sep == NULL) {
7353 Py_DECREF(s);
7354 return NULL;
7355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 }
7357
7358 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7359
7360 Py_DECREF(s);
7361 Py_XDECREF(sep);
7362 return result;
7363}
7364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007365PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007366 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367\n\
7368Return a list of the words in S, using sep as the\n\
7369delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007370splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007371whitespace string is a separator and empty strings are\n\
7372removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374static PyObject*
7375unicode_split(PyUnicodeObject *self, PyObject *args)
7376{
7377 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007378 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
Martin v. Löwis18e16552006-02-15 17:27:45 +00007380 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 return NULL;
7382
7383 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007384 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007386 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007388 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389}
7390
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007391PyObject *
7392PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7393{
7394 PyObject* str_obj;
7395 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007396 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007397
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007398 str_obj = PyUnicode_FromObject(str_in);
7399 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007400 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007401 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007402 if (!sep_obj) {
7403 Py_DECREF(str_obj);
7404 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007405 }
7406
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007407 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007408 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7409 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7410 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007411
Fredrik Lundhb9479482006-05-26 17:22:38 +00007412 Py_DECREF(sep_obj);
7413 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007414
7415 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007416}
7417
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007418
7419PyObject *
7420PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7421{
7422 PyObject* str_obj;
7423 PyObject* sep_obj;
7424 PyObject* out;
7425
7426 str_obj = PyUnicode_FromObject(str_in);
7427 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007428 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007429 sep_obj = PyUnicode_FromObject(sep_in);
7430 if (!sep_obj) {
7431 Py_DECREF(str_obj);
7432 return NULL;
7433 }
7434
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007435 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007436 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7437 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7438 );
7439
7440 Py_DECREF(sep_obj);
7441 Py_DECREF(str_obj);
7442
7443 return out;
7444}
7445
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007446PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007447 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007448\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007449Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007450the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007451found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007452
7453static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007454unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007455{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007456 return PyUnicode_Partition((PyObject *)self, separator);
7457}
7458
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007459PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007460 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007461\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007462Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007463the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007464separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007465
7466static PyObject*
7467unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7468{
7469 return PyUnicode_RPartition((PyObject *)self, separator);
7470}
7471
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007472PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007473 PyObject *sep,
7474 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007475{
7476 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007477
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007478 s = PyUnicode_FromObject(s);
7479 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007480 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007481 if (sep != NULL) {
7482 sep = PyUnicode_FromObject(sep);
7483 if (sep == NULL) {
7484 Py_DECREF(s);
7485 return NULL;
7486 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007487 }
7488
7489 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7490
7491 Py_DECREF(s);
7492 Py_XDECREF(sep);
7493 return result;
7494}
7495
7496PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007497 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007498\n\
7499Return a list of the words in S, using sep as the\n\
7500delimiter string, starting at the end of the string and\n\
7501working to the front. If maxsplit is given, at most maxsplit\n\
7502splits are done. If sep is not specified, any whitespace string\n\
7503is a separator.");
7504
7505static PyObject*
7506unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7507{
7508 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007509 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007510
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007512 return NULL;
7513
7514 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007515 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007516 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007517 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007518 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007519 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007520}
7521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007522PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007523 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524\n\
7525Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007526Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007527is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529static PyObject*
7530unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7531{
Guido van Rossum86662912000-04-11 15:38:46 +00007532 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
Guido van Rossum86662912000-04-11 15:38:46 +00007534 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 return NULL;
7536
Guido van Rossum86662912000-04-11 15:38:46 +00007537 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538}
7539
7540static
7541PyObject *unicode_str(PyUnicodeObject *self)
7542{
Fred Drakee4315f52000-05-09 19:53:39 +00007543 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544}
7545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007546PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007547 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548\n\
7549Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007550and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551
7552static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007553unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 return fixup(self, fixswapcase);
7556}
7557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007559 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560\n\
7561Return a copy of the string S, where all characters have been mapped\n\
7562through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007563Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7564Unmapped characters are left untouched. Characters mapped to None\n\
7565are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566
7567static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007568unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569{
Tim Petersced69f82003-09-16 20:30:58 +00007570 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007571 self->length,
7572 table,
7573 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574}
7575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007577 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007579Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
7581static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007582unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 return fixup(self, fixupper);
7585}
7586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007587PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007588 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589\n\
Georg Brandl98064072008-09-09 19:26:00 +00007590Pad a numeric string S with zeros on the left, to fill a field\n\
7591of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
7593static PyObject *
7594unicode_zfill(PyUnicodeObject *self, PyObject *args)
7595{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007596 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 PyUnicodeObject *u;
7598
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 Py_ssize_t width;
7600 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 return NULL;
7602
7603 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007604 if (PyUnicode_CheckExact(self)) {
7605 Py_INCREF(self);
7606 return (PyObject*) self;
7607 }
7608 else
7609 return PyUnicode_FromUnicode(
7610 PyUnicode_AS_UNICODE(self),
7611 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007612 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
7614
7615 fill = width - self->length;
7616
7617 u = pad(self, fill, 0, '0');
7618
Walter Dörwald068325e2002-04-15 13:36:47 +00007619 if (u == NULL)
7620 return NULL;
7621
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 if (u->str[fill] == '+' || u->str[fill] == '-') {
7623 /* move sign to beginning of string */
7624 u->str[0] = u->str[fill];
7625 u->str[fill] = '0';
7626 }
7627
7628 return (PyObject*) u;
7629}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
#if 0
/* Compiled out.  Would return the current value of numfree as a
   Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007639PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007640 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007642Return True if S starts with the specified prefix, False otherwise.\n\
7643With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007644With optional end, stop comparing S at that position.\n\
7645prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646
7647static PyObject *
7648unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007649 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650{
Georg Brandl24250812006-06-09 18:45:48 +00007651 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007653 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007654 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007655 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656
Georg Brandl24250812006-06-09 18:45:48 +00007657 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007658 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7659 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007660 if (PyTuple_Check(subobj)) {
7661 Py_ssize_t i;
7662 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7663 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007664 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007665 if (substring == NULL)
7666 return NULL;
7667 result = tailmatch(self, substring, start, end, -1);
7668 Py_DECREF(substring);
7669 if (result) {
7670 Py_RETURN_TRUE;
7671 }
7672 }
7673 /* nothing matched */
7674 Py_RETURN_FALSE;
7675 }
7676 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007678 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007679 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007681 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682}
7683
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007686 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007688Return True if S ends with the specified suffix, False otherwise.\n\
7689With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007690With optional end, stop comparing S at that position.\n\
7691suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692
7693static PyObject *
7694unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007695 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
Georg Brandl24250812006-06-09 18:45:48 +00007697 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007699 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007700 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007701 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702
Georg Brandl24250812006-06-09 18:45:48 +00007703 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007704 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7705 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007706 if (PyTuple_Check(subobj)) {
7707 Py_ssize_t i;
7708 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7709 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007710 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007711 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007712 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007713 result = tailmatch(self, substring, start, end, +1);
7714 Py_DECREF(substring);
7715 if (result) {
7716 Py_RETURN_TRUE;
7717 }
7718 }
7719 Py_RETURN_FALSE;
7720 }
7721 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
Georg Brandl24250812006-06-09 18:45:48 +00007725 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007727 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728}
7729
7730
Eric Smitha9f7d622008-02-17 19:46:49 +00007731/* Implements do_string_format, which is unicode because of stringlib */
7732#include "stringlib/string_format.h"
7733
7734PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007735 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007736\n\
7737");
7738
Eric Smithdc13b792008-05-30 18:10:04 +00007739static PyObject *
7740unicode__format__(PyObject *self, PyObject *args)
7741{
7742 PyObject *format_spec;
7743 PyObject *result = NULL;
7744 PyObject *tmp = NULL;
7745
7746 /* If 2.x, convert format_spec to the same type as value */
7747 /* This is to allow things like u''.format('') */
7748 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7749 goto done;
7750 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7751 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007752 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007753 goto done;
7754 }
7755 tmp = PyObject_Unicode(format_spec);
7756 if (tmp == NULL)
7757 goto done;
7758 format_spec = tmp;
7759
7760 result = _PyUnicode_FormatAdvanced(self,
7761 PyUnicode_AS_UNICODE(format_spec),
7762 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007763 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007764 Py_XDECREF(tmp);
7765 return result;
7766}
7767
Eric Smitha9f7d622008-02-17 19:46:49 +00007768PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007769 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007770\n\
7771");
7772
Robert Schuppenies901c9972008-06-10 10:10:31 +00007773static PyObject *
7774unicode__sizeof__(PyUnicodeObject *v)
7775{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007776 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7777 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007778}
7779
7780PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007781 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007782\n\
7783");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007784
7785static PyObject *
7786unicode_getnewargs(PyUnicodeObject *v)
7787{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007788 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007789}
7790
7791
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792static PyMethodDef unicode_methods[] = {
7793
7794 /* Order is according to common usage: often used methods should
7795 appear first, since lookup is done sequentially. */
7796
Benjamin Peterson332d7212009-09-18 21:14:55 +00007797 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007798 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7799 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007800 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007801 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7802 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7803 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7804 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7805 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7806 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7807 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007808 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007809 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7810 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7811 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007812 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007813 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007814/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7815 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7816 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7817 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007818 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007819 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007820 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007821 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007822 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7823 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7824 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7825 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7826 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7827 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7828 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7829 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7830 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7831 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7832 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7833 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7834 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7835 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007836 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007837 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7838 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7839 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7840 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007841 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007842#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007843 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844#endif
7845
7846#if 0
7847 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007848 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849#endif
7850
Benjamin Peterson857ce152009-01-31 16:29:18 +00007851 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 {NULL, NULL}
7853};
7854
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007855static PyObject *
7856unicode_mod(PyObject *v, PyObject *w)
7857{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007858 if (!PyUnicode_Check(v)) {
7859 Py_INCREF(Py_NotImplemented);
7860 return Py_NotImplemented;
7861 }
7862 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007863}
7864
7865static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007866 0, /*nb_add*/
7867 0, /*nb_subtract*/
7868 0, /*nb_multiply*/
7869 0, /*nb_divide*/
7870 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007871};
7872
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007874 (lenfunc) unicode_length, /* sq_length */
7875 PyUnicode_Concat, /* sq_concat */
7876 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7877 (ssizeargfunc) unicode_getitem, /* sq_item */
7878 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7879 0, /* sq_ass_item */
7880 0, /* sq_ass_slice */
7881 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882};
7883
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007884static PyObject*
7885unicode_subscript(PyUnicodeObject* self, PyObject* item)
7886{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007887 if (PyIndex_Check(item)) {
7888 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007889 if (i == -1 && PyErr_Occurred())
7890 return NULL;
7891 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007892 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007893 return unicode_getitem(self, i);
7894 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007895 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007896 Py_UNICODE* source_buf;
7897 Py_UNICODE* result_buf;
7898 PyObject* result;
7899
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007900 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007901 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007902 return NULL;
7903 }
7904
7905 if (slicelength <= 0) {
7906 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007907 } else if (start == 0 && step == 1 && slicelength == self->length &&
7908 PyUnicode_CheckExact(self)) {
7909 Py_INCREF(self);
7910 return (PyObject *)self;
7911 } else if (step == 1) {
7912 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007913 } else {
7914 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007915 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7916 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007917
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007918 if (result_buf == NULL)
7919 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007920
7921 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7922 result_buf[i] = source_buf[cur];
7923 }
Tim Petersced69f82003-09-16 20:30:58 +00007924
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007925 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007926 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007927 return result;
7928 }
7929 } else {
7930 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7931 return NULL;
7932 }
7933}
7934
7935static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007936 (lenfunc)unicode_length, /* mp_length */
7937 (binaryfunc)unicode_subscript, /* mp_subscript */
7938 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007939};
7940
Martin v. Löwis18e16552006-02-15 17:27:45 +00007941static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007943 Py_ssize_t index,
7944 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945{
7946 if (index != 0) {
7947 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007948 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 return -1;
7950 }
7951 *ptr = (void *) self->str;
7952 return PyUnicode_GET_DATA_SIZE(self);
7953}
7954
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955static Py_ssize_t
7956unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007957 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958{
7959 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007960 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 return -1;
7962}
7963
7964static int
7965unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007966 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967{
7968 if (lenp)
7969 *lenp = PyUnicode_GET_DATA_SIZE(self);
7970 return 1;
7971}
7972
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007973static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 Py_ssize_t index,
7976 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977{
7978 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007979
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 if (index != 0) {
7981 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007982 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 return -1;
7984 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007985 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007987 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007988 *ptr = (void *) PyString_AS_STRING(str);
7989 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990}
7991
7992/* Helpers for PyUnicode_Format() */
7993
7994static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007995getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007997 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007999 (*p_argidx)++;
8000 if (arglen < 0)
8001 return args;
8002 else
8003 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 }
8005 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008006 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 return NULL;
8008}
8009
/* Flag bits collected while parsing a conversion specifier in
   PyUnicode_Format(); the character that sets each one is noted. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008017strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019 register Py_ssize_t i;
8020 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008022 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return len;
8025}
8026
Neal Norwitzfc76d632006-01-10 06:03:13 +00008027static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008028longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8029{
Tim Peters15231542006-02-16 01:08:01 +00008030 Py_ssize_t result;
8031
Neal Norwitzfc76d632006-01-10 06:03:13 +00008032 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008033 result = strtounicode(buffer, (char *)buffer);
8034 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008035}
8036
Guido van Rossum078151d2002-08-11 04:24:12 +00008037/* XXX To save some code duplication, formatfloat/long/int could have been
8038 shared with stringobject.c, converting from 8-bit to Unicode after the
8039 formatting is done. */
8040
Mark Dickinson18cfada2009-11-23 18:46:41 +00008041/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8042
8043static PyObject *
8044formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008046 char *p;
8047 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008049
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 x = PyFloat_AsDouble(v);
8051 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008052 return NULL;
8053
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008055 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008056
Mark Dickinson18cfada2009-11-23 18:46:41 +00008057 p = PyOS_double_to_string(x, type, prec,
8058 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8059 if (p == NULL)
8060 return NULL;
8061 result = PyUnicode_FromStringAndSize(p, strlen(p));
8062 PyMem_Free(p);
8063 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064}
8065
Tim Peters38fd5b62000-09-21 05:43:11 +00008066static PyObject*
8067formatlong(PyObject *val, int flags, int prec, int type)
8068{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008069 char *buf;
8070 int i, len;
8071 PyObject *str; /* temporary string object. */
8072 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008073
Benjamin Peterson857ce152009-01-31 16:29:18 +00008074 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8075 if (!str)
8076 return NULL;
8077 result = _PyUnicode_New(len);
8078 if (!result) {
8079 Py_DECREF(str);
8080 return NULL;
8081 }
8082 for (i = 0; i < len; i++)
8083 result->str[i] = buf[i];
8084 result->str[len] = 0;
8085 Py_DECREF(str);
8086 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008087}
8088
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089static int
8090formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008091 size_t buflen,
8092 int flags,
8093 int prec,
8094 int type,
8095 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008097 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008098 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8099 * + 1 + 1
8100 * = 24
8101 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008102 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008103 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 long x;
8105
8106 x = PyInt_AsLong(v);
8107 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008108 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008109 if (x < 0 && type == 'u') {
8110 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008111 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008112 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8113 sign = "-";
8114 else
8115 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008117 prec = 1;
8118
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008119 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8120 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008121 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008122 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008123 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008124 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008125 return -1;
8126 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008127
8128 if ((flags & F_ALT) &&
8129 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008130 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008131 * of issues that cause pain:
8132 * - when 0 is being converted, the C standard leaves off
8133 * the '0x' or '0X', which is inconsistent with other
8134 * %#x/%#X conversions and inconsistent with Python's
8135 * hex() function
8136 * - there are platforms that violate the standard and
8137 * convert 0 with the '0x' or '0X'
8138 * (Metrowerks, Compaq Tru64)
8139 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008140 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008141 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008142 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008143 * We can achieve the desired consistency by inserting our
8144 * own '0x' or '0X' prefix, and substituting %x/%X in place
8145 * of %#x/%#X.
8146 *
8147 * Note that this is the same approach as used in
8148 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008149 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008150 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8151 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008152 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008153 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008154 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8155 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008156 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008157 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008158 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008159 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008160 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008161 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162}
8163
8164static int
8165formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008166 size_t buflen,
8167 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168{
Ezio Melotti32125152010-02-25 17:36:04 +00008169 PyObject *unistr;
8170 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008171 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008172 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008173 if (PyUnicode_GET_SIZE(v) != 1)
8174 goto onError;
8175 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008178 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008179 if (PyString_GET_SIZE(v) != 1)
8180 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008181 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8182 with a UnicodeDecodeError if 'char' is not decodable with the
8183 default encoding (usually ASCII, but it might be something else) */
8184 str = PyString_AS_STRING(v);
8185 if ((unsigned char)str[0] > 0x7F) {
8186 /* the char is not ASCII; try to decode the string using the
8187 default encoding and return -1 to let the UnicodeDecodeError
8188 be raised if the string can't be decoded */
8189 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8190 if (unistr == NULL)
8191 return -1;
8192 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8193 Py_DECREF(unistr);
8194 }
8195 else
8196 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198
8199 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008200 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008202 x = PyInt_AsLong(v);
8203 if (x == -1 && PyErr_Occurred())
8204 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008205#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008206 if (x < 0 || x > 0x10ffff) {
8207 PyErr_SetString(PyExc_OverflowError,
8208 "%c arg not in range(0x110000) "
8209 "(wide Python build)");
8210 return -1;
8211 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008212#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008213 if (x < 0 || x > 0xffff) {
8214 PyErr_SetString(PyExc_OverflowError,
8215 "%c arg not in range(0x10000) "
8216 "(narrow Python build)");
8217 return -1;
8218 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008219#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 }
8222 buf[1] = '\0';
8223 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008224
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008225 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008226 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008227 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008228 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229}
8230
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE units) of the buffer in which
   the ints & chars are formatted. XXX This is a magic number. Each
   formatting routine does bounds checking to ensure no overflow, but a
   better solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro acts as a single
   operand wherever it appears (e.g. after a bare `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
8240
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008242 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243{
8244 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008245 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 int args_owned = 0;
8247 PyUnicodeObject *result = NULL;
8248 PyObject *dict = NULL;
8249 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008250
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008252 PyErr_BadInternalCall();
8253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 }
8255 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008256 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 fmt = PyUnicode_AS_UNICODE(uformat);
8259 fmtcnt = PyUnicode_GET_SIZE(uformat);
8260
8261 reslen = rescnt = fmtcnt + 100;
8262 result = _PyUnicode_New(reslen);
8263 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008264 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 res = PyUnicode_AS_UNICODE(result);
8266
8267 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008268 arglen = PyTuple_Size(args);
8269 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 }
8271 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008272 arglen = -1;
8273 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
Christian Heimese93237d2007-12-19 02:37:44 +00008275 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008276 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008277 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
8279 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008280 if (*fmt != '%') {
8281 if (--rescnt < 0) {
8282 rescnt = fmtcnt + 100;
8283 reslen += rescnt;
8284 if (_PyUnicode_Resize(&result, reslen) < 0)
8285 goto onError;
8286 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8287 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008288 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008289 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008290 }
8291 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008292 /* Got a format specifier */
8293 int flags = 0;
8294 Py_ssize_t width = -1;
8295 int prec = -1;
8296 Py_UNICODE c = '\0';
8297 Py_UNICODE fill;
8298 int isnumok;
8299 PyObject *v = NULL;
8300 PyObject *temp = NULL;
8301 Py_UNICODE *pbuf;
8302 Py_UNICODE sign;
8303 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008304 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008305
8306 fmt++;
8307 if (*fmt == '(') {
8308 Py_UNICODE *keystart;
8309 Py_ssize_t keylen;
8310 PyObject *key;
8311 int pcount = 1;
8312
8313 if (dict == NULL) {
8314 PyErr_SetString(PyExc_TypeError,
8315 "format requires a mapping");
8316 goto onError;
8317 }
8318 ++fmt;
8319 --fmtcnt;
8320 keystart = fmt;
8321 /* Skip over balanced parentheses */
8322 while (pcount > 0 && --fmtcnt >= 0) {
8323 if (*fmt == ')')
8324 --pcount;
8325 else if (*fmt == '(')
8326 ++pcount;
8327 fmt++;
8328 }
8329 keylen = fmt - keystart - 1;
8330 if (fmtcnt < 0 || pcount > 0) {
8331 PyErr_SetString(PyExc_ValueError,
8332 "incomplete format key");
8333 goto onError;
8334 }
8335#if 0
8336 /* keys are converted to strings using UTF-8 and
8337 then looked up since Python uses strings to hold
8338 variables names etc. in its namespaces and we
8339 wouldn't want to break common idioms. */
8340 key = PyUnicode_EncodeUTF8(keystart,
8341 keylen,
8342 NULL);
8343#else
8344 key = PyUnicode_FromUnicode(keystart, keylen);
8345#endif
8346 if (key == NULL)
8347 goto onError;
8348 if (args_owned) {
8349 Py_DECREF(args);
8350 args_owned = 0;
8351 }
8352 args = PyObject_GetItem(dict, key);
8353 Py_DECREF(key);
8354 if (args == NULL) {
8355 goto onError;
8356 }
8357 args_owned = 1;
8358 arglen = -1;
8359 argidx = -2;
8360 }
8361 while (--fmtcnt >= 0) {
8362 switch (c = *fmt++) {
8363 case '-': flags |= F_LJUST; continue;
8364 case '+': flags |= F_SIGN; continue;
8365 case ' ': flags |= F_BLANK; continue;
8366 case '#': flags |= F_ALT; continue;
8367 case '0': flags |= F_ZERO; continue;
8368 }
8369 break;
8370 }
8371 if (c == '*') {
8372 v = getnextarg(args, arglen, &argidx);
8373 if (v == NULL)
8374 goto onError;
8375 if (!PyInt_Check(v)) {
8376 PyErr_SetString(PyExc_TypeError,
8377 "* wants int");
8378 goto onError;
8379 }
8380 width = PyInt_AsLong(v);
8381 if (width < 0) {
8382 flags |= F_LJUST;
8383 width = -width;
8384 }
8385 if (--fmtcnt >= 0)
8386 c = *fmt++;
8387 }
8388 else if (c >= '0' && c <= '9') {
8389 width = c - '0';
8390 while (--fmtcnt >= 0) {
8391 c = *fmt++;
8392 if (c < '0' || c > '9')
8393 break;
8394 if ((width*10) / 10 != width) {
8395 PyErr_SetString(PyExc_ValueError,
8396 "width too big");
8397 goto onError;
8398 }
8399 width = width*10 + (c - '0');
8400 }
8401 }
8402 if (c == '.') {
8403 prec = 0;
8404 if (--fmtcnt >= 0)
8405 c = *fmt++;
8406 if (c == '*') {
8407 v = getnextarg(args, arglen, &argidx);
8408 if (v == NULL)
8409 goto onError;
8410 if (!PyInt_Check(v)) {
8411 PyErr_SetString(PyExc_TypeError,
8412 "* wants int");
8413 goto onError;
8414 }
8415 prec = PyInt_AsLong(v);
8416 if (prec < 0)
8417 prec = 0;
8418 if (--fmtcnt >= 0)
8419 c = *fmt++;
8420 }
8421 else if (c >= '0' && c <= '9') {
8422 prec = c - '0';
8423 while (--fmtcnt >= 0) {
8424 c = Py_CHARMASK(*fmt++);
8425 if (c < '0' || c > '9')
8426 break;
8427 if ((prec*10) / 10 != prec) {
8428 PyErr_SetString(PyExc_ValueError,
8429 "prec too big");
8430 goto onError;
8431 }
8432 prec = prec*10 + (c - '0');
8433 }
8434 }
8435 } /* prec */
8436 if (fmtcnt >= 0) {
8437 if (c == 'h' || c == 'l' || c == 'L') {
8438 if (--fmtcnt >= 0)
8439 c = *fmt++;
8440 }
8441 }
8442 if (fmtcnt < 0) {
8443 PyErr_SetString(PyExc_ValueError,
8444 "incomplete format");
8445 goto onError;
8446 }
8447 if (c != '%') {
8448 v = getnextarg(args, arglen, &argidx);
8449 if (v == NULL)
8450 goto onError;
8451 }
8452 sign = 0;
8453 fill = ' ';
8454 switch (c) {
8455
8456 case '%':
8457 pbuf = formatbuf;
8458 /* presume that buffer length is at least 1 */
8459 pbuf[0] = '%';
8460 len = 1;
8461 break;
8462
8463 case 's':
8464 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008465 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008466 temp = v;
8467 Py_INCREF(temp);
8468 }
8469 else {
8470 PyObject *unicode;
8471 if (c == 's')
8472 temp = PyObject_Unicode(v);
8473 else
8474 temp = PyObject_Repr(v);
8475 if (temp == NULL)
8476 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008477 if (PyUnicode_Check(temp))
8478 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008479 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008480 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008481 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8482 PyString_GET_SIZE(temp),
8483 NULL,
8484 "strict");
8485 Py_DECREF(temp);
8486 temp = unicode;
8487 if (temp == NULL)
8488 goto onError;
8489 }
8490 else {
8491 Py_DECREF(temp);
8492 PyErr_SetString(PyExc_TypeError,
8493 "%s argument has non-string str()");
8494 goto onError;
8495 }
8496 }
8497 pbuf = PyUnicode_AS_UNICODE(temp);
8498 len = PyUnicode_GET_SIZE(temp);
8499 if (prec >= 0 && len > prec)
8500 len = prec;
8501 break;
8502
8503 case 'i':
8504 case 'd':
8505 case 'u':
8506 case 'o':
8507 case 'x':
8508 case 'X':
8509 if (c == 'i')
8510 c = 'd';
8511 isnumok = 0;
8512 if (PyNumber_Check(v)) {
8513 PyObject *iobj=NULL;
8514
8515 if (PyInt_Check(v) || (PyLong_Check(v))) {
8516 iobj = v;
8517 Py_INCREF(iobj);
8518 }
8519 else {
8520 iobj = PyNumber_Int(v);
8521 if (iobj==NULL) iobj = PyNumber_Long(v);
8522 }
8523 if (iobj!=NULL) {
8524 if (PyInt_Check(iobj)) {
8525 isnumok = 1;
8526 pbuf = formatbuf;
8527 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8528 flags, prec, c, iobj);
8529 Py_DECREF(iobj);
8530 if (len < 0)
8531 goto onError;
8532 sign = 1;
8533 }
8534 else if (PyLong_Check(iobj)) {
8535 isnumok = 1;
8536 temp = formatlong(iobj, flags, prec, c);
8537 Py_DECREF(iobj);
8538 if (!temp)
8539 goto onError;
8540 pbuf = PyUnicode_AS_UNICODE(temp);
8541 len = PyUnicode_GET_SIZE(temp);
8542 sign = 1;
8543 }
8544 else {
8545 Py_DECREF(iobj);
8546 }
8547 }
8548 }
8549 if (!isnumok) {
8550 PyErr_Format(PyExc_TypeError,
8551 "%%%c format: a number is required, "
8552 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8553 goto onError;
8554 }
8555 if (flags & F_ZERO)
8556 fill = '0';
8557 break;
8558
8559 case 'e':
8560 case 'E':
8561 case 'f':
8562 case 'F':
8563 case 'g':
8564 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008565 temp = formatfloat(v, flags, prec, c);
8566 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008567 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008568 pbuf = PyUnicode_AS_UNICODE(temp);
8569 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008570 sign = 1;
8571 if (flags & F_ZERO)
8572 fill = '0';
8573 break;
8574
8575 case 'c':
8576 pbuf = formatbuf;
8577 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8578 if (len < 0)
8579 goto onError;
8580 break;
8581
8582 default:
8583 PyErr_Format(PyExc_ValueError,
8584 "unsupported format character '%c' (0x%x) "
8585 "at index %zd",
8586 (31<=c && c<=126) ? (char)c : '?',
8587 (int)c,
8588 (Py_ssize_t)(fmt - 1 -
8589 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008590 goto onError;
8591 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008592 if (sign) {
8593 if (*pbuf == '-' || *pbuf == '+') {
8594 sign = *pbuf++;
8595 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008596 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008597 else if (flags & F_SIGN)
8598 sign = '+';
8599 else if (flags & F_BLANK)
8600 sign = ' ';
8601 else
8602 sign = 0;
8603 }
8604 if (width < len)
8605 width = len;
8606 if (rescnt - (sign != 0) < width) {
8607 reslen -= rescnt;
8608 rescnt = width + fmtcnt + 100;
8609 reslen += rescnt;
8610 if (reslen < 0) {
8611 Py_XDECREF(temp);
8612 PyErr_NoMemory();
8613 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008614 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008615 if (_PyUnicode_Resize(&result, reslen) < 0) {
8616 Py_XDECREF(temp);
8617 goto onError;
8618 }
8619 res = PyUnicode_AS_UNICODE(result)
8620 + reslen - rescnt;
8621 }
8622 if (sign) {
8623 if (fill != ' ')
8624 *res++ = sign;
8625 rescnt--;
8626 if (width > len)
8627 width--;
8628 }
8629 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8630 assert(pbuf[0] == '0');
8631 assert(pbuf[1] == c);
8632 if (fill != ' ') {
8633 *res++ = *pbuf++;
8634 *res++ = *pbuf++;
8635 }
8636 rescnt -= 2;
8637 width -= 2;
8638 if (width < 0)
8639 width = 0;
8640 len -= 2;
8641 }
8642 if (width > len && !(flags & F_LJUST)) {
8643 do {
8644 --rescnt;
8645 *res++ = fill;
8646 } while (--width > len);
8647 }
8648 if (fill == ' ') {
8649 if (sign)
8650 *res++ = sign;
8651 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8652 assert(pbuf[0] == '0');
8653 assert(pbuf[1] == c);
8654 *res++ = *pbuf++;
8655 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008656 }
8657 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008658 Py_UNICODE_COPY(res, pbuf, len);
8659 res += len;
8660 rescnt -= len;
8661 while (--width >= len) {
8662 --rescnt;
8663 *res++ = ' ';
8664 }
8665 if (dict && (argidx < arglen) && c != '%') {
8666 PyErr_SetString(PyExc_TypeError,
8667 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008668 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008669 goto onError;
8670 }
8671 Py_XDECREF(temp);
8672 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 } /* until end */
8674 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008675 PyErr_SetString(PyExc_TypeError,
8676 "not all arguments converted during string formatting");
8677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 }
8679
Thomas Woutersa96affe2006-03-12 00:29:36 +00008680 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008681 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008683 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 }
8685 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 return (PyObject *)result;
8687
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 Py_XDECREF(result);
8690 Py_DECREF(uformat);
8691 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008692 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 }
8694 return NULL;
8695}
8696
8697static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 (readbufferproc) unicode_buffer_getreadbuf,
8699 (writebufferproc) unicode_buffer_getwritebuf,
8700 (segcountproc) unicode_buffer_getsegcount,
8701 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702};
8703
Jeremy Hylton938ace62002-07-17 16:30:39 +00008704static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008705unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8706
Tim Peters6d6c1a32001-08-02 04:15:00 +00008707static PyObject *
8708unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8709{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008710 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008711 static char *kwlist[] = {"string", "encoding", "errors", 0};
8712 char *encoding = NULL;
8713 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008714
Benjamin Peterson857ce152009-01-31 16:29:18 +00008715 if (type != &PyUnicode_Type)
8716 return unicode_subtype_new(type, args, kwds);
8717 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008718 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008719 return NULL;
8720 if (x == NULL)
8721 return (PyObject *)_PyUnicode_New(0);
8722 if (encoding == NULL && errors == NULL)
8723 return PyObject_Unicode(x);
8724 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008725 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008726}
8727
Guido van Rossume023fe02001-08-30 03:12:59 +00008728static PyObject *
8729unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8730{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008731 PyUnicodeObject *tmp, *pnew;
8732 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008733
Benjamin Peterson857ce152009-01-31 16:29:18 +00008734 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8735 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8736 if (tmp == NULL)
8737 return NULL;
8738 assert(PyUnicode_Check(tmp));
8739 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8740 if (pnew == NULL) {
8741 Py_DECREF(tmp);
8742 return NULL;
8743 }
8744 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8745 if (pnew->str == NULL) {
8746 _Py_ForgetReference((PyObject *)pnew);
8747 PyObject_Del(pnew);
8748 Py_DECREF(tmp);
8749 return PyErr_NoMemory();
8750 }
8751 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8752 pnew->length = n;
8753 pnew->hash = tmp->hash;
8754 Py_DECREF(tmp);
8755 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008756}
8757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008758PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008759 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008760\n\
8761Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008762encoding defaults to the current default string encoding.\n\
8763errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008764
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008766 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008767 "unicode", /* tp_name */
8768 sizeof(PyUnicodeObject), /* tp_size */
8769 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008771 (destructor)unicode_dealloc, /* tp_dealloc */
8772 0, /* tp_print */
8773 0, /* tp_getattr */
8774 0, /* tp_setattr */
8775 0, /* tp_compare */
8776 unicode_repr, /* tp_repr */
8777 &unicode_as_number, /* tp_as_number */
8778 &unicode_as_sequence, /* tp_as_sequence */
8779 &unicode_as_mapping, /* tp_as_mapping */
8780 (hashfunc) unicode_hash, /* tp_hash*/
8781 0, /* tp_call*/
8782 (reprfunc) unicode_str, /* tp_str */
8783 PyObject_GenericGetAttr, /* tp_getattro */
8784 0, /* tp_setattro */
8785 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008786 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008787 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008788 unicode_doc, /* tp_doc */
8789 0, /* tp_traverse */
8790 0, /* tp_clear */
8791 PyUnicode_RichCompare, /* tp_richcompare */
8792 0, /* tp_weaklistoffset */
8793 0, /* tp_iter */
8794 0, /* tp_iternext */
8795 unicode_methods, /* tp_methods */
8796 0, /* tp_members */
8797 0, /* tp_getset */
8798 &PyBaseString_Type, /* tp_base */
8799 0, /* tp_dict */
8800 0, /* tp_descr_get */
8801 0, /* tp_descr_set */
8802 0, /* tp_dictoffset */
8803 0, /* tp_init */
8804 0, /* tp_alloc */
8805 unicode_new, /* tp_new */
8806 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807};
8808
8809/* Initialize the Unicode implementation */
8810
Thomas Wouters78890102000-07-22 19:25:51 +00008811void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008813 int i;
8814
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008815 /* XXX - move this array to unicodectype.c ? */
8816 Py_UNICODE linebreak[] = {
8817 0x000A, /* LINE FEED */
8818 0x000D, /* CARRIAGE RETURN */
8819 0x001C, /* FILE SEPARATOR */
8820 0x001D, /* GROUP SEPARATOR */
8821 0x001E, /* RECORD SEPARATOR */
8822 0x0085, /* NEXT LINE */
8823 0x2028, /* LINE SEPARATOR */
8824 0x2029, /* PARAGRAPH SEPARATOR */
8825 };
8826
Fred Drakee4315f52000-05-09 19:53:39 +00008827 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008828 free_list = NULL;
8829 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008831 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008832 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008833
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008834 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008835 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008836 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008837 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008838 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008839
8840 /* initialize the linebreak bloom filter */
8841 bloom_linebreak = make_bloom_mask(
8842 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8843 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008844
8845 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846}
8847
8848/* Finalize the Unicode implementation */
8849
Christian Heimes3b718a72008-02-14 12:47:33 +00008850int
8851PyUnicode_ClearFreeList(void)
8852{
8853 int freelist_size = numfree;
8854 PyUnicodeObject *u;
8855
8856 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008857 PyUnicodeObject *v = u;
8858 u = *(PyUnicodeObject **)u;
8859 if (v->str)
8860 PyObject_DEL(v->str);
8861 Py_XDECREF(v->defenc);
8862 PyObject_Del(v);
8863 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008864 }
8865 free_list = NULL;
8866 assert(numfree == 0);
8867 return freelist_size;
8868}
8869
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870void
Thomas Wouters78890102000-07-22 19:25:51 +00008871_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008873 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008875 Py_XDECREF(unicode_empty);
8876 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008877
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008878 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008879 if (unicode_latin1[i]) {
8880 Py_DECREF(unicode_latin1[i]);
8881 unicode_latin1[i] = NULL;
8882 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008883 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008884 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008886
Anthony Baxterac6bd462006-04-13 02:06:09 +00008887#ifdef __cplusplus
8888}
8889#endif