blob: 01a64807e0264fd628d4124d4fe206699a20fd01 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of deallocated Unicode objects that are
   parked on the free list for reuse. */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for the free list.

   An object that is put on the free list keeps its character buffer
   when its length is below this limit; longer buffers are released.
   Reusing the retained buffer saves a malloc() call for small Unicode
   objects.

   In the worst case this leaves

       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
            malloc()-overhead)

   bytes allocated but unused.

   A limit of 0 effectively disables the optimization.

   Note: this is an experimental feature.  If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Lookup table for fast whitespace detection in the ASCII range:
   entry c is 1 when code point c (0..127) counts as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Lookup table for fast line break detection in the ASCII range:
   entry c is 1 when code point c (0..127) terminates a line. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Non-zero when chr is one of the setlen characters in set.  mask is the
   bloom mask of that set and serves as a cheap pre-filter; only on a
   bloom hit is the exact search in unicode_member() performed.
   The expansion is wrapped in parentheses so that the macro behaves as a
   single operand (e.g. under `!`).  Note: chr is evaluated more than
   once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000297 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000300 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Andrew Dalkee0df7622006-05-27 11:04:36 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
Neal Norwitze7d8be82008-07-31 17:17:14 +0000315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
318 }
319
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000329 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
332 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000333 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000334 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000337 }
338 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 }
340 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000341 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 if (unicode == NULL)
344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 }
348
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000349 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000350 PyErr_NoMemory();
351 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000352 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000353 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
359 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000360 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000362 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000378 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000380 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
385 }
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
389 }
390 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 }
395 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400}
401
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000402static
403int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404{
405 register PyUnicodeObject *v;
406
407 /* Argument checks */
408 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 PyErr_BadInternalCall();
410 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000412 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000414 PyErr_BadInternalCall();
415 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000421 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 }
432
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
436}
437
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000438int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
439{
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000456 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000457
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
468 }
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
471 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
Tim Petersced69f82003-09-16 20:30:58 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
477
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000480 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481
482 return (PyObject *)unicode;
483}
484
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000485PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
486{
487 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000488
Benjamin Peterson857ce152009-01-31 16:29:18 +0000489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000491 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 return NULL;
493 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000494
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
500
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000505 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000506
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
517 }
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
520 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000521
522 return PyUnicode_DecodeUTF8(u, size, NULL);
523 }
524
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
528
529 return (PyObject *)unicode;
530}
531
532PyObject *PyUnicode_FromString(const char *u)
533{
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
538 }
539
540 return PyUnicode_FromStringAndSize(u, size);
541}
542
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543#ifdef HAVE_WCHAR_H
544
Mark Dickinson6b265f12009-03-18 16:07:26 +0000545#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546# define CONVERT_WCHAR_TO_SURROGATES
547#endif
548
549#ifdef CONVERT_WCHAR_TO_SURROGATES
550
551/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
553
554PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
556{
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
561
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
573 }
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
578
579 /* Copy the wchar_t data into the new object */
580 {
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
589 }
590 else
591 *u++ = *w++;
592 }
593 }
594 return (PyObject *)unicode;
595}
596
597#else
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000600 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601{
602 PyUnicodeObject *unicode;
603
604 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 PyErr_BadInternalCall();
606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 }
608
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614#ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000616#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624#endif
625
626 return (PyObject *)unicode;
627}
628
Mark Dickinson6b265f12009-03-18 16:07:26 +0000629#endif /* CONVERT_WCHAR_TO_SURROGATES */
630
631#undef CONVERT_WCHAR_TO_SURROGATES
632
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000633static void
634makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
635{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000636 *fmt++ = '%';
637 if (width) {
638 if (zeropad)
639 *fmt++ = '0';
640 fmt += sprintf(fmt, "%d", width);
641 }
642 if (precision)
643 fmt += sprintf(fmt, ".%d", precision);
644 if (longflag)
645 *fmt++ = 'l';
646 else if (size_tflag) {
647 char *f = PY_FORMAT_SIZE_T;
648 while (*f)
649 *fmt++ = *f++;
650 }
651 *fmt++ = c;
652 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000653}
654
/* Append the NUL-terminated char string `string` to the output cursor
   `s`, advancing `s` past the copied characters (no terminator is
   written).  Relies on the variables `copy` and `s` of the enclosing
   function. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
656
657PyObject *
658PyUnicode_FromFormatV(const char *format, va_list vargs)
659{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000680
681#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000682 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000683#else
684#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000687 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000688#endif
689#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000697 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
703 if (*f == 's')
704 ++callcount;
705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000706 }
707 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
714 }
715 callresult = callresults;
716 }
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
729 */
730 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
755 {
756 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000757 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000764 break;
765 }
766 case 'U':
767 {
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
772 }
773 case 'V':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
784 }
785 case 'S':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
797 }
798 case 'R':
799 {
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
810 }
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
817 */
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
829 }
830 } else
831 n++;
832 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000833 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
839 }
840 realbuffer = abuffer;
841 }
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000851
Benjamin Peterson857ce152009-01-31 16:29:18 +0000852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000854
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
870 }
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
876 }
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
881 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000882
Benjamin Peterson857ce152009-01-31 16:29:18 +0000883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
918 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000928 break;
929 }
930 case 'U':
931 {
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
937 }
938 case 'V':
939 {
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
948 }
949 break;
950 }
951 case 'S':
952 case 'R':
953 {
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
968 }
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
978 }
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
987 }
988 } else
989 *s++ = *f;
990 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000992 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1005 }
1006 PyObject_Free(callresults);
1007 }
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011}
1012
1013#undef appendstring
1014
1015PyObject *
1016PyUnicode_FromFormat(const char *format, ...)
1017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 PyObject* ret;
1019 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001020
1021#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001025#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001029}
1030
Martin v. Löwis18e16552006-02-15 17:27:45 +00001031Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001032 wchar_t *w,
1033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyErr_BadInternalCall();
1037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039
1040 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044#ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046#else
1047 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 }
1054#endif
1055
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060}
1061
1062#endif
1063
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064PyObject *PyUnicode_FromOrdinal(int ordinal)
1065{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001066 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067
1068#ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074 }
1075#else
1076 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#endif
1083
Hye-Shik Chang40574832004-04-06 07:24:51 +00001084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001086}
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_FromObject(register PyObject *obj)
1089{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001092 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 Py_INCREF(obj);
1094 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 }
1096 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103}
1104
1105PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 const char *encoding,
1107 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001110 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 PyErr_BadInternalCall();
1115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118#if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001123
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001126
1127 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 return PyObject_Unicode(obj);
1135 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136#else
1137 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#endif
1143
1144 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001145 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 }
Christian Heimes3497f942008-05-26 12:29:14 +00001149 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 }
Tim Petersced69f82003-09-16 20:30:58 +00001165
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001173
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 return v;
1175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184{
1185 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186
1187 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001189
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001211 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245 return NULL;
1246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252{
1253 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1261}
1262
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1266{
1267 PyObject *v;
1268
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1272 }
1273
1274 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001283 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284 return NULL;
1285}
1286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
Fred Drakee4315f52000-05-09 19:53:39 +00001297
Tim Petersced69f82003-09-16 20:30:58 +00001298 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001300
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001307#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001319 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001321 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001322 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 Py_DECREF(v);
1324 goto onError;
1325 }
1326 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 return NULL;
1330}
1331
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001333 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001334{
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_AS_UNICODE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return NULL;
1355}
1356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358{
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1362 }
1363 return PyUnicode_GET_SIZE(unicode);
1364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 return -1;
1367}
1368
Thomas Wouters78890102000-07-22 19:25:51 +00001369const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001370{
1371 return unicode_default_encoding;
1372}
1373
1374int PyUnicode_SetDefaultEncoding(const char *encoding)
1375{
1376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001377
Fred Drakee4315f52000-05-09 19:53:39 +00001378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 encoding,
1386 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001387 return 0;
1388
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return -1;
1391}
1392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393/* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001395 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 and adjust various state variables.
1397 return 0 on success, -1 on error
1398*/
1399
1400static
1401int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 int res = -1;
1417
1418 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 }
1423
1424 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001425 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 }
1430 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 }
1438
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 }
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001450 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 }
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_XDECREF(restuple);
1479 return res;
1480}
1481
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482/* --- UTF-7 Codec -------------------------------------------------------- */
1483
Antoine Pitrou653dece2009-05-04 18:32:32 +00001484/* See RFC2152 for details. We encode conservatively and decode liberally. */
1485
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The test is spelled out with explicit ASCII ranges rather than
 * isalnum(): isalnum() is locale-dependent and is only defined for
 * arguments representable as unsigned char (or EOF), so it must not be
 * handed arbitrary wide character values.  Note that c is evaluated
 * more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * (any c that is not a letter, a digit or '+' maps to 63) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed
 * with values 1..127.  directO and directWS are parenthesized so that
 * compound expressions (e.g. "a || b") can be passed safely.  c is
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001561 Py_ssize_t size,
1562 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565}
1566
Antoine Pitrou653dece2009-05-04 18:32:32 +00001567/* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
1573
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001574PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
1587 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001605 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 e = s + size;
1607
1608 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610
Antoine Pitrou653dece2009-05-04 18:32:32 +00001611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625#ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628#else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631#endif
1632 surrogate = 0;
1633 }
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1638 }
1639 }
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1643 }
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1647 }
1648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Length in bytes of the UTF-8 sequence introduced by each possible
   first byte; 0 marks a byte that cannot start a sequence.
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 70-7F */
    /* 80-BF: continuation bytes, never a valid start */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* F0-FF */
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001900 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1925
1926 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 s++;
1932 continue;
1933 }
1934
1935 n = utf8_code_length[ch];
1936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001946 goto utf8Error;
1947 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
1950 switch (n) {
1951
1952 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001953 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001968 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001982 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001990 endinpos = startinpos + 1;
1991
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001998 goto utf8Error;
1999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2020 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 goto utf8Error;
2022 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
2043 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 }
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 goto onError;
2061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return (PyObject *)unicode;
2065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002066 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_DECREF(unicode);
2070 return NULL;
2071}
2072
Tim Peters602f7402002-04-27 18:03:26 +00002073/* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002077*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002078PyObject *
2079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002080 Py_ssize_t size,
2081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082{
Tim Peters602f7402002-04-27 18:03:26 +00002083#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002084
Martin v. Löwis18e16552006-02-15 17:27:45 +00002085 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002089 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002090 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 assert(s != NULL);
2093 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094
Tim Peters602f7402002-04-27 18:03:26 +00002095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2099 */
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2103 }
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002109 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002110 if (v == NULL)
2111 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002112 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002113 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002114
Tim Peters602f7402002-04-27 18:03:26 +00002115 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002117
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002119 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127 else {
Tim Peters602f7402002-04-27 18:03:26 +00002128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002137 i++;
2138 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (v == NULL) {
2157 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002158 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
2162 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002170
Tim Peters602f7402002-04-27 18:03:26 +00002171#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172}
2173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183}
2184
Walter Dörwald6e390802007-08-17 16:41:28 +00002185/* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187PyObject *
2188PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002192{
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194}
2195
2196PyObject *
2197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002202{
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209#ifndef Py_UNICODE_WIDE
2210 int i, pairs;
2211#else
2212 const int pairs = 0;
2213#endif
2214 const unsigned char *q, *e;
2215 int bo = 0; /* assume native ordering by default */
2216 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002217 /* Offsets from q for retrieving bytes in the right order. */
2218#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2219 int iorder[] = {0, 1, 2, 3};
2220#else
2221 int iorder[] = {3, 2, 1, 0};
2222#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002223 PyObject *errorHandler = NULL;
2224 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002225 /* On narrow builds we split characters outside the BMP into two
2226 codepoints => count how much extra space we need. */
2227#ifndef Py_UNICODE_WIDE
2228 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002229 if (((Py_UCS4 *)s)[i] >= 0x10000)
2230 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002231#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002232
2233 /* This might be one to much, because of a BOM */
2234 unicode = _PyUnicode_New((size+3)/4+pairs);
2235 if (!unicode)
2236 return NULL;
2237 if (size == 0)
2238 return (PyObject *)unicode;
2239
2240 /* Unpack UTF-32 encoded data */
2241 p = unicode->str;
2242 q = (unsigned char *)s;
2243 e = q + size;
2244
2245 if (byteorder)
2246 bo = *byteorder;
2247
2248 /* Check for BOM marks (U+FEFF) in the input and adjust current
2249 byte order setting accordingly. In native mode, the leading BOM
2250 mark is skipped, in all other modes, it is copied to the output
2251 stream as-is (giving a ZWNBSP character). */
2252 if (bo == 0) {
2253 if (size >= 4) {
2254 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002255 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002256#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002257 if (bom == 0x0000FEFF) {
2258 q += 4;
2259 bo = -1;
2260 }
2261 else if (bom == 0xFFFE0000) {
2262 q += 4;
2263 bo = 1;
2264 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002265#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002266 if (bom == 0x0000FEFF) {
2267 q += 4;
2268 bo = 1;
2269 }
2270 else if (bom == 0xFFFE0000) {
2271 q += 4;
2272 bo = -1;
2273 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002274#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002275 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002276 }
2277
2278 if (bo == -1) {
2279 /* force LE */
2280 iorder[0] = 0;
2281 iorder[1] = 1;
2282 iorder[2] = 2;
2283 iorder[3] = 3;
2284 }
2285 else if (bo == 1) {
2286 /* force BE */
2287 iorder[0] = 3;
2288 iorder[1] = 2;
2289 iorder[2] = 1;
2290 iorder[3] = 0;
2291 }
2292
2293 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002294 Py_UCS4 ch;
2295 /* remaining bytes at the end? (size should be divisible by 4) */
2296 if (e-q<4) {
2297 if (consumed)
2298 break;
2299 errmsg = "truncated data";
2300 startinpos = ((const char *)q)-starts;
2301 endinpos = ((const char *)e)-starts;
2302 goto utf32Error;
2303 /* The remaining input chars are ignored if the callback
2304 chooses to skip the input */
2305 }
2306 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2307 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002308
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002309 if (ch >= 0x110000)
2310 {
2311 errmsg = "codepoint not in range(0x110000)";
2312 startinpos = ((const char *)q)-starts;
2313 endinpos = startinpos+4;
2314 goto utf32Error;
2315 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002316#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002317 if (ch >= 0x10000)
2318 {
2319 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2320 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2321 }
2322 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002323#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002324 *p++ = ch;
2325 q += 4;
2326 continue;
2327 utf32Error:
2328 outpos = p-PyUnicode_AS_UNICODE(unicode);
2329 if (unicode_decode_call_errorhandler(
2330 errors, &errorHandler,
2331 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002332 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002333 &unicode, &outpos, &p))
2334 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002335 }
2336
2337 if (byteorder)
2338 *byteorder = bo;
2339
2340 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002341 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002342
2343 /* Adjust length */
2344 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2345 goto onError;
2346
2347 Py_XDECREF(errorHandler);
2348 Py_XDECREF(exc);
2349 return (PyObject *)unicode;
2350
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002351 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002352 Py_DECREF(unicode);
2353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
2355 return NULL;
2356}
2357
2358PyObject *
2359PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002360 Py_ssize_t size,
2361 const char *errors,
2362 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002363{
2364 PyObject *v;
2365 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002366 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002367#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002368 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002369#else
2370 const int pairs = 0;
2371#endif
2372 /* Offsets from p for storing byte pairs in the right order. */
2373#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2374 int iorder[] = {0, 1, 2, 3};
2375#else
2376 int iorder[] = {3, 2, 1, 0};
2377#endif
2378
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002379#define STORECHAR(CH) \
2380 do { \
2381 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2382 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2383 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2384 p[iorder[0]] = (CH) & 0xff; \
2385 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002386 } while(0)
2387
2388 /* In narrow builds we can output surrogate pairs as one codepoint,
2389 so we need less space. */
2390#ifndef Py_UNICODE_WIDE
2391 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002392 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2393 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2394 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002395#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002396 nsize = (size - pairs + (byteorder == 0));
2397 bytesize = nsize * 4;
2398 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002399 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002400 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002401 if (v == NULL)
2402 return NULL;
2403
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002404 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002405 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002406 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002407 if (size == 0)
2408 return v;
2409
2410 if (byteorder == -1) {
2411 /* force LE */
2412 iorder[0] = 0;
2413 iorder[1] = 1;
2414 iorder[2] = 2;
2415 iorder[3] = 3;
2416 }
2417 else if (byteorder == 1) {
2418 /* force BE */
2419 iorder[0] = 3;
2420 iorder[1] = 2;
2421 iorder[2] = 1;
2422 iorder[3] = 0;
2423 }
2424
2425 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002426 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002427#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002428 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2429 Py_UCS4 ch2 = *s;
2430 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2431 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2432 s++;
2433 size--;
2434 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002435 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002436#endif
2437 STORECHAR(ch);
2438 }
2439 return v;
2440#undef STORECHAR
2441}
2442
2443PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2444{
2445 if (!PyUnicode_Check(unicode)) {
2446 PyErr_BadArgument();
2447 return NULL;
2448 }
2449 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002450 PyUnicode_GET_SIZE(unicode),
2451 NULL,
2452 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002453}
2454
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455/* --- UTF-16 Codec ------------------------------------------------------- */
2456
Tim Peters772747b2001-08-09 22:21:55 +00002457PyObject *
2458PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002459 Py_ssize_t size,
2460 const char *errors,
2461 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462{
Walter Dörwald69652032004-09-07 20:24:22 +00002463 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2464}
2465
2466PyObject *
2467PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002468 Py_ssize_t size,
2469 const char *errors,
2470 int *byteorder,
2471 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002472{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002474 Py_ssize_t startinpos;
2475 Py_ssize_t endinpos;
2476 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 PyUnicodeObject *unicode;
2478 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002479 const unsigned char *q, *e;
2480 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002481 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002482 /* Offsets from q for retrieving byte pairs in the right order. */
2483#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2484 int ihi = 1, ilo = 0;
2485#else
2486 int ihi = 0, ilo = 1;
2487#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002488 PyObject *errorHandler = NULL;
2489 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490
2491 /* Note: size will always be longer than the resulting Unicode
2492 character count */
2493 unicode = _PyUnicode_New(size);
2494 if (!unicode)
2495 return NULL;
2496 if (size == 0)
2497 return (PyObject *)unicode;
2498
2499 /* Unpack UTF-16 encoded data */
2500 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002501 q = (unsigned char *)s;
2502 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503
2504 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002505 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002507 /* Check for BOM marks (U+FEFF) in the input and adjust current
2508 byte order setting accordingly. In native mode, the leading BOM
2509 mark is skipped, in all other modes, it is copied to the output
2510 stream as-is (giving a ZWNBSP character). */
2511 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002512 if (size >= 2) {
2513 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002514#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002515 if (bom == 0xFEFF) {
2516 q += 2;
2517 bo = -1;
2518 }
2519 else if (bom == 0xFFFE) {
2520 q += 2;
2521 bo = 1;
2522 }
Tim Petersced69f82003-09-16 20:30:58 +00002523#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002524 if (bom == 0xFEFF) {
2525 q += 2;
2526 bo = 1;
2527 }
2528 else if (bom == 0xFFFE) {
2529 q += 2;
2530 bo = -1;
2531 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002532#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002533 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535
Tim Peters772747b2001-08-09 22:21:55 +00002536 if (bo == -1) {
2537 /* force LE */
2538 ihi = 1;
2539 ilo = 0;
2540 }
2541 else if (bo == 1) {
2542 /* force BE */
2543 ihi = 0;
2544 ilo = 1;
2545 }
2546
2547 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002548 Py_UNICODE ch;
2549 /* remaining bytes at the end? (size should be even) */
2550 if (e-q<2) {
2551 if (consumed)
2552 break;
2553 errmsg = "truncated data";
2554 startinpos = ((const char *)q)-starts;
2555 endinpos = ((const char *)e)-starts;
2556 goto utf16Error;
2557 /* The remaining input chars are ignored if the callback
2558 chooses to skip the input */
2559 }
2560 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561
Benjamin Peterson857ce152009-01-31 16:29:18 +00002562 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002563
2564 if (ch < 0xD800 || ch > 0xDFFF) {
2565 *p++ = ch;
2566 continue;
2567 }
2568
2569 /* UTF-16 code pair: */
2570 if (q >= e) {
2571 errmsg = "unexpected end of data";
2572 startinpos = (((const char *)q)-2)-starts;
2573 endinpos = ((const char *)e)-starts;
2574 goto utf16Error;
2575 }
2576 if (0xD800 <= ch && ch <= 0xDBFF) {
2577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2578 q += 2;
2579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002580#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002581 *p++ = ch;
2582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002583#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002585#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 continue;
2587 }
2588 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002589 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 startinpos = (((const char *)q)-4)-starts;
2591 endinpos = startinpos+2;
2592 goto utf16Error;
2593 }
2594
Benjamin Peterson857ce152009-01-31 16:29:18 +00002595 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 errmsg = "illegal encoding";
2597 startinpos = (((const char *)q)-2)-starts;
2598 endinpos = startinpos+2;
2599 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002601 utf16Error:
2602 outpos = p-PyUnicode_AS_UNICODE(unicode);
2603 if (unicode_decode_call_errorhandler(
2604 errors, &errorHandler,
2605 "utf16", errmsg,
2606 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2607 &unicode, &outpos, &p))
2608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
2610
2611 if (byteorder)
2612 *byteorder = bo;
2613
Walter Dörwald69652032004-09-07 20:24:22 +00002614 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002615 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002618 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 goto onError;
2620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 Py_XDECREF(errorHandler);
2622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 return (PyObject *)unicode;
2624
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002625 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return NULL;
2630}
2631
Tim Peters772747b2001-08-09 22:21:55 +00002632PyObject *
2633PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002634 Py_ssize_t size,
2635 const char *errors,
2636 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637{
2638 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002639 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002640 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002641#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002642 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002643#else
2644 const int pairs = 0;
2645#endif
Tim Peters772747b2001-08-09 22:21:55 +00002646 /* Offsets from p for storing byte pairs in the right order. */
2647#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2648 int ihi = 1, ilo = 0;
2649#else
2650 int ihi = 0, ilo = 1;
2651#endif
2652
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002653#define STORECHAR(CH) \
2654 do { \
2655 p[ihi] = ((CH) >> 8) & 0xff; \
2656 p[ilo] = (CH) & 0xff; \
2657 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002658 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002660#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002662 if (s[i] >= 0x10000)
2663 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002664#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002665 /* 2 * (size + pairs + (byteorder == 0)) */
2666 if (size > PY_SSIZE_T_MAX ||
2667 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002669 nsize = size + pairs + (byteorder == 0);
2670 bytesize = nsize * 2;
2671 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002672 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002673 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 if (v == NULL)
2675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002677 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002680 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002681 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002682
2683 if (byteorder == -1) {
2684 /* force LE */
2685 ihi = 1;
2686 ilo = 0;
2687 }
2688 else if (byteorder == 1) {
2689 /* force BE */
2690 ihi = 0;
2691 ilo = 1;
2692 }
2693
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002694 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002695 Py_UNICODE ch = *s++;
2696 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002697#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002698 if (ch >= 0x10000) {
2699 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2700 ch = 0xD800 | ((ch-0x10000) >> 10);
2701 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002702#endif
Tim Peters772747b2001-08-09 22:21:55 +00002703 STORECHAR(ch);
2704 if (ch2)
2705 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002708#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709}
2710
2711PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2712{
2713 if (!PyUnicode_Check(unicode)) {
2714 PyErr_BadArgument();
2715 return NULL;
2716 }
2717 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002718 PyUnicode_GET_SIZE(unicode),
2719 NULL,
2720 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
2723/* --- Unicode Escape Codec ----------------------------------------------- */
2724
Fredrik Lundh06d12682001-01-24 07:59:11 +00002725static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002728 Py_ssize_t size,
2729 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002732 Py_ssize_t startinpos;
2733 Py_ssize_t endinpos;
2734 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002739 char* message;
2740 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 PyObject *errorHandler = NULL;
2742 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 /* Escaped strings will always be longer than the resulting
2745 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 length after conversion to the true value.
2747 (but if the error callback returns a long replacement string
2748 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 v = _PyUnicode_New(size);
2750 if (v == NULL)
2751 goto onError;
2752 if (size == 0)
2753 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 while (s < end) {
2759 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002760 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762
2763 /* Non-escape characters are interpreted as Unicode ordinals */
2764 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 continue;
2767 }
2768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 /* \ - Escapes */
2771 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002772 c = *s++;
2773 if (s > end)
2774 c = '\0'; /* Invalid after \ */
2775 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002777 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 case '\n': break;
2779 case '\\': *p++ = '\\'; break;
2780 case '\'': *p++ = '\''; break;
2781 case '\"': *p++ = '\"'; break;
2782 case 'b': *p++ = '\b'; break;
2783 case 'f': *p++ = '\014'; break; /* FF */
2784 case 't': *p++ = '\t'; break;
2785 case 'n': *p++ = '\n'; break;
2786 case 'r': *p++ = '\r'; break;
2787 case 'v': *p++ = '\013'; break; /* VT */
2788 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2789
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002790 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 case '0': case '1': case '2': case '3':
2792 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002793 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002794 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002795 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002796 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002797 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002799 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 break;
2801
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002802 /* hex escapes */
2803 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 digits = 2;
2806 message = "truncated \\xXX escape";
2807 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002809 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811 digits = 4;
2812 message = "truncated \\uXXXX escape";
2813 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002815 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002816 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 digits = 8;
2818 message = "truncated \\UXXXXXXXX escape";
2819 hexescape:
2820 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 outpos = p-PyUnicode_AS_UNICODE(v);
2822 if (s+digits>end) {
2823 endinpos = size;
2824 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002825 errors, &errorHandler,
2826 "unicodeescape", "end of string in escape sequence",
2827 starts, size, &startinpos, &endinpos, &exc, &s,
2828 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 goto onError;
2830 goto nextByte;
2831 }
2832 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002834 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 endinpos = (s+i+1)-starts;
2836 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002837 errors, &errorHandler,
2838 "unicodeescape", message,
2839 starts, size, &startinpos, &endinpos, &exc, &s,
2840 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002841 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002843 }
2844 chr = (chr<<4) & ~0xF;
2845 if (c >= '0' && c <= '9')
2846 chr += c - '0';
2847 else if (c >= 'a' && c <= 'f')
2848 chr += 10 + c - 'a';
2849 else
2850 chr += 10 + c - 'A';
2851 }
2852 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002853 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 /* _decoding_error will have already written into the
2855 target buffer. */
2856 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002857 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002858 /* when we get here, chr is a 32-bit unicode character */
2859 if (chr <= 0xffff)
2860 /* UCS-2 character */
2861 *p++ = (Py_UNICODE) chr;
2862 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002863 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002864 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002865#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002866 *p++ = chr;
2867#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002868 chr -= 0x10000L;
2869 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002870 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002871#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002872 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 endinpos = s-starts;
2874 outpos = p-PyUnicode_AS_UNICODE(v);
2875 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002876 errors, &errorHandler,
2877 "unicodeescape", "illegal Unicode character",
2878 starts, size, &startinpos, &endinpos, &exc, &s,
2879 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002880 goto onError;
2881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002882 break;
2883
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002884 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 case 'N':
2886 message = "malformed \\N character escape";
2887 if (ucnhash_CAPI == NULL) {
2888 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 if (ucnhash_CAPI == NULL)
2891 goto ucnhashError;
2892 }
2893 if (*s == '{') {
2894 const char *start = s+1;
2895 /* look for the closing brace */
2896 while (*s != '}' && s < end)
2897 s++;
2898 if (s > start && s < end && *s == '}') {
2899 /* found a name. look it up in the unicode database */
2900 message = "unknown Unicode character name";
2901 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002902 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002903 goto store;
2904 }
2905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906 endinpos = s-starts;
2907 outpos = p-PyUnicode_AS_UNICODE(v);
2908 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002909 errors, &errorHandler,
2910 "unicodeescape", message,
2911 starts, size, &startinpos, &endinpos, &exc, &s,
2912 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002913 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 break;
2915
2916 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002917 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 message = "\\ at end of string";
2919 s--;
2920 endinpos = s-starts;
2921 outpos = p-PyUnicode_AS_UNICODE(v);
2922 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002923 errors, &errorHandler,
2924 "unicodeescape", message,
2925 starts, size, &startinpos, &endinpos, &exc, &s,
2926 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002927 goto onError;
2928 }
2929 else {
2930 *p++ = '\\';
2931 *p++ = (unsigned char)s[-1];
2932 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002933 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002935 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002938 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002940 Py_XDECREF(errorHandler);
2941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002943
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002944 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002945 PyErr_SetString(
2946 PyExc_UnicodeError,
2947 "\\N escapes not supported (can't load unicodedata module)"
2948 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002949 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 Py_XDECREF(errorHandler);
2951 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002952 return NULL;
2953
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956 Py_XDECREF(errorHandler);
2957 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 return NULL;
2959}
2960
2961/* Return a Unicode-Escape string version of the Unicode object.
2962
2963 If quotes is true, the string is enclosed in u"" or u'' quotes as
2964 appropriate.
2965
2966*/
2967
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002968Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002969 Py_ssize_t size,
2970 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002971{
2972 /* like wcschr, but doesn't stop at NULL characters */
2973
2974 while (size-- > 0) {
2975 if (*s == ch)
2976 return s;
2977 s++;
2978 }
2979
2980 return NULL;
2981}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002982
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983static
2984PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002985 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 int quotes)
2987{
2988 PyObject *repr;
2989 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002991 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002992#ifdef Py_UNICODE_WIDE
2993 const Py_ssize_t expandsize = 10;
2994#else
2995 const Py_ssize_t expandsize = 6;
2996#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997
Neal Norwitz17753ec2006-08-21 22:21:19 +00002998 /* XXX(nnorwitz): rather than over-allocating, it would be
2999 better to choose a different scheme. Perhaps scan the
3000 first N-chars of the string and allocate based on that size.
3001 */
3002 /* Initial allocation is based on the longest-possible unichr
3003 escape.
3004
3005 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3006 unichr, so in this case it's the longest unichr escape. In
3007 narrow (UTF-16) builds this is five chars per source unichr
3008 since there are two unichrs in the surrogate pair, so in narrow
3009 (UTF-16) builds it's not the longest unichr escape.
3010
3011 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3012 so in the narrow (UTF-16) build case it's the longest unichr
3013 escape.
3014 */
3015
Neal Norwitze7d8be82008-07-31 17:17:14 +00003016 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003017 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003018
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003019 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003020 2
3021 + expandsize*size
3022 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 if (repr == NULL)
3024 return NULL;
3025
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003026 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
3028 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003030 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 !findchar(s, size, '"')) ? '"' : '\'';
3032 }
3033 while (size-- > 0) {
3034 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003035
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003036 /* Escape quotes and backslashes */
3037 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003038 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 *p++ = '\\';
3040 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003041 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003042 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003043
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003044#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003045 /* Map 21-bit characters to '\U00xxxxxx' */
3046 else if (ch >= 0x10000) {
3047 *p++ = '\\';
3048 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003049 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3050 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003056 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003057 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003058 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003059#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003060 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3061 else if (ch >= 0xD800 && ch < 0xDC00) {
3062 Py_UNICODE ch2;
3063 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003064
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003065 ch2 = *s++;
3066 size--;
3067 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3068 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3069 *p++ = '\\';
3070 *p++ = 'U';
3071 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3072 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3078 *p++ = hexdigit[ucs & 0x0000000F];
3079 continue;
3080 }
3081 /* Fall through: isolated surrogates are copied as-is */
3082 s--;
3083 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003084 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003085#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003088 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 *p++ = '\\';
3090 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003091 *p++ = hexdigit[(ch >> 12) & 0x000F];
3092 *p++ = hexdigit[(ch >> 8) & 0x000F];
3093 *p++ = hexdigit[(ch >> 4) & 0x000F];
3094 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003096
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003097 /* Map special whitespace to '\t', \n', '\r' */
3098 else if (ch == '\t') {
3099 *p++ = '\\';
3100 *p++ = 't';
3101 }
3102 else if (ch == '\n') {
3103 *p++ = '\\';
3104 *p++ = 'n';
3105 }
3106 else if (ch == '\r') {
3107 *p++ = '\\';
3108 *p++ = 'r';
3109 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003110
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003111 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003112 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003114 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003115 *p++ = hexdigit[(ch >> 4) & 0x000F];
3116 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003117 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003118
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 /* Copy everything else as-is */
3120 else
3121 *p++ = (char) ch;
3122 }
3123 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003124 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003127 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 return repr;
3130}
3131
3132PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003133 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134{
3135 return unicodeescape_string(s, size, 0);
3136}
3137
3138PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3139{
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 return NULL;
3143 }
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003145 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146}
3147
3148/* --- Raw Unicode Escape Codec ------------------------------------------- */
3149
3150PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003151 Py_ssize_t size,
3152 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003155 Py_ssize_t startinpos;
3156 Py_ssize_t endinpos;
3157 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 const char *end;
3161 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 PyObject *errorHandler = NULL;
3163 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003164
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 v = _PyUnicode_New(size);
3170 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003171 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003173 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 end = s + size;
3176 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003177 unsigned char c;
3178 Py_UCS4 x;
3179 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003180 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003182 /* Non-escape characters are interpreted as Unicode ordinals */
3183 if (*s != '\\') {
3184 *p++ = (unsigned char)*s++;
3185 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003186 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003187 startinpos = s-starts;
3188
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes if odd */
3191 bs = s;
3192 for (;s < end;) {
3193 if (*s != '\\')
3194 break;
3195 *p++ = (unsigned char)*s++;
3196 }
3197 if (((s - bs) & 1) == 0 ||
3198 s >= end ||
3199 (*s != 'u' && *s != 'U')) {
3200 continue;
3201 }
3202 p--;
3203 count = *s=='u' ? 4 : 8;
3204 s++;
3205
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos = p-PyUnicode_AS_UNICODE(v);
3208 for (x = 0, i = 0; i < count; ++i, ++s) {
3209 c = (unsigned char)*s;
3210 if (!isxdigit(c)) {
3211 endinpos = s-starts;
3212 if (unicode_decode_call_errorhandler(
3213 errors, &errorHandler,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts, size, &startinpos, &endinpos, &exc, &s,
3216 &v, &outpos, &p))
3217 goto onError;
3218 goto nextByte;
3219 }
3220 x = (x<<4) & ~0xF;
3221 if (c >= '0' && c <= '9')
3222 x += c - '0';
3223 else if (c >= 'a' && c <= 'f')
3224 x += 10 + c - 'a';
3225 else
3226 x += 10 + c - 'A';
3227 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003228 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003229 /* UCS-2 character */
3230 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003231 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 /* UCS-4 character. Either store directly, or as
3233 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003234#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003235 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003236#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003237 x -= 0x10000L;
3238 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003240#endif
3241 } else {
3242 endinpos = s-starts;
3243 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003250 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003251 nextByte:
3252 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003259
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003260 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return NULL;
3265}
3266
3267PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269{
3270 PyObject *repr;
3271 char *p;
3272 char *q;
3273
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003274 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003275#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003276 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003277#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003278 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003279#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003280
Neal Norwitze7d8be82008-07-31 17:17:14 +00003281 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003282 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003283
Neal Norwitze7d8be82008-07-31 17:17:14 +00003284 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 if (repr == NULL)
3286 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003287 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003288 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003290 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 while (size-- > 0) {
3292 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003293#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003296 *p++ = '\\';
3297 *p++ = 'U';
3298 *p++ = hexdigit[(ch >> 28) & 0xf];
3299 *p++ = hexdigit[(ch >> 24) & 0xf];
3300 *p++ = hexdigit[(ch >> 20) & 0xf];
3301 *p++ = hexdigit[(ch >> 16) & 0xf];
3302 *p++ = hexdigit[(ch >> 12) & 0xf];
3303 *p++ = hexdigit[(ch >> 8) & 0xf];
3304 *p++ = hexdigit[(ch >> 4) & 0xf];
3305 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003306 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003307 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003308#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch >= 0xD800 && ch < 0xDC00) {
3311 Py_UNICODE ch2;
3312 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003313
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 ch2 = *s++;
3315 size--;
3316 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318 *p++ = '\\';
3319 *p++ = 'U';
3320 *p++ = hexdigit[(ucs >> 28) & 0xf];
3321 *p++ = hexdigit[(ucs >> 24) & 0xf];
3322 *p++ = hexdigit[(ucs >> 20) & 0xf];
3323 *p++ = hexdigit[(ucs >> 16) & 0xf];
3324 *p++ = hexdigit[(ucs >> 12) & 0xf];
3325 *p++ = hexdigit[(ucs >> 8) & 0xf];
3326 *p++ = hexdigit[(ucs >> 4) & 0xf];
3327 *p++ = hexdigit[ucs & 0xf];
3328 continue;
3329 }
3330 /* Fall through: isolated surrogates are copied as-is */
3331 s--;
3332 size++;
3333 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003335 /* Map 16-bit characters to '\uxxxx' */
3336 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 *p++ = '\\';
3338 *p++ = 'u';
3339 *p++ = hexdigit[(ch >> 12) & 0xf];
3340 *p++ = hexdigit[(ch >> 8) & 0xf];
3341 *p++ = hexdigit[(ch >> 4) & 0xf];
3342 *p++ = hexdigit[ch & 15];
3343 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003344 /* Copy everything else as-is */
3345 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 *p++ = (char) ch;
3347 }
3348 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003349 if (_PyString_Resize(&repr, p - q))
3350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 return repr;
3352}
3353
3354PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3355{
3356 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003357 PyErr_BadArgument();
3358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 }
3360 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003361 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362}
3363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003364/* --- Unicode Internal Codec ------------------------------------------- */
3365
3366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003367 Py_ssize_t size,
3368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003369{
3370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 Py_ssize_t startinpos;
3372 Py_ssize_t endinpos;
3373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003374 PyUnicodeObject *v;
3375 Py_UNICODE *p;
3376 const char *end;
3377 const char *reason;
3378 PyObject *errorHandler = NULL;
3379 PyObject *exc = NULL;
3380
Neal Norwitzd43069c2006-01-08 01:12:10 +00003381#ifdef Py_UNICODE_WIDE
3382 Py_UNICODE unimax = PyUnicode_GetMax();
3383#endif
3384
Armin Rigo7ccbca92006-10-04 12:17:45 +00003385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3387 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003391 p = PyUnicode_AS_UNICODE(v);
3392 end = s + size;
3393
3394 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003395 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 /* We have to sanity check the raw data, otherwise doom looms for
3397 some malformed UCS-4 data. */
3398 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003399#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003400 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003402 end-s < Py_UNICODE_SIZE
3403 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003405 startinpos = s - starts;
3406 if (end-s < Py_UNICODE_SIZE) {
3407 endinpos = end-starts;
3408 reason = "truncated input";
3409 }
3410 else {
3411 endinpos = s - starts + Py_UNICODE_SIZE;
3412 reason = "illegal code point (> 0x10FFFF)";
3413 }
3414 outpos = p - PyUnicode_AS_UNICODE(v);
3415 if (unicode_decode_call_errorhandler(
3416 errors, &errorHandler,
3417 "unicode_internal", reason,
3418 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003419 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420 goto onError;
3421 }
3422 }
3423 else {
3424 p++;
3425 s += Py_UNICODE_SIZE;
3426 }
3427 }
3428
Martin v. Löwis412fb672006-04-13 06:34:32 +00003429 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003430 goto onError;
3431 Py_XDECREF(errorHandler);
3432 Py_XDECREF(exc);
3433 return (PyObject *)v;
3434
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003435 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003436 Py_XDECREF(v);
3437 Py_XDECREF(errorHandler);
3438 Py_XDECREF(exc);
3439 return NULL;
3440}
3441
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442/* --- Latin-1 Codec ------------------------------------------------------ */
3443
3444PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 Py_ssize_t size,
3446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447{
3448 PyUnicodeObject *v;
3449 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003450
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003452 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003453 Py_UNICODE r = *(unsigned char*)s;
3454 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003455 }
3456
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 v = _PyUnicode_New(size);
3458 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003459 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003461 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 p = PyUnicode_AS_UNICODE(v);
3463 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003466
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003467 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 Py_XDECREF(v);
3469 return NULL;
3470}
3471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472/* create or adjust a UnicodeEncodeError */
3473static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 const char *encoding,
3475 const Py_UNICODE *unicode, Py_ssize_t size,
3476 Py_ssize_t startpos, Py_ssize_t endpos,
3477 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003480 *exceptionObject = PyUnicodeEncodeError_Create(
3481 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 }
3483 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003484 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3485 goto onError;
3486 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3487 goto onError;
3488 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3489 goto onError;
3490 return;
3491 onError:
3492 Py_DECREF(*exceptionObject);
3493 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 }
3495}
3496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497/* raises a UnicodeEncodeError */
3498static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 const char *encoding,
3500 const Py_UNICODE *unicode, Py_ssize_t size,
3501 Py_ssize_t startpos, Py_ssize_t endpos,
3502 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503{
3504 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508}
3509
3510/* error handling callback helper:
3511 build arguments, call the callback and check the arguments,
3512 put the result into newpos and return the replacement string, which
3513 has to be freed by the caller */
3514static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 PyObject **errorHandler,
3516 const char *encoding, const char *reason,
3517 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3518 Py_ssize_t startpos, Py_ssize_t endpos,
3519 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003521 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522
3523 PyObject *restuple;
3524 PyObject *resunicode;
3525
3526 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 }
3531
3532 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003533 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536
3537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003542 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003543 Py_DECREF(restuple);
3544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 }
3546 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 &resunicode, newpos)) {
3548 Py_DECREF(restuple);
3549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 }
3551 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003553 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3555 Py_DECREF(restuple);
3556 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 Py_INCREF(resunicode);
3559 Py_DECREF(restuple);
3560 return resunicode;
3561}
3562
3563static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003564 Py_ssize_t size,
3565 const char *errors,
3566 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567{
3568 /* output object */
3569 PyObject *res;
3570 /* pointers to the beginning and end+1 of input */
3571 const Py_UNICODE *startp = p;
3572 const Py_UNICODE *endp = p + size;
3573 /* pointer to the beginning of the unencodable characters */
3574 /* const Py_UNICODE *badp = NULL; */
3575 /* pointer into the output */
3576 char *str;
3577 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003578 Py_ssize_t respos = 0;
3579 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003580 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3581 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 PyObject *errorHandler = NULL;
3583 PyObject *exc = NULL;
3584 /* the following variable is used for caching string comparisons
3585 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3586 int known_errorHandler = -1;
3587
3588 /* allocate enough for a simple encoding without
3589 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003590 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 if (res == NULL)
3592 goto onError;
3593 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003594 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003595 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 ressize = size;
3597
3598 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 /* can we encode this? */
3602 if (c<limit) {
3603 /* no overflow check, because we know that the space is enough */
3604 *str++ = (char)c;
3605 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003606 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003607 else {
3608 Py_ssize_t unicodepos = p-startp;
3609 Py_ssize_t requiredsize;
3610 PyObject *repunicode;
3611 Py_ssize_t repsize;
3612 Py_ssize_t newpos;
3613 Py_ssize_t respos;
3614 Py_UNICODE *uni2;
3615 /* startpos for collecting unencodable chars */
3616 const Py_UNICODE *collstart = p;
3617 const Py_UNICODE *collend = p;
3618 /* find all unecodable characters */
3619 while ((collend < endp) && ((*collend)>=limit))
3620 ++collend;
3621 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3622 if (known_errorHandler==-1) {
3623 if ((errors==NULL) || (!strcmp(errors, "strict")))
3624 known_errorHandler = 1;
3625 else if (!strcmp(errors, "replace"))
3626 known_errorHandler = 2;
3627 else if (!strcmp(errors, "ignore"))
3628 known_errorHandler = 3;
3629 else if (!strcmp(errors, "xmlcharrefreplace"))
3630 known_errorHandler = 4;
3631 else
3632 known_errorHandler = 0;
3633 }
3634 switch (known_errorHandler) {
3635 case 1: /* strict */
3636 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3637 goto onError;
3638 case 2: /* replace */
3639 while (collstart++<collend)
3640 *str++ = '?'; /* fall through */
3641 case 3: /* ignore */
3642 p = collend;
3643 break;
3644 case 4: /* xmlcharrefreplace */
3645 respos = str-PyString_AS_STRING(res);
3646 /* determine replacement size (temporarily (mis)uses p) */
3647 for (p = collstart, repsize = 0; p < collend; ++p) {
3648 if (*p<10)
3649 repsize += 2+1+1;
3650 else if (*p<100)
3651 repsize += 2+2+1;
3652 else if (*p<1000)
3653 repsize += 2+3+1;
3654 else if (*p<10000)
3655 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003656#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003657 else
3658 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003659#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 else if (*p<100000)
3661 repsize += 2+5+1;
3662 else if (*p<1000000)
3663 repsize += 2+6+1;
3664 else
3665 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003666#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 }
3668 requiredsize = respos+repsize+(endp-collend);
3669 if (requiredsize > ressize) {
3670 if (requiredsize<2*ressize)
3671 requiredsize = 2*ressize;
3672 if (_PyString_Resize(&res, requiredsize))
3673 goto onError;
3674 str = PyString_AS_STRING(res) + respos;
3675 ressize = requiredsize;
3676 }
3677 /* generate replacement (temporarily (mis)uses p) */
3678 for (p = collstart; p < collend; ++p) {
3679 str += sprintf(str, "&#%d;", (int)*p);
3680 }
3681 p = collend;
3682 break;
3683 default:
3684 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3685 encoding, reason, startp, size, &exc,
3686 collstart-startp, collend-startp, &newpos);
3687 if (repunicode == NULL)
3688 goto onError;
3689 /* need more space? (at least enough for what we have+the
3690 replacement+the rest of the string, so we won't have to
3691 check space for encodable characters) */
3692 respos = str-PyString_AS_STRING(res);
3693 repsize = PyUnicode_GET_SIZE(repunicode);
3694 requiredsize = respos+repsize+(endp-collend);
3695 if (requiredsize > ressize) {
3696 if (requiredsize<2*ressize)
3697 requiredsize = 2*ressize;
3698 if (_PyString_Resize(&res, requiredsize)) {
3699 Py_DECREF(repunicode);
3700 goto onError;
3701 }
3702 str = PyString_AS_STRING(res) + respos;
3703 ressize = requiredsize;
3704 }
3705 /* check if there is anything unencodable in the replacement
3706 and copy it to the output */
3707 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3708 c = *uni2;
3709 if (c >= limit) {
3710 raise_encode_exception(&exc, encoding, startp, size,
3711 unicodepos, unicodepos+1, reason);
3712 Py_DECREF(repunicode);
3713 goto onError;
3714 }
3715 *str = (char)c;
3716 }
3717 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003718 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003719 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003720 }
3721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003723 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003725 /* If this falls res will be NULL */
3726 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 Py_XDECREF(errorHandler);
3728 Py_XDECREF(exc);
3729 return res;
3730
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 Py_XDECREF(res);
3733 Py_XDECREF(errorHandler);
3734 Py_XDECREF(exc);
3735 return NULL;
3736}
3737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003739 Py_ssize_t size,
3740 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743}
3744
3745PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3746{
3747 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003748 PyErr_BadArgument();
3749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 }
3751 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003752 PyUnicode_GET_SIZE(unicode),
3753 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754}
3755
3756/* --- 7-bit ASCII Codec -------------------------------------------------- */
3757
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 Py_ssize_t size,
3760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
3764 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003765 Py_ssize_t startinpos;
3766 Py_ssize_t endinpos;
3767 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 const char *e;
3769 PyObject *errorHandler = NULL;
3770 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003771
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003773 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003774 Py_UNICODE r = *(unsigned char*)s;
3775 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003776 }
Tim Petersced69f82003-09-16 20:30:58 +00003777
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 v = _PyUnicode_New(size);
3779 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003782 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 e = s + size;
3785 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003786 register unsigned char c = (unsigned char)*s;
3787 if (c < 128) {
3788 *p++ = c;
3789 ++s;
3790 }
3791 else {
3792 startinpos = s-starts;
3793 endinpos = startinpos + 1;
3794 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3795 if (unicode_decode_call_errorhandler(
3796 errors, &errorHandler,
3797 "ascii", "ordinal not in range(128)",
3798 starts, size, &startinpos, &endinpos, &exc, &s,
3799 &v, &outpos, &p))
3800 goto onError;
3801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003803 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003804 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3805 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 Py_XDECREF(errorHandler);
3807 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003809
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003810 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 Py_XDECREF(errorHandler);
3813 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 return NULL;
3815}
3816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003818 Py_ssize_t size,
3819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822}
3823
3824PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3825{
3826 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003827 PyErr_BadArgument();
3828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 }
3830 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003831 PyUnicode_GET_SIZE(unicode),
3832 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833}
3834
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003835#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003836
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003837/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003838
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003839#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003840#define NEED_RETRY
3841#endif
3842
3843/* XXX This code is limited to "true" double-byte encodings, as
3844 a) it assumes an incomplete character consists of a single byte, and
3845 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003846 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003847
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back (it returns the same position), when the
   previous character does not start with a lead byte, or when the
   previous character is two bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return target - before == 2;
}
3858
3859/*
3860 * Decode MBCS string into unicode object. If 'final' is set, converts
3861 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 */
3863static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 const char *s, /* MBCS string */
3865 int size, /* sizeof MBCS string */
3866 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003867{
3868 Py_UNICODE *p;
3869 Py_ssize_t n = 0;
3870 int usize = 0;
3871
3872 assert(size >= 0);
3873
3874 /* Skip trailing lead-byte unless 'final' is set */
3875 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003876 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877
3878 /* First get the size of the result */
3879 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003880 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3881 if (usize == 0) {
3882 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3883 return -1;
3884 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003885 }
3886
3887 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003888 /* Create unicode object */
3889 *v = _PyUnicode_New(usize);
3890 if (*v == NULL)
3891 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 }
3893 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003894 /* Extend unicode object */
3895 n = PyUnicode_GET_SIZE(*v);
3896 if (_PyUnicode_Resize(v, n + usize) < 0)
3897 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003898 }
3899
3900 /* Do the conversion */
3901 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003902 p = PyUnicode_AS_UNICODE(*v) + n;
3903 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3904 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3905 return -1;
3906 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907 }
3908
3909 return size;
3910}
3911
3912PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003913 Py_ssize_t size,
3914 const char *errors,
3915 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003916{
3917 PyUnicodeObject *v = NULL;
3918 int done;
3919
3920 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003921 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922
3923#ifdef NEED_RETRY
3924 retry:
3925 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003926 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003927 else
3928#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930
3931 if (done < 0) {
3932 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003933 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003934 }
3935
3936 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003937 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003938
3939#ifdef NEED_RETRY
3940 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003941 s += done;
3942 size -= done;
3943 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003944 }
3945#endif
3946
3947 return (PyObject *)v;
3948}
3949
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003951 Py_ssize_t size,
3952 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003954 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3955}
3956
3957/*
3958 * Convert unicode into string object (MBCS).
3959 * Returns 0 if succeed, -1 otherwise.
3960 */
3961static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003962 const Py_UNICODE *p, /* unicode */
3963 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003964{
3965 int mbcssize = 0;
3966 Py_ssize_t n = 0;
3967
3968 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003969
3970 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003972 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3973 if (mbcssize == 0) {
3974 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3975 return -1;
3976 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003977 }
3978
Martin v. Löwisd8251432006-06-14 05:21:04 +00003979 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003980 /* Create string object */
3981 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3982 if (*repr == NULL)
3983 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003984 }
3985 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003986 /* Extend string object */
3987 n = PyString_Size(*repr);
3988 if (_PyString_Resize(repr, n + mbcssize) < 0)
3989 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990 }
3991
3992 /* Do the conversion */
3993 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003994 char *s = PyString_AS_STRING(*repr) + n;
3995 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3996 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3997 return -1;
3998 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 }
4000
4001 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004002}
4003
4004PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004005 Py_ssize_t size,
4006 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004007{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004008 PyObject *repr = NULL;
4009 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004010
Martin v. Löwisd8251432006-06-14 05:21:04 +00004011#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004013 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 else
4016#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004018
Martin v. Löwisd8251432006-06-14 05:21:04 +00004019 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 Py_XDECREF(repr);
4021 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004022 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004023
4024#ifdef NEED_RETRY
4025 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004026 p += INT_MAX;
4027 size -= INT_MAX;
4028 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004029 }
4030#endif
4031
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004032 return repr;
4033}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004034
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004035PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4036{
4037 if (!PyUnicode_Check(unicode)) {
4038 PyErr_BadArgument();
4039 return NULL;
4040 }
4041 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004042 PyUnicode_GET_SIZE(unicode),
4043 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004044}
4045
Martin v. Löwisd8251432006-06-14 05:21:04 +00004046#undef NEED_RETRY
4047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004048#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050/* --- Character Mapping Codec -------------------------------------------- */
4051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004053 Py_ssize_t size,
4054 PyObject *mapping,
4055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004058 Py_ssize_t startinpos;
4059 Py_ssize_t endinpos;
4060 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 PyUnicodeObject *v;
4063 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004064 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 PyObject *errorHandler = NULL;
4066 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004067 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 /* Default to Latin-1 */
4071 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073
4074 v = _PyUnicode_New(size);
4075 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004076 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004078 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004081 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004082 mapstring = PyUnicode_AS_UNICODE(mapping);
4083 maplen = PyUnicode_GET_SIZE(mapping);
4084 while (s < e) {
4085 unsigned char ch = *s;
4086 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004088 if (ch < maplen)
4089 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 if (x == 0xfffe) {
4092 /* undefined mapping */
4093 outpos = p-PyUnicode_AS_UNICODE(v);
4094 startinpos = s-starts;
4095 endinpos = startinpos+1;
4096 if (unicode_decode_call_errorhandler(
4097 errors, &errorHandler,
4098 "charmap", "character maps to <undefined>",
4099 starts, size, &startinpos, &endinpos, &exc, &s,
4100 &v, &outpos, &p)) {
4101 goto onError;
4102 }
4103 continue;
4104 }
4105 *p++ = x;
4106 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004107 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004108 }
4109 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004110 while (s < e) {
4111 unsigned char ch = *s;
4112 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004113
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004114 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4115 w = PyInt_FromLong((long)ch);
4116 if (w == NULL)
4117 goto onError;
4118 x = PyObject_GetItem(mapping, w);
4119 Py_DECREF(w);
4120 if (x == NULL) {
4121 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4122 /* No mapping found means: mapping is undefined. */
4123 PyErr_Clear();
4124 x = Py_None;
4125 Py_INCREF(x);
4126 } else
4127 goto onError;
4128 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004129
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004130 /* Apply mapping */
4131 if (PyInt_Check(x)) {
4132 long value = PyInt_AS_LONG(x);
4133 if (value < 0 || value > 65535) {
4134 PyErr_SetString(PyExc_TypeError,
4135 "character mapping must be in range(65536)");
4136 Py_DECREF(x);
4137 goto onError;
4138 }
4139 *p++ = (Py_UNICODE)value;
4140 }
4141 else if (x == Py_None) {
4142 /* undefined mapping */
4143 outpos = p-PyUnicode_AS_UNICODE(v);
4144 startinpos = s-starts;
4145 endinpos = startinpos+1;
4146 if (unicode_decode_call_errorhandler(
4147 errors, &errorHandler,
4148 "charmap", "character maps to <undefined>",
4149 starts, size, &startinpos, &endinpos, &exc, &s,
4150 &v, &outpos, &p)) {
4151 Py_DECREF(x);
4152 goto onError;
4153 }
4154 Py_DECREF(x);
4155 continue;
4156 }
4157 else if (PyUnicode_Check(x)) {
4158 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004159
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004160 if (targetsize == 1)
4161 /* 1-1 mapping */
4162 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004163
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004164 else if (targetsize > 1) {
4165 /* 1-n mapping */
4166 if (targetsize > extrachars) {
4167 /* resize first */
4168 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4169 Py_ssize_t needed = (targetsize - extrachars) + \
4170 (targetsize << 2);
4171 extrachars += needed;
4172 /* XXX overflow detection missing */
4173 if (_PyUnicode_Resize(&v,
4174 PyUnicode_GET_SIZE(v) + needed) < 0) {
4175 Py_DECREF(x);
4176 goto onError;
4177 }
4178 p = PyUnicode_AS_UNICODE(v) + oldpos;
4179 }
4180 Py_UNICODE_COPY(p,
4181 PyUnicode_AS_UNICODE(x),
4182 targetsize);
4183 p += targetsize;
4184 extrachars -= targetsize;
4185 }
4186 /* 1-0 mapping: skip the character */
4187 }
4188 else {
4189 /* wrong return value */
4190 PyErr_SetString(PyExc_TypeError,
4191 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004192 Py_DECREF(x);
4193 goto onError;
4194 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004195 Py_DECREF(x);
4196 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 }
4199 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004200 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4201 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 Py_XDECREF(errorHandler);
4203 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004205
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004206 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 Py_XDECREF(errorHandler);
4208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 Py_XDECREF(v);
4210 return NULL;
4211}
4212
Martin v. Löwis3f767792006-06-04 19:36:28 +00004213/* Charmap encoding: the lookup table */
4214
4215struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004216 PyObject_HEAD
4217 unsigned char level1[32];
4218 int count2, count3;
4219 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004220};
4221
4222static PyObject*
4223encoding_map_size(PyObject *obj, PyObject* args)
4224{
4225 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004226 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004227 128*map->count3);
4228}
4229
4230static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004231 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004232 PyDoc_STR("Return the size (in bytes) of this object") },
4233 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004234};
4235
4236static void
4237encoding_map_dealloc(PyObject* o)
4238{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004239 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004240}
4241
4242static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004243 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004244 "EncodingMap", /*tp_name*/
4245 sizeof(struct encoding_map), /*tp_basicsize*/
4246 0, /*tp_itemsize*/
4247 /* methods */
4248 encoding_map_dealloc, /*tp_dealloc*/
4249 0, /*tp_print*/
4250 0, /*tp_getattr*/
4251 0, /*tp_setattr*/
4252 0, /*tp_compare*/
4253 0, /*tp_repr*/
4254 0, /*tp_as_number*/
4255 0, /*tp_as_sequence*/
4256 0, /*tp_as_mapping*/
4257 0, /*tp_hash*/
4258 0, /*tp_call*/
4259 0, /*tp_str*/
4260 0, /*tp_getattro*/
4261 0, /*tp_setattro*/
4262 0, /*tp_as_buffer*/
4263 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4264 0, /*tp_doc*/
4265 0, /*tp_traverse*/
4266 0, /*tp_clear*/
4267 0, /*tp_richcompare*/
4268 0, /*tp_weaklistoffset*/
4269 0, /*tp_iter*/
4270 0, /*tp_iternext*/
4271 encoding_map_methods, /*tp_methods*/
4272 0, /*tp_members*/
4273 0, /*tp_getset*/
4274 0, /*tp_base*/
4275 0, /*tp_dict*/
4276 0, /*tp_descr_get*/
4277 0, /*tp_descr_set*/
4278 0, /*tp_dictoffset*/
4279 0, /*tp_init*/
4280 0, /*tp_alloc*/
4281 0, /*tp_new*/
4282 0, /*tp_free*/
4283 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004284};
4285
4286PyObject*
4287PyUnicode_BuildEncodingMap(PyObject* string)
4288{
4289 Py_UNICODE *decode;
4290 PyObject *result;
4291 struct encoding_map *mresult;
4292 int i;
4293 int need_dict = 0;
4294 unsigned char level1[32];
4295 unsigned char level2[512];
4296 unsigned char *mlevel1, *mlevel2, *mlevel3;
4297 int count2 = 0, count3 = 0;
4298
4299 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4300 PyErr_BadArgument();
4301 return NULL;
4302 }
4303 decode = PyUnicode_AS_UNICODE(string);
4304 memset(level1, 0xFF, sizeof level1);
4305 memset(level2, 0xFF, sizeof level2);
4306
4307 /* If there isn't a one-to-one mapping of NULL to \0,
4308 or if there are non-BMP characters, we need to use
4309 a mapping dictionary. */
4310 if (decode[0] != 0)
4311 need_dict = 1;
4312 for (i = 1; i < 256; i++) {
4313 int l1, l2;
4314 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004315#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004316 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004317#endif
4318 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319 need_dict = 1;
4320 break;
4321 }
4322 if (decode[i] == 0xFFFE)
4323 /* unmapped character */
4324 continue;
4325 l1 = decode[i] >> 11;
4326 l2 = decode[i] >> 7;
4327 if (level1[l1] == 0xFF)
4328 level1[l1] = count2++;
4329 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004330 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004331 }
4332
4333 if (count2 >= 0xFF || count3 >= 0xFF)
4334 need_dict = 1;
4335
4336 if (need_dict) {
4337 PyObject *result = PyDict_New();
4338 PyObject *key, *value;
4339 if (!result)
4340 return NULL;
4341 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004342 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004343 key = PyInt_FromLong(decode[i]);
4344 value = PyInt_FromLong(i);
4345 if (!key || !value)
4346 goto failed1;
4347 if (PyDict_SetItem(result, key, value) == -1)
4348 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004349 Py_DECREF(key);
4350 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004351 }
4352 return result;
4353 failed1:
4354 Py_XDECREF(key);
4355 Py_XDECREF(value);
4356 Py_DECREF(result);
4357 return NULL;
4358 }
4359
4360 /* Create a three-level trie */
4361 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4362 16*count2 + 128*count3 - 1);
4363 if (!result)
4364 return PyErr_NoMemory();
4365 PyObject_Init(result, &EncodingMapType);
4366 mresult = (struct encoding_map*)result;
4367 mresult->count2 = count2;
4368 mresult->count3 = count3;
4369 mlevel1 = mresult->level1;
4370 mlevel2 = mresult->level23;
4371 mlevel3 = mresult->level23 + 16*count2;
4372 memcpy(mlevel1, level1, 32);
4373 memset(mlevel2, 0xFF, 16*count2);
4374 memset(mlevel3, 0, 128*count3);
4375 count3 = 0;
4376 for (i = 1; i < 256; i++) {
4377 int o1, o2, o3, i2, i3;
4378 if (decode[i] == 0xFFFE)
4379 /* unmapped character */
4380 continue;
4381 o1 = decode[i]>>11;
4382 o2 = (decode[i]>>7) & 0xF;
4383 i2 = 16*mlevel1[o1] + o2;
4384 if (mlevel2[i2] == 0xFF)
4385 mlevel2[i2] = count3++;
4386 o3 = decode[i] & 0x7F;
4387 i3 = 128*mlevel2[i2] + o3;
4388 mlevel3[i3] = i;
4389 }
4390 return result;
4391}
4392
4393static int
4394encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4395{
4396 struct encoding_map *map = (struct encoding_map*)mapping;
4397 int l1 = c>>11;
4398 int l2 = (c>>7) & 0xF;
4399 int l3 = c & 0x7F;
4400 int i;
4401
4402#ifdef Py_UNICODE_WIDE
4403 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004404 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 }
4406#endif
4407 if (c == 0)
4408 return 0;
4409 /* level 1*/
4410 i = map->level1[l1];
4411 if (i == 0xFF) {
4412 return -1;
4413 }
4414 /* level 2*/
4415 i = map->level23[16*i+l2];
4416 if (i == 0xFF) {
4417 return -1;
4418 }
4419 /* level 3 */
4420 i = map->level23[16*map->count2 + 128*i + l3];
4421 if (i == 0) {
4422 return -1;
4423 }
4424 return i;
4425}
4426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427/* Lookup the character ch in the mapping. If the character
4428 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004429 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 PyObject *w = PyInt_FromLong((long)c);
4433 PyObject *x;
4434
4435 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 x = PyObject_GetItem(mapping, w);
4438 Py_DECREF(w);
4439 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004440 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4441 /* No mapping found means: mapping is undefined. */
4442 PyErr_Clear();
4443 x = Py_None;
4444 Py_INCREF(x);
4445 return x;
4446 } else
4447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004449 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004450 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004452 long value = PyInt_AS_LONG(x);
4453 if (value < 0 || value > 255) {
4454 PyErr_SetString(PyExc_TypeError,
4455 "character mapping must be in range(256)");
4456 Py_DECREF(x);
4457 return NULL;
4458 }
4459 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004461 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004462 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004464 /* wrong return value */
4465 PyErr_SetString(PyExc_TypeError,
4466 "character mapping must return integer, None or str");
4467 Py_DECREF(x);
4468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
4470}
4471
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472static int
4473charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4474{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004475 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4476 /* exponentially overallocate to minimize reallocations */
4477 if (requiredsize < 2*outsize)
4478 requiredsize = 2*outsize;
4479 if (_PyString_Resize(outobj, requiredsize)) {
4480 return 0;
4481 }
4482 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004483}
4484
/* Outcome of writing one character through a charmap:
   enc_SUCCESS   -- the character was encoded and written
   enc_FAILED    -- the mapping has no entry for the character
   enc_EXCEPTION -- a Python exception was raised */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488/* lookup the character, put the result in the output string and adjust
4489 various state variables. Reallocate the output string if not enough
4490 space is available. Return a new reference to the object that
4491 was put in the output buffer, or Py_None, if the mapping was undefined
4492 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004493 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004495charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004496 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004498 PyObject *rep;
4499 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004500 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501
Christian Heimese93237d2007-12-19 02:37:44 +00004502 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004503 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004504 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505 if (res == -1)
4506 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004507 if (outsize<requiredsize)
4508 if (!charmapencode_resize(outobj, outpos, requiredsize))
4509 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004510 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004511 outstart[(*outpos)++] = (char)res;
4512 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004513 }
4514
4515 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004517 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004518 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004519 Py_DECREF(rep);
4520 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004521 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 if (PyInt_Check(rep)) {
4523 Py_ssize_t requiredsize = *outpos+1;
4524 if (outsize<requiredsize)
4525 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4526 Py_DECREF(rep);
4527 return enc_EXCEPTION;
4528 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004529 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004530 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004531 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 else {
4533 const char *repchars = PyString_AS_STRING(rep);
4534 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4535 Py_ssize_t requiredsize = *outpos+repsize;
4536 if (outsize<requiredsize)
4537 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4538 Py_DECREF(rep);
4539 return enc_EXCEPTION;
4540 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004541 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004542 memcpy(outstart + *outpos, repchars, repsize);
4543 *outpos += repsize;
4544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 }
Georg Brandl9f167602006-06-04 21:46:16 +00004546 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004547 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548}
4549
4550/* handle an error in PyUnicode_EncodeCharmap
4551 Return 0 on success, -1 on error */
4552static
4553int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004556 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004557 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558{
4559 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004560 Py_ssize_t repsize;
4561 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 Py_UNICODE *uni2;
4563 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 Py_ssize_t collstartpos = *inpos;
4565 Py_ssize_t collendpos = *inpos+1;
4566 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 char *encoding = "charmap";
4568 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004569 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 /* find all unencodable characters */
4572 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004573 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004574 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004575 int res = encoding_map_lookup(p[collendpos], mapping);
4576 if (res != -1)
4577 break;
4578 ++collendpos;
4579 continue;
4580 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004581
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004582 rep = charmapencode_lookup(p[collendpos], mapping);
4583 if (rep==NULL)
4584 return -1;
4585 else if (rep!=Py_None) {
4586 Py_DECREF(rep);
4587 break;
4588 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004589 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004590 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 }
4592 /* cache callback name lookup
4593 * (if not done yet, i.e. it's the first error) */
4594 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004595 if ((errors==NULL) || (!strcmp(errors, "strict")))
4596 *known_errorHandler = 1;
4597 else if (!strcmp(errors, "replace"))
4598 *known_errorHandler = 2;
4599 else if (!strcmp(errors, "ignore"))
4600 *known_errorHandler = 3;
4601 else if (!strcmp(errors, "xmlcharrefreplace"))
4602 *known_errorHandler = 4;
4603 else
4604 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 }
4606 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004607 case 1: /* strict */
4608 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4609 return -1;
4610 case 2: /* replace */
4611 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004612 x = charmapencode_output('?', mapping, res, respos);
4613 if (x==enc_EXCEPTION) {
4614 return -1;
4615 }
4616 else if (x==enc_FAILED) {
4617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4618 return -1;
4619 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004620 }
4621 /* fall through */
4622 case 3: /* ignore */
4623 *inpos = collendpos;
4624 break;
4625 case 4: /* xmlcharrefreplace */
4626 /* generate replacement (temporarily (mis)uses p) */
4627 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004628 char buffer[2+29+1+1];
4629 char *cp;
4630 sprintf(buffer, "&#%d;", (int)p[collpos]);
4631 for (cp = buffer; *cp; ++cp) {
4632 x = charmapencode_output(*cp, mapping, res, respos);
4633 if (x==enc_EXCEPTION)
4634 return -1;
4635 else if (x==enc_FAILED) {
4636 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4637 return -1;
4638 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004639 }
4640 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004641 *inpos = collendpos;
4642 break;
4643 default:
4644 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004645 encoding, reason, p, size, exceptionObject,
4646 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004647 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004648 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004649 /* generate replacement */
4650 repsize = PyUnicode_GET_SIZE(repunicode);
4651 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004652 x = charmapencode_output(*uni2, mapping, res, respos);
4653 if (x==enc_EXCEPTION) {
4654 return -1;
4655 }
4656 else if (x==enc_FAILED) {
4657 Py_DECREF(repunicode);
4658 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4659 return -1;
4660 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004661 }
4662 *inpos = newpos;
4663 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 }
4665 return 0;
4666}
4667
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004669 Py_ssize_t size,
4670 PyObject *mapping,
4671 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 /* output object */
4674 PyObject *res = NULL;
4675 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004678 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 PyObject *errorHandler = NULL;
4680 PyObject *exc = NULL;
4681 /* the following variable is used for caching string comparisons
4682 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4683 * 3=ignore, 4=xmlcharrefreplace */
4684 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686 /* Default to Latin-1 */
4687 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004688 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 /* allocate enough for a simple encoding without
4691 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004692 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 if (res == NULL)
4694 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004695 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004696 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004699 /* try to encode it */
4700 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4701 if (x==enc_EXCEPTION) /* error */
4702 goto onError;
4703 if (x==enc_FAILED) { /* unencodable character */
4704 if (charmap_encoding_error(p, size, &inpos, mapping,
4705 &exc,
4706 &known_errorHandler, &errorHandler, errors,
4707 &res, &respos)) {
4708 goto onError;
4709 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004710 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004711 else
4712 /* done with this character => adjust input position */
4713 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004717 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004718 if (_PyString_Resize(&res, respos))
4719 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 }
4721 Py_XDECREF(exc);
4722 Py_XDECREF(errorHandler);
4723 return res;
4724
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 Py_XDECREF(res);
4727 Py_XDECREF(exc);
4728 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 return NULL;
4730}
4731
4732PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004733 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734{
4735 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004736 PyErr_BadArgument();
4737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 }
4739 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 PyUnicode_GET_SIZE(unicode),
4741 mapping,
4742 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743}
4744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745/* create or adjust a UnicodeTranslateError */
4746static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 const Py_UNICODE *unicode, Py_ssize_t size,
4748 Py_ssize_t startpos, Py_ssize_t endpos,
4749 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004752 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004753 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 }
4755 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004756 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4757 goto onError;
4758 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4759 goto onError;
4760 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4761 goto onError;
4762 return;
4763 onError:
4764 Py_DECREF(*exceptionObject);
4765 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 }
4767}
4768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769/* raises a UnicodeTranslateError */
4770static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004771 const Py_UNICODE *unicode, Py_ssize_t size,
4772 Py_ssize_t startpos, Py_ssize_t endpos,
4773 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774{
4775 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004776 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004778 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779}
4780
4781/* error handling callback helper:
4782 build arguments, call the callback and check the arguments,
4783 put the result into newpos and return the replacement string, which
4784 has to be freed by the caller */
4785static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004786 PyObject **errorHandler,
4787 const char *reason,
4788 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4789 Py_ssize_t startpos, Py_ssize_t endpos,
4790 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004792 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793
Martin v. Löwis412fb672006-04-13 06:34:32 +00004794 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 PyObject *restuple;
4796 PyObject *resunicode;
4797
4798 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 }
4803
4804 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004805 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004807 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808
4809 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004814 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 Py_DECREF(restuple);
4816 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 }
4818 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 &resunicode, &i_newpos)) {
4820 Py_DECREF(restuple);
4821 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004824 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004825 else
4826 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004827 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4829 Py_DECREF(restuple);
4830 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 Py_INCREF(resunicode);
4833 Py_DECREF(restuple);
4834 return resunicode;
4835}
4836
4837/* Lookup the character ch in the mapping and put the result in result,
4838 which must be decrefed by the caller.
4839 Return 0 on success, -1 on error */
4840static
4841int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4842{
4843 PyObject *w = PyInt_FromLong((long)c);
4844 PyObject *x;
4845
4846 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 x = PyObject_GetItem(mapping, w);
4849 Py_DECREF(w);
4850 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004851 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4852 /* No mapping found means: use 1:1 mapping. */
4853 PyErr_Clear();
4854 *result = NULL;
4855 return 0;
4856 } else
4857 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 }
4859 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004860 *result = x;
4861 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 }
4863 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004864 long value = PyInt_AS_LONG(x);
4865 long max = PyUnicode_GetMax();
4866 if (value < 0 || value > max) {
4867 PyErr_Format(PyExc_TypeError,
4868 "character mapping must be in range(0x%lx)", max+1);
4869 Py_DECREF(x);
4870 return -1;
4871 }
4872 *result = x;
4873 return 0;
4874 }
4875 else if (PyUnicode_Check(x)) {
4876 *result = x;
4877 return 0;
4878 }
4879 else {
4880 /* wrong return value */
4881 PyErr_SetString(PyExc_TypeError,
4882 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004883 Py_DECREF(x);
4884 return -1;
4885 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886}
4887/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004888 if not reallocate and adjust various state variables.
4889 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890static
Walter Dörwald4894c302003-10-24 14:25:28 +00004891int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004892 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004894 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004895 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004896 /* remember old output position */
4897 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4898 /* exponentially overallocate to minimize reallocations */
4899 if (requiredsize < 2 * oldsize)
4900 requiredsize = 2 * oldsize;
4901 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4902 return -1;
4903 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 }
4905 return 0;
4906}
4907/* lookup the character, put the result in the output string and adjust
4908 various state variables. Return a new reference to the object that
4909 was put in the output buffer in *result, or Py_None, if the mapping was
4910 undefined (in which case no character was written).
4911 The called must decref result.
4912 Return 0 on success, -1 on error. */
4913static
Walter Dörwald4894c302003-10-24 14:25:28 +00004914int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004915 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4916 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917{
Walter Dörwald4894c302003-10-24 14:25:28 +00004918 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004919 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004921 /* not found => default to 1:1 mapping */
4922 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 }
4924 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004925 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 /* no overflow check, because we know that the space is enough */
4928 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 }
4930 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004931 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4932 if (repsize==1) {
4933 /* no overflow check, because we know that the space is enough */
4934 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4935 }
4936 else if (repsize!=0) {
4937 /* more than one character */
4938 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4939 (insize - (curinp-startinp)) +
4940 repsize - 1;
4941 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4942 return -1;
4943 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4944 *outp += repsize;
4945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 }
4947 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004948 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 return 0;
4950}
4951
4952PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004953 Py_ssize_t size,
4954 PyObject *mapping,
4955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 /* output object */
4958 PyObject *res = NULL;
4959 /* pointers to the beginning and end+1 of input */
4960 const Py_UNICODE *startp = p;
4961 const Py_UNICODE *endp = p + size;
4962 /* pointer into the output */
4963 Py_UNICODE *str;
4964 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004965 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 char *reason = "character maps to <undefined>";
4967 PyObject *errorHandler = NULL;
4968 PyObject *exc = NULL;
4969 /* the following variable is used for caching string comparisons
4970 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4971 * 3=ignore, 4=xmlcharrefreplace */
4972 int known_errorHandler = -1;
4973
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004975 PyErr_BadArgument();
4976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978
4979 /* allocate enough for a simple 1:1 translation without
4980 replacements, if we need more, we'll resize */
4981 res = PyUnicode_FromUnicode(NULL, size);
4982 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004983 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004985 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004989 /* try to encode it */
4990 PyObject *x = NULL;
4991 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4992 Py_XDECREF(x);
4993 goto onError;
4994 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004995 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 if (x!=Py_None) /* it worked => adjust input pointer */
4997 ++p;
4998 else { /* untranslatable character */
4999 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5000 Py_ssize_t repsize;
5001 Py_ssize_t newpos;
5002 Py_UNICODE *uni2;
5003 /* startpos for collecting untranslatable chars */
5004 const Py_UNICODE *collstart = p;
5005 const Py_UNICODE *collend = p+1;
5006 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005008 /* find all untranslatable characters */
5009 while (collend < endp) {
5010 if (charmaptranslate_lookup(*collend, mapping, &x))
5011 goto onError;
5012 Py_XDECREF(x);
5013 if (x!=Py_None)
5014 break;
5015 ++collend;
5016 }
5017 /* cache callback name lookup
5018 * (if not done yet, i.e. it's the first error) */
5019 if (known_errorHandler==-1) {
5020 if ((errors==NULL) || (!strcmp(errors, "strict")))
5021 known_errorHandler = 1;
5022 else if (!strcmp(errors, "replace"))
5023 known_errorHandler = 2;
5024 else if (!strcmp(errors, "ignore"))
5025 known_errorHandler = 3;
5026 else if (!strcmp(errors, "xmlcharrefreplace"))
5027 known_errorHandler = 4;
5028 else
5029 known_errorHandler = 0;
5030 }
5031 switch (known_errorHandler) {
5032 case 1: /* strict */
5033 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005034 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005035 case 2: /* replace */
5036 /* No need to check for space, this is a 1:1 replacement */
5037 for (coll = collstart; coll<collend; ++coll)
5038 *str++ = '?';
5039 /* fall through */
5040 case 3: /* ignore */
5041 p = collend;
5042 break;
5043 case 4: /* xmlcharrefreplace */
5044 /* generate replacement (temporarily (mis)uses p) */
5045 for (p = collstart; p < collend; ++p) {
5046 char buffer[2+29+1+1];
5047 char *cp;
5048 sprintf(buffer, "&#%d;", (int)*p);
5049 if (charmaptranslate_makespace(&res, &str,
5050 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5051 goto onError;
5052 for (cp = buffer; *cp; ++cp)
5053 *str++ = *cp;
5054 }
5055 p = collend;
5056 break;
5057 default:
5058 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5059 reason, startp, size, &exc,
5060 collstart-startp, collend-startp, &newpos);
5061 if (repunicode == NULL)
5062 goto onError;
5063 /* generate replacement */
5064 repsize = PyUnicode_GET_SIZE(repunicode);
5065 if (charmaptranslate_makespace(&res, &str,
5066 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5067 Py_DECREF(repunicode);
5068 goto onError;
5069 }
5070 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5071 *str++ = *uni2;
5072 p = startp + newpos;
5073 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005074 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005075 }
5076 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 /* Resize if we allocated to much */
5078 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005079 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005080 if (PyUnicode_Resize(&res, respos) < 0)
5081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 }
5083 Py_XDECREF(exc);
5084 Py_XDECREF(errorHandler);
5085 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005087 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 Py_XDECREF(res);
5089 Py_XDECREF(exc);
5090 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 return NULL;
5092}
5093
5094PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005095 PyObject *mapping,
5096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097{
5098 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005099
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 str = PyUnicode_FromObject(str);
5101 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005104 PyUnicode_GET_SIZE(str),
5105 mapping,
5106 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 Py_DECREF(str);
5108 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005109
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005110 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 Py_XDECREF(str);
5112 return NULL;
5113}
Tim Petersced69f82003-09-16 20:30:58 +00005114
Guido van Rossum9e896b32000-04-05 20:11:21 +00005115/* --- Decimal Encoder ---------------------------------------------------- */
5116
5117int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005118 Py_ssize_t length,
5119 char *output,
5120 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005121{
5122 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 PyObject *errorHandler = NULL;
5124 PyObject *exc = NULL;
5125 const char *encoding = "decimal";
5126 const char *reason = "invalid decimal Unicode string";
5127 /* the following variable is used for caching string comparisons
5128 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5129 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005130
5131 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005132 PyErr_BadArgument();
5133 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005134 }
5135
5136 p = s;
5137 end = s + length;
5138 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 register Py_UNICODE ch = *p;
5140 int decimal;
5141 PyObject *repunicode;
5142 Py_ssize_t repsize;
5143 Py_ssize_t newpos;
5144 Py_UNICODE *uni2;
5145 Py_UNICODE *collstart;
5146 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005147
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005148 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005149 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005150 ++p;
5151 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005152 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 decimal = Py_UNICODE_TODECIMAL(ch);
5154 if (decimal >= 0) {
5155 *output++ = '0' + decimal;
5156 ++p;
5157 continue;
5158 }
5159 if (0 < ch && ch < 256) {
5160 *output++ = (char)ch;
5161 ++p;
5162 continue;
5163 }
5164 /* All other characters are considered unencodable */
5165 collstart = p;
5166 collend = p+1;
5167 while (collend < end) {
5168 if ((0 < *collend && *collend < 256) ||
5169 !Py_UNICODE_ISSPACE(*collend) ||
5170 Py_UNICODE_TODECIMAL(*collend))
5171 break;
5172 }
5173 /* cache callback name lookup
5174 * (if not done yet, i.e. it's the first error) */
5175 if (known_errorHandler==-1) {
5176 if ((errors==NULL) || (!strcmp(errors, "strict")))
5177 known_errorHandler = 1;
5178 else if (!strcmp(errors, "replace"))
5179 known_errorHandler = 2;
5180 else if (!strcmp(errors, "ignore"))
5181 known_errorHandler = 3;
5182 else if (!strcmp(errors, "xmlcharrefreplace"))
5183 known_errorHandler = 4;
5184 else
5185 known_errorHandler = 0;
5186 }
5187 switch (known_errorHandler) {
5188 case 1: /* strict */
5189 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5190 goto onError;
5191 case 2: /* replace */
5192 for (p = collstart; p < collend; ++p)
5193 *output++ = '?';
5194 /* fall through */
5195 case 3: /* ignore */
5196 p = collend;
5197 break;
5198 case 4: /* xmlcharrefreplace */
5199 /* generate replacement (temporarily (mis)uses p) */
5200 for (p = collstart; p < collend; ++p)
5201 output += sprintf(output, "&#%d;", (int)*p);
5202 p = collend;
5203 break;
5204 default:
5205 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5206 encoding, reason, s, length, &exc,
5207 collstart-s, collend-s, &newpos);
5208 if (repunicode == NULL)
5209 goto onError;
5210 /* generate replacement */
5211 repsize = PyUnicode_GET_SIZE(repunicode);
5212 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5213 Py_UNICODE ch = *uni2;
5214 if (Py_UNICODE_ISSPACE(ch))
5215 *output++ = ' ';
5216 else {
5217 decimal = Py_UNICODE_TODECIMAL(ch);
5218 if (decimal >= 0)
5219 *output++ = '0' + decimal;
5220 else if (0 < ch && ch < 256)
5221 *output++ = (char)ch;
5222 else {
5223 Py_DECREF(repunicode);
5224 raise_encode_exception(&exc, encoding,
5225 s, length, collstart-s, collend-s, reason);
5226 goto onError;
5227 }
5228 }
5229 }
5230 p = s + newpos;
5231 Py_DECREF(repunicode);
5232 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005233 }
5234 /* 0-terminate the output string */
5235 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 Py_XDECREF(exc);
5237 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005238 return 0;
5239
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 Py_XDECREF(exc);
5242 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005243 return -1;
5244}
5245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246/* --- Helpers ------------------------------------------------------------ */
5247
Eric Smitha9f7d622008-02-17 19:46:49 +00005248#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005249#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005250
5251#include "stringlib/count.h"
5252#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005253#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005254#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005255
/* Helper macro to fix up start/end slice values in place.

   `end` is capped at `len`; a negative `end` or `start` is taken relative
   to `len` and then floored at 0.  `start` is NOT capped at `len`, so the
   caller may still observe start > end afterwards.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so pass only side-effect-free expressions.

   The body is wrapped in do { ... } while (0) so the macro expands to a
   single statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005270
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005272 PyObject *substr,
5273 Py_ssize_t start,
5274 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005277 PyUnicodeObject* str_obj;
5278 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005279
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005280 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5281 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005282 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005283 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5284 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005285 Py_DECREF(str_obj);
5286 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 }
Tim Petersced69f82003-09-16 20:30:58 +00005288
Antoine Pitrou64672132010-01-13 07:55:48 +00005289 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005290 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005291 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5292 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005293 );
5294
5295 Py_DECREF(sub_obj);
5296 Py_DECREF(str_obj);
5297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 return result;
5299}
5300
Martin v. Löwis18e16552006-02-15 17:27:45 +00005301Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005302 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005303 Py_ssize_t start,
5304 Py_ssize_t end,
5305 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005307 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005308
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005309 str = PyUnicode_FromObject(str);
5310 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005311 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005312 sub = PyUnicode_FromObject(sub);
5313 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005314 Py_DECREF(str);
5315 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 }
Tim Petersced69f82003-09-16 20:30:58 +00005317
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005318 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005319 result = stringlib_find_slice(
5320 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5321 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5322 start, end
5323 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005324 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005325 result = stringlib_rfind_slice(
5326 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5327 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5328 start, end
5329 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005330
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005331 Py_DECREF(str);
5332 Py_DECREF(sub);
5333
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 return result;
5335}
5336
Tim Petersced69f82003-09-16 20:30:58 +00005337static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005339 PyUnicodeObject *substring,
5340 Py_ssize_t start,
5341 Py_ssize_t end,
5342 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 if (substring->length == 0)
5345 return 1;
5346
Antoine Pitrou64672132010-01-13 07:55:48 +00005347 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 end -= substring->length;
5349 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
5352 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005353 if (Py_UNICODE_MATCH(self, end, substring))
5354 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 } else {
5356 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005357 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
5359
5360 return 0;
5361}
5362
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 str = PyUnicode_FromObject(str);
5372 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005373 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 substr = PyUnicode_FromObject(substr);
5375 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 Py_DECREF(str);
5377 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 }
Tim Petersced69f82003-09-16 20:30:58 +00005379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 (PyUnicodeObject *)substr,
5382 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 Py_DECREF(str);
5384 Py_DECREF(substr);
5385 return result;
5386}
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388/* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5390
Tim Petersced69f82003-09-16 20:30:58 +00005391static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005393 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394{
5395
5396 PyUnicodeObject *u;
5397
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005400 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005401
5402 Py_UNICODE_COPY(u->str, self->str, self->length);
5403
Tim Peters7a29bd52001-09-12 03:03:31 +00005404 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5408 Py_INCREF(self);
5409 Py_DECREF(u);
5410 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 }
5412 return (PyObject*) u;
5413}
5414
Tim Petersced69f82003-09-16 20:30:58 +00005415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416int fixupper(PyUnicodeObject *self)
5417{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 Py_UNICODE *s = self->str;
5420 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005423 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 ch = Py_UNICODE_TOUPPER(*s);
5426 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005428 *s = ch;
5429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 s++;
5431 }
5432
5433 return status;
5434}
5435
Tim Petersced69f82003-09-16 20:30:58 +00005436static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437int fixlower(PyUnicodeObject *self)
5438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 Py_UNICODE *s = self->str;
5441 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005444 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 ch = Py_UNICODE_TOLOWER(*s);
5447 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 *s = ch;
5450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 s++;
5452 }
5453
5454 return status;
5455}
5456
Tim Petersced69f82003-09-16 20:30:58 +00005457static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458int fixswapcase(PyUnicodeObject *self)
5459{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005460 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_UNICODE *s = self->str;
5462 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 while (len-- > 0) {
5465 if (Py_UNICODE_ISUPPER(*s)) {
5466 *s = Py_UNICODE_TOLOWER(*s);
5467 status = 1;
5468 } else if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
5471 }
5472 s++;
5473 }
5474
5475 return status;
5476}
5477
Tim Petersced69f82003-09-16 20:30:58 +00005478static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479int fixcapitalize(PyUnicodeObject *self)
5480{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005482 Py_UNICODE *s = self->str;
5483 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005484
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005485 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005486 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005487 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005488 *s = Py_UNICODE_TOUPPER(*s);
5489 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005491 s++;
5492 while (--len > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5496 }
5497 s++;
5498 }
5499 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500}
5501
5502static
5503int fixtitle(PyUnicodeObject *self)
5504{
5505 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506 register Py_UNICODE *e;
5507 int previous_is_cased;
5508
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005511 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512 if (*p != ch) {
5513 *p = ch;
5514 return 1;
5515 }
5516 else
5517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 }
Tim Petersced69f82003-09-16 20:30:58 +00005519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 e = p + PyUnicode_GET_SIZE(self);
5521 previous_is_cased = 0;
5522 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005523 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005524
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005525 if (previous_is_cased)
5526 *p = Py_UNICODE_TOLOWER(ch);
5527 else
5528 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005529
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005530 if (Py_UNICODE_ISLOWER(ch) ||
5531 Py_UNICODE_ISUPPER(ch) ||
5532 Py_UNICODE_ISTITLE(ch))
5533 previous_is_cased = 1;
5534 else
5535 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 }
5537 return 1;
5538}
5539
Tim Peters8ce9f162004-08-27 01:49:32 +00005540PyObject *
5541PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Tim Peters8ce9f162004-08-27 01:49:32 +00005543 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005544 const Py_UNICODE blank = ' ';
5545 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005546 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005547 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005548 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5551 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005554 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Tim Peters05eba1f2004-08-27 21:32:02 +00005556 fseq = PySequence_Fast(seq, "");
5557 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005558 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005559 }
5560
Tim Peters91879ab2004-08-27 22:35:44 +00005561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5566 * is invariant.
5567 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005568 seqlen = PySequence_Fast_GET_SIZE(fseq);
5569 /* If empty sequence, return u"". */
5570 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005571 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5572 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005573 }
5574 /* If singleton sequence with an exact Unicode, return that. */
5575 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005576 item = PySequence_Fast_GET_ITEM(fseq, 0);
5577 if (PyUnicode_CheckExact(item)) {
5578 Py_INCREF(item);
5579 res = (PyUnicodeObject *)item;
5580 goto Done;
5581 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005582 }
5583
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 /* At least two items to join, or one that isn't exact Unicode. */
5585 if (seqlen > 1) {
5586 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005587 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005588 sep = &blank;
5589 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005591 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005592 internal_separator = PyUnicode_FromObject(separator);
5593 if (internal_separator == NULL)
5594 goto onError;
5595 sep = PyUnicode_AS_UNICODE(internal_separator);
5596 seplen = PyUnicode_GET_SIZE(internal_separator);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005599 }
5600 }
5601
5602 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005603 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005604 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005605 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 res_p = PyUnicode_AS_UNICODE(res);
5607 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005608
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005610 Py_ssize_t itemlen;
5611 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005612
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 item = PySequence_Fast_GET_ITEM(fseq, i);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616 PyErr_Format(PyExc_TypeError,
5617 "sequence item %zd: expected string or Unicode,"
5618 " %.80s found",
5619 i, Py_TYPE(item)->tp_name);
5620 goto onError;
5621 }
5622 item = PyUnicode_FromObject(item);
5623 if (item == NULL)
5624 goto onError;
5625 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005626
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005629
Tim Peters8ce9f162004-08-27 01:49:32 +00005630 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005631 itemlen = PyUnicode_GET_SIZE(item);
5632 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005633 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 goto Overflow;
5635 if (i < seqlen - 1) {
5636 new_res_used += seplen;
5637 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005638 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005639 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005640 if (new_res_used > res_alloc) {
5641 /* double allocated size until it's big enough */
5642 do {
5643 res_alloc += res_alloc;
5644 if (res_alloc <= 0)
5645 goto Overflow;
5646 } while (new_res_used > res_alloc);
5647 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648 Py_DECREF(item);
5649 goto onError;
5650 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005652 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005653
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656 res_p += itemlen;
5657 if (i < seqlen - 1) {
5658 Py_UNICODE_COPY(res_p, sep, seplen);
5659 res_p += seplen;
5660 }
5661 Py_DECREF(item);
5662 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005663 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005664
Tim Peters05eba1f2004-08-27 21:32:02 +00005665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5667 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005668 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005669 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005670
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005671 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005672 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 return (PyObject *)res;
5675
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005676 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005677 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005678 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 Py_DECREF(item);
5680 /* fall through */
5681
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005682 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005683 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005684 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 return NULL;
5687}
5688
Tim Petersced69f82003-09-16 20:30:58 +00005689static
5690PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005691 Py_ssize_t left,
5692 Py_ssize_t right,
5693 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
5695 PyUnicodeObject *u;
5696
5697 if (left < 0)
5698 left = 0;
5699 if (right < 0)
5700 right = 0;
5701
Tim Peters7a29bd52001-09-12 03:03:31 +00005702 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 Py_INCREF(self);
5704 return self;
5705 }
5706
Neal Norwitze7d8be82008-07-31 17:17:14 +00005707 if (left > PY_SSIZE_T_MAX - self->length ||
5708 right > PY_SSIZE_T_MAX - (left + self->length)) {
5709 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710 return NULL;
5711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 u = _PyUnicode_New(left + self->length + right);
5713 if (u) {
5714 if (left)
5715 Py_UNICODE_FILL(u->str, fill, left);
5716 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717 if (right)
5718 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5719 }
5720
5721 return u;
5722}
5723
Antoine Pitrou64672132010-01-13 07:55:48 +00005724PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 string = PyUnicode_FromObject(string);
5729 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731
Antoine Pitrou64672132010-01-13 07:55:48 +00005732 list = stringlib_splitlines(
5733 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5734 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736 Py_DECREF(string);
5737 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738}
5739
Tim Petersced69f82003-09-16 20:30:58 +00005740static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005742 PyUnicodeObject *substring,
5743 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005746 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005749 return stringlib_split_whitespace(
5750 (PyObject*) self, self->str, self->length, maxcount
5751 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Antoine Pitrou64672132010-01-13 07:55:48 +00005753 return stringlib_split(
5754 (PyObject*) self, self->str, self->length,
5755 substring->str, substring->length,
5756 maxcount
5757 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
Tim Petersced69f82003-09-16 20:30:58 +00005760static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005762 PyUnicodeObject *substring,
5763 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005764{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005766 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005769 return stringlib_rsplit_whitespace(
5770 (PyObject*) self, self->str, self->length, maxcount
5771 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772
Antoine Pitrou64672132010-01-13 07:55:48 +00005773 return stringlib_rsplit(
5774 (PyObject*) self, self->str, self->length,
5775 substring->str, substring->length,
5776 maxcount
5777 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778}
5779
5780static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005782 PyUnicodeObject *str1,
5783 PyUnicodeObject *str2,
5784 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785{
5786 PyUnicodeObject *u;
5787
5788 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005789 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005790 else if (maxcount == 0 || self->length == 0)
5791 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Fredrik Lundh347ee272006-05-24 16:35:18 +00005793 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005794 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005795 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005796 if (str1->length == 0)
5797 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005798 if (str1->length == 1) {
5799 /* replace characters */
5800 Py_UNICODE u1, u2;
5801 if (!findchar(self->str, self->length, str1->str[0]))
5802 goto nothing;
5803 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5804 if (!u)
5805 return NULL;
5806 Py_UNICODE_COPY(u->str, self->str, self->length);
5807 u1 = str1->str[0];
5808 u2 = str2->str[0];
5809 for (i = 0; i < u->length; i++)
5810 if (u->str[i] == u1) {
5811 if (--maxcount < 0)
5812 break;
5813 u->str[i] = u2;
5814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005816 i = stringlib_find(
5817 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005819 if (i < 0)
5820 goto nothing;
5821 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5822 if (!u)
5823 return NULL;
5824 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005825
5826 /* change everything in-place, starting with this one */
5827 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5828 i += str1->length;
5829
5830 while ( --maxcount > 0) {
5831 i = stringlib_find(self->str+i, self->length-i,
5832 str1->str, str1->length,
5833 i);
5834 if (i == -1)
5835 break;
5836 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5837 i += str1->length;
5838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005841
Brett Cannona7f13ee2010-05-04 01:16:51 +00005842 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005843 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_UNICODE *p;
5845
5846 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005847 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5848 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005849 if (n == 0)
5850 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005851 /* new_size = self->length + n * (str2->length - str1->length)); */
5852 delta = (str2->length - str1->length);
5853 if (delta == 0) {
5854 new_size = self->length;
5855 } else {
5856 product = n * (str2->length - str1->length);
5857 if ((product / (str2->length - str1->length)) != n) {
5858 PyErr_SetString(PyExc_OverflowError,
5859 "replace string is too long");
5860 return NULL;
5861 }
5862 new_size = self->length + product;
5863 if (new_size < 0) {
5864 PyErr_SetString(PyExc_OverflowError,
5865 "replace string is too long");
5866 return NULL;
5867 }
5868 }
5869 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005870 if (!u)
5871 return NULL;
5872 i = 0;
5873 p = u->str;
5874 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005875 while (n-- > 0) {
5876 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005877 j = stringlib_find(self->str+i, self->length-i,
5878 str1->str, str1->length,
5879 i);
5880 if (j == -1)
5881 break;
5882 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005883 /* copy unchanged part [i:j] */
5884 Py_UNICODE_COPY(p, self->str+i, j-i);
5885 p += j - i;
5886 }
5887 /* copy substitution string */
5888 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005889 Py_UNICODE_COPY(p, str2->str, str2->length);
5890 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005891 }
5892 i = j + str1->length;
5893 }
5894 if (i < self->length)
5895 /* copy tail [i:] */
5896 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005897 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005898 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005899 while (n > 0) {
5900 Py_UNICODE_COPY(p, str2->str, str2->length);
5901 p += str2->length;
5902 if (--n <= 0)
5903 break;
5904 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005906 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
5908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005910
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005911 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005912 /* nothing to replace; return original string (when possible) */
5913 if (PyUnicode_CheckExact(self)) {
5914 Py_INCREF(self);
5915 return (PyObject *) self;
5916 }
5917 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918}
5919
5920/* --- Unicode Object Methods --------------------------------------------- */
5921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005922PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005923 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924\n\
5925Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005926characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
5928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005929unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 return fixup(self, fixtitle);
5932}
5933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005935 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936\n\
5937Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005938have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
5940static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005941unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 return fixup(self, fixcapitalize);
5944}
5945
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self into a list of words, replaces each list item with the
   result of fixup(item, fixcapitalize), then joins the list again.
   Returns the joined object, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Break the string up into a list of words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Swap every word for its capitalized counterpart */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* joined is still NULL here */
        /* drop the old item before overwriting its slot */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Glue the words back together */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
5983
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005984/* Argument converter. Coerces to a single unicode character */
5985
5986static int
5987convert_uc(PyObject *obj, void *addr)
5988{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005989 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5990 PyObject *uniobj;
5991 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005992
Benjamin Peterson857ce152009-01-31 16:29:18 +00005993 uniobj = PyUnicode_FromObject(obj);
5994 if (uniobj == NULL) {
5995 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005996 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00005997 return 0;
5998 }
5999 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6000 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006001 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006002 Py_DECREF(uniobj);
6003 return 0;
6004 }
6005 unistr = PyUnicode_AS_UNICODE(uniobj);
6006 *fillcharloc = unistr[0];
6007 Py_DECREF(uniobj);
6008 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006009}
6010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006011PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006012 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006014Return S centered in a Unicode string of length width. Padding is\n\
6015done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016
6017static PyObject *
6018unicode_center(PyUnicodeObject *self, PyObject *args)
6019{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006020 Py_ssize_t marg, left;
6021 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006022 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Thomas Woutersde017742006-02-16 19:34:37 +00006024 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 return NULL;
6026
Tim Peters7a29bd52001-09-12 03:03:31 +00006027 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 Py_INCREF(self);
6029 return (PyObject*) self;
6030 }
6031
6032 marg = width - self->length;
6033 left = marg / 2 + (marg & width & 1);
6034
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006035 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Marc-André Lemburge5034372000-08-08 08:04:29 +00006038#if 0
6039
6040/* This code should go into some future Unicode collation support
6041 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006042 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006043
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006044/* speedy UTF-16 code point order comparison */
6045/* gleaned from: */
6046/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6047
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006048static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006049{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006050 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006051 0, 0, 0, 0, 0, 0, 0, 0,
6052 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006053 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006054};
6055
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056static int
6057unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6058{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006060
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 Py_UNICODE *s1 = str1->str;
6062 Py_UNICODE *s2 = str2->str;
6063
6064 len1 = str1->length;
6065 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006068 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006069
6070 c1 = *s1++;
6071 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006072
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006073 if (c1 > (1<<11) * 26)
6074 c1 += utf16Fixup[c1>>11];
6075 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006076 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006078
6079 if (c1 != c2)
6080 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006081
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006082 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 }
6084
6085 return (len1 < len2) ? -1 : (len1 != len2);
6086}
6087
Marc-André Lemburge5034372000-08-08 08:04:29 +00006088#else
6089
6090static int
6091unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6092{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006093 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006094
6095 Py_UNICODE *s1 = str1->str;
6096 Py_UNICODE *s2 = str2->str;
6097
6098 len1 = str1->length;
6099 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006100
Marc-André Lemburge5034372000-08-08 08:04:29 +00006101 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006102 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006103
Fredrik Lundh45714e92001-06-26 16:39:36 +00006104 c1 = *s1++;
6105 c2 = *s2++;
6106
6107 if (c1 != c2)
6108 return (c1 < c2) ? -1 : 1;
6109
Marc-André Lemburge5034372000-08-08 08:04:29 +00006110 len1--; len2--;
6111 }
6112
6113 return (len1 < len2) ? -1 : (len1 != len2);
6114}
6115
6116#endif
6117
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006119 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
6121 PyUnicodeObject *u = NULL, *v = NULL;
6122 int result;
6123
6124 /* Coerce the two arguments */
6125 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6126 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6129 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
Thomas Wouters7e474022000-07-16 12:04:32 +00006132 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006134 Py_DECREF(u);
6135 Py_DECREF(v);
6136 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 }
6138
6139 result = unicode_compare(u, v);
6140
6141 Py_DECREF(u);
6142 Py_DECREF(v);
6143 return result;
6144
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 Py_XDECREF(u);
6147 Py_XDECREF(v);
6148 return -1;
6149}
6150
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006151PyObject *PyUnicode_RichCompare(PyObject *left,
6152 PyObject *right,
6153 int op)
6154{
6155 int result;
6156
6157 result = PyUnicode_Compare(left, right);
6158 if (result == -1 && PyErr_Occurred())
6159 goto onError;
6160
6161 /* Convert the return value to a Boolean */
6162 switch (op) {
6163 case Py_EQ:
6164 result = (result == 0);
6165 break;
6166 case Py_NE:
6167 result = (result != 0);
6168 break;
6169 case Py_LE:
6170 result = (result <= 0);
6171 break;
6172 case Py_GE:
6173 result = (result >= 0);
6174 break;
6175 case Py_LT:
6176 result = (result == -1);
6177 break;
6178 case Py_GT:
6179 result = (result == 1);
6180 break;
6181 }
6182 return PyBool_FromLong(result);
6183
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006184 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006185
6186 /* Standard case
6187
6188 Type errors mean that PyUnicode_FromObject() could not convert
6189 one of the arguments (usually the right hand side) to Unicode,
6190 ie. we can't handle the comparison request. However, it is
6191 possible that the other object knows a comparison method, which
6192 is why we return Py_NotImplemented to give the other object a
6193 chance.
6194
6195 */
6196 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6197 PyErr_Clear();
6198 Py_INCREF(Py_NotImplemented);
6199 return Py_NotImplemented;
6200 }
6201 if (op != Py_EQ && op != Py_NE)
6202 return NULL;
6203
6204 /* Equality comparison.
6205
6206 This is a special case: we silence any PyExc_UnicodeDecodeError
6207 and instead turn it into a PyErr_UnicodeWarning.
6208
6209 */
6210 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6211 return NULL;
6212 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006213 if (PyErr_Warn(PyExc_UnicodeWarning,
6214 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006215 "Unicode equal comparison "
6216 "failed to convert both arguments to Unicode - "
6217 "interpreting them as being unequal" :
6218 "Unicode unequal comparison "
6219 "failed to convert both arguments to Unicode - "
6220 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006221 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006222 return NULL;
6223 result = (op == Py_NE);
6224 return PyBool_FromLong(result);
6225}
6226
Guido van Rossum403d68b2000-03-13 15:55:09 +00006227int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006228 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006229{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006230 PyObject *str, *sub;
6231 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006232
6233 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006234 sub = PyUnicode_FromObject(element);
6235 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006236 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006237 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006238
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006239 str = PyUnicode_FromObject(container);
6240 if (!str) {
6241 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006242 return -1;
6243 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006244
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006245 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006246
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006247 Py_DECREF(str);
6248 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006249
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006250 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006251}
6252
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253/* Concat to string or Unicode object giving a new Unicode object. */
6254
6255PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006256 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
6258 PyUnicodeObject *u = NULL, *v = NULL, *w;
6259
6260 /* Coerce the two arguments */
6261 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6262 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006263 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6265 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006266 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
6268 /* Shortcuts */
6269 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006270 Py_DECREF(v);
6271 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
6273 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006274 Py_DECREF(u);
6275 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 }
6277
6278 /* Concat the two Unicode strings */
6279 w = _PyUnicode_New(u->length + v->length);
6280 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006281 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 Py_UNICODE_COPY(w->str, u->str, u->length);
6283 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6284
6285 Py_DECREF(u);
6286 Py_DECREF(v);
6287 return (PyObject *)w;
6288
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006289 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 Py_XDECREF(u);
6291 Py_XDECREF(v);
6292 return NULL;
6293}
6294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006295PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006296 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006298Return the number of non-overlapping occurrences of substring sub in\n\
6299Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006300interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
6302static PyObject *
6303unicode_count(PyUnicodeObject *self, PyObject *args)
6304{
6305 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006306 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006307 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 PyObject *result;
6309
Guido van Rossumb8872e62000-05-09 14:14:27 +00006310 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006311 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 return NULL;
6313
6314 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006315 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006317 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006318
Antoine Pitrou64672132010-01-13 07:55:48 +00006319 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006320 result = PyInt_FromSsize_t(
6321 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006322 substring->str, substring->length,
6323 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006324 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325
6326 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006327
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 return result;
6329}
6330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006331PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006332 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006334Encodes S using the codec registered for encoding. encoding defaults\n\
6335to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006336handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6338'xmlcharrefreplace' as well as any other name registered with\n\
6339codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340
6341static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006342unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006344 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 char *encoding = NULL;
6346 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006347 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006348
Benjamin Peterson332d7212009-09-18 21:14:55 +00006349 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6350 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006352 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006353 if (v == NULL)
6354 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006355 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006356 PyErr_Format(PyExc_TypeError,
6357 "encoder did not return a string/unicode object "
6358 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006359 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006360 Py_DECREF(v);
6361 return NULL;
6362 }
6363 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006365 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006366 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006367}
6368
6369PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006370 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006371\n\
6372Decodes S using the codec registered for encoding. encoding defaults\n\
6373to the default encoding. errors may be given to set a different error\n\
6374handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6375a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6376as well as any other name registerd with codecs.register_error that is\n\
6377able to handle UnicodeDecodeErrors.");
6378
6379static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006380unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006381{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006382 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006383 char *encoding = NULL;
6384 char *errors = NULL;
6385 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006386
Benjamin Peterson332d7212009-09-18 21:14:55 +00006387 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6388 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389 return NULL;
6390 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006391 if (v == NULL)
6392 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006393 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006394 PyErr_Format(PyExc_TypeError,
6395 "decoder did not return a string/unicode object "
6396 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006397 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006398 Py_DECREF(v);
6399 return NULL;
6400 }
6401 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006402
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006403 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006407PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006408 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409\n\
6410Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006411If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413static PyObject*
6414unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6415{
6416 Py_UNICODE *e;
6417 Py_UNICODE *p;
6418 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006419 Py_UNICODE *qe;
6420 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 PyUnicodeObject *u;
6422 int tabsize = 8;
6423
6424 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Thomas Wouters7e474022000-07-16 12:04:32 +00006427 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006428 i = 0; /* chars up to and including most recent \n or \r */
6429 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6430 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 for (p = self->str; p < e; p++)
6432 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006433 if (tabsize > 0) {
6434 incr = tabsize - (j % tabsize); /* cannot overflow */
6435 if (j > PY_SSIZE_T_MAX - incr)
6436 goto overflow1;
6437 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006438 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006441 if (j > PY_SSIZE_T_MAX - 1)
6442 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 j++;
6444 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006445 if (i > PY_SSIZE_T_MAX - j)
6446 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006448 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 }
6450 }
6451
Guido van Rossum5bdff602008-03-11 21:18:06 +00006452 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006453 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006454
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 /* Second pass: create output string and fill it */
6456 u = _PyUnicode_New(i + j);
6457 if (!u)
6458 return NULL;
6459
Guido van Rossum5bdff602008-03-11 21:18:06 +00006460 j = 0; /* same as in first pass */
6461 q = u->str; /* next output char */
6462 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463
6464 for (p = self->str; p < e; p++)
6465 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006466 if (tabsize > 0) {
6467 i = tabsize - (j % tabsize);
6468 j += i;
6469 while (i--) {
6470 if (q >= qe)
6471 goto overflow2;
6472 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006473 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006474 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006475 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006476 else {
6477 if (q >= qe)
6478 goto overflow2;
6479 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006480 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 if (*p == '\n' || *p == '\r')
6482 j = 0;
6483 }
6484
6485 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006486
6487 overflow2:
6488 Py_DECREF(u);
6489 overflow1:
6490 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492}
6493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006494PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006495 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496\n\
6497Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006498such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499arguments start and end are interpreted as in slice notation.\n\
6500\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502
6503static PyObject *
6504unicode_find(PyUnicodeObject *self, PyObject *args)
6505{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006506 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006507 Py_ssize_t start;
6508 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006509 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
Facundo Batista57d56692007-11-16 18:04:14 +00006511 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006514 result = stringlib_find_slice(
6515 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6516 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6517 start, end
6518 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519
6520 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006521
6522 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
6525static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
6528 if (index < 0 || index >= self->length) {
6529 PyErr_SetString(PyExc_IndexError, "string index out of range");
6530 return NULL;
6531 }
6532
6533 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6534}
6535
6536static long
6537unicode_hash(PyUnicodeObject *self)
6538{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006539 /* Since Unicode objects compare equal to their ASCII string
6540 counterparts, they should use the individual character values
6541 as basis for their hash value. This is needed to assure that
6542 strings and Unicode objects behave in the same way as
6543 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
Martin v. Löwis18e16552006-02-15 17:27:45 +00006545 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006546 register Py_UNICODE *p;
6547 register long x;
6548
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006550 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006551 len = PyUnicode_GET_SIZE(self);
6552 p = PyUnicode_AS_UNICODE(self);
6553 x = *p << 7;
6554 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006555 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006556 x ^= PyUnicode_GET_SIZE(self);
6557 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006558 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006559 self->hash = x;
6560 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006564 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
6568static PyObject *
6569unicode_index(PyUnicodeObject *self, PyObject *args)
6570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006572 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006573 Py_ssize_t start;
6574 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
Facundo Batista57d56692007-11-16 18:04:14 +00006576 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006579 result = stringlib_find_slice(
6580 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6581 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6582 start, end
6583 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006586
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 if (result < 0) {
6588 PyErr_SetString(PyExc_ValueError, "substring not found");
6589 return NULL;
6590 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006591
Martin v. Löwis18e16552006-02-15 17:27:45 +00006592 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593}
6594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006595PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006596 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006598Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006599at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600
6601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006602unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
6604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6605 register const Py_UNICODE *e;
6606 int cased;
6607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 /* Shortcut for single character strings */
6609 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006610 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006612 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006613 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 e = p + PyUnicode_GET_SIZE(self);
6617 cased = 0;
6618 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006619 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006620
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006621 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6622 return PyBool_FromLong(0);
6623 else if (!cased && Py_UNICODE_ISLOWER(ch))
6624 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006626 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627}
6628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006629PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006630 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006632Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006633at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634
6635static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006636unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637{
6638 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6639 register const Py_UNICODE *e;
6640 int cased;
6641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 /* Shortcut for single character strings */
6643 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006644 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006646 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006647 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006648 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 e = p + PyUnicode_GET_SIZE(self);
6651 cased = 0;
6652 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006653 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006654
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006655 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6656 return PyBool_FromLong(0);
6657 else if (!cased && Py_UNICODE_ISUPPER(ch))
6658 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006660 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006664 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006666Return True if S is a titlecased string and there is at least one\n\
6667character in S, i.e. upper- and titlecase characters may only\n\
6668follow uncased characters and lowercase characters only cased ones.\n\
6669Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
6671static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006672unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
6674 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6675 register const Py_UNICODE *e;
6676 int cased, previous_is_cased;
6677
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 /* Shortcut for single character strings */
6679 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006680 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6681 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006683 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006684 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006686
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 e = p + PyUnicode_GET_SIZE(self);
6688 cased = 0;
6689 previous_is_cased = 0;
6690 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006691 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006692
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006693 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6694 if (previous_is_cased)
6695 return PyBool_FromLong(0);
6696 previous_is_cased = 1;
6697 cased = 1;
6698 }
6699 else if (Py_UNICODE_ISLOWER(ch)) {
6700 if (!previous_is_cased)
6701 return PyBool_FromLong(0);
6702 previous_is_cased = 1;
6703 cased = 1;
6704 }
6705 else
6706 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006708 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709}
6710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006711PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006712 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006714Return True if all characters in S are whitespace\n\
6715and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716
6717static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006718unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6721 register const Py_UNICODE *e;
6722
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 /* Shortcut for single character strings */
6724 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006725 Py_UNICODE_ISSPACE(*p))
6726 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006728 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006729 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 e = p + PyUnicode_GET_SIZE(self);
6733 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006734 if (!Py_UNICODE_ISSPACE(*p))
6735 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006737 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738}
6739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006740PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006741 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006742\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006743Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006744and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006745
6746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006747unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006748{
6749 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6750 register const Py_UNICODE *e;
6751
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006752 /* Shortcut for single character strings */
6753 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006754 Py_UNICODE_ISALPHA(*p))
6755 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006756
6757 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006758 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006759 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006760
6761 e = p + PyUnicode_GET_SIZE(self);
6762 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006763 if (!Py_UNICODE_ISALPHA(*p))
6764 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767}
6768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006769PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006770 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006771\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006772Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006773and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006774
6775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006776unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006777{
6778 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6779 register const Py_UNICODE *e;
6780
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006781 /* Shortcut for single character strings */
6782 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006783 Py_UNICODE_ISALNUM(*p))
6784 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785
6786 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006787 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006788 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789
6790 e = p + PyUnicode_GET_SIZE(self);
6791 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006792 if (!Py_UNICODE_ISALNUM(*p))
6793 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006794 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006802False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006805unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
6807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6808 register const Py_UNICODE *e;
6809
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 /* Shortcut for single character strings */
6811 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006812 Py_UNICODE_ISDECIMAL(*p))
6813 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006815 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006816 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006817 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 e = p + PyUnicode_GET_SIZE(self);
6820 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006821 if (!Py_UNICODE_ISDECIMAL(*p))
6822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825}
6826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006828 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006830Return True if all characters in S are digits\n\
6831and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
6833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006834unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835{
6836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6837 register const Py_UNICODE *e;
6838
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 /* Shortcut for single character strings */
6840 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006841 Py_UNICODE_ISDIGIT(*p))
6842 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006844 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006845 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006846 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 e = p + PyUnicode_GET_SIZE(self);
6849 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006850 if (!Py_UNICODE_ISDIGIT(*p))
6851 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854}
6855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006857 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006860False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
6862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006863unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864{
6865 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6866 register const Py_UNICODE *e;
6867
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 /* Shortcut for single character strings */
6869 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006870 Py_UNICODE_ISNUMERIC(*p))
6871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006873 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006874 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006875 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006876
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 e = p + PyUnicode_GET_SIZE(self);
6878 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006879 if (!Py_UNICODE_ISNUMERIC(*p))
6880 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883}
6884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006885PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006886 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887\n\
6888Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006889iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
6891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006892unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006894 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Martin v. Löwis18e16552006-02-15 17:27:45 +00006897static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898unicode_length(PyUnicodeObject *self)
6899{
6900 return self->length;
6901}
6902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006903PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006904 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006906Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006907done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909static PyObject *
6910unicode_ljust(PyUnicodeObject *self, PyObject *args)
6911{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006912 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006913 Py_UNICODE fillchar = ' ';
6914
Martin v. Löwis412fb672006-04-13 06:34:32 +00006915 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 return NULL;
6917
Tim Peters7a29bd52001-09-12 03:03:31 +00006918 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 Py_INCREF(self);
6920 return (PyObject*) self;
6921 }
6922
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006923 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924}
6925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006926PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006927 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
6931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006932unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 return fixup(self, fixlower);
6935}
6936
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip type above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6945
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006946/* externally visible for str.strip(unicode) */
6947PyObject *
6948_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6949{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006950 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6951 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6952 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6953 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6954 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006955
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006956 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006957
Benjamin Peterson857ce152009-01-31 16:29:18 +00006958 i = 0;
6959 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006960 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6961 i++;
6962 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006963 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006964
Benjamin Peterson857ce152009-01-31 16:29:18 +00006965 j = len;
6966 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006967 do {
6968 j--;
6969 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6970 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006971 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006972
Benjamin Peterson857ce152009-01-31 16:29:18 +00006973 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006974 Py_INCREF(self);
6975 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006976 }
6977 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006978 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006979}
6980
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
6982static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006983do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006985 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6986 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006987
Benjamin Peterson857ce152009-01-31 16:29:18 +00006988 i = 0;
6989 if (striptype != RIGHTSTRIP) {
6990 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6991 i++;
6992 }
6993 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006994
Benjamin Peterson857ce152009-01-31 16:29:18 +00006995 j = len;
6996 if (striptype != LEFTSTRIP) {
6997 do {
6998 j--;
6999 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7000 j++;
7001 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007002
Benjamin Peterson857ce152009-01-31 16:29:18 +00007003 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7004 Py_INCREF(self);
7005 return (PyObject*)self;
7006 }
7007 else
7008 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009}
7010
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007011
7012static PyObject *
7013do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7014{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007015 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007016
Benjamin Peterson857ce152009-01-31 16:29:18 +00007017 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7018 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007019
Benjamin Peterson857ce152009-01-31 16:29:18 +00007020 if (sep != NULL && sep != Py_None) {
7021 if (PyUnicode_Check(sep))
7022 return _PyUnicode_XStrip(self, striptype, sep);
7023 else if (PyString_Check(sep)) {
7024 PyObject *res;
7025 sep = PyUnicode_FromObject(sep);
7026 if (sep==NULL)
7027 return NULL;
7028 res = _PyUnicode_XStrip(self, striptype, sep);
7029 Py_DECREF(sep);
7030 return res;
7031 }
7032 else {
7033 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007034 "%s arg must be None, unicode or str",
7035 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007036 return NULL;
7037 }
7038 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007039
Benjamin Peterson857ce152009-01-31 16:29:18 +00007040 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041}
7042
7043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007044PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007045 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046\n\
7047Return a copy of the string S with leading and trailing\n\
7048whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007049If chars is given and not None, remove characters in chars instead.\n\
7050If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007051
7052static PyObject *
7053unicode_strip(PyUnicodeObject *self, PyObject *args)
7054{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007055 if (PyTuple_GET_SIZE(args) == 0)
7056 return do_strip(self, BOTHSTRIP); /* Common case */
7057 else
7058 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007059}
7060
7061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007063 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064\n\
7065Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007066If chars is given and not None, remove characters in chars instead.\n\
7067If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068
7069static PyObject *
7070unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7071{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007072 if (PyTuple_GET_SIZE(args) == 0)
7073 return do_strip(self, LEFTSTRIP); /* Common case */
7074 else
7075 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076}
7077
7078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007079PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007080 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081\n\
7082Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007083If chars is given and not None, remove characters in chars instead.\n\
7084If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085
7086static PyObject *
7087unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7088{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007089 if (PyTuple_GET_SIZE(args) == 0)
7090 return do_strip(self, RIGHTSTRIP); /* Common case */
7091 else
7092 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093}
7094
7095
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007097unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
7099 PyUnicodeObject *u;
7100 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007101 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007102 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103
7104 if (len < 0)
7105 len = 0;
7106
Tim Peters7a29bd52001-09-12 03:03:31 +00007107 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 /* no repeat, return original string */
7109 Py_INCREF(str);
7110 return (PyObject*) str;
7111 }
Tim Peters8f422462000-09-09 06:13:41 +00007112
7113 /* ensure # of chars needed doesn't overflow int and # of bytes
7114 * needed doesn't overflow size_t
7115 */
7116 nchars = len * str->length;
7117 if (len && nchars / len != str->length) {
7118 PyErr_SetString(PyExc_OverflowError,
7119 "repeated string is too long");
7120 return NULL;
7121 }
7122 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7123 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7124 PyErr_SetString(PyExc_OverflowError,
7125 "repeated string is too long");
7126 return NULL;
7127 }
7128 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 if (!u)
7130 return NULL;
7131
7132 p = u->str;
7133
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007134 if (str->length == 1 && len > 0) {
7135 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007136 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007137 Py_ssize_t done = 0; /* number of characters copied this far */
7138 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007139 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007140 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007141 }
7142 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007143 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007144 Py_UNICODE_COPY(p+done, p, n);
7145 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007146 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148
7149 return (PyObject*) u;
7150}
7151
7152PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007153 PyObject *subobj,
7154 PyObject *replobj,
7155 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156{
7157 PyObject *self;
7158 PyObject *str1;
7159 PyObject *str2;
7160 PyObject *result;
7161
7162 self = PyUnicode_FromObject(obj);
7163 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 str1 = PyUnicode_FromObject(subobj);
7166 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007167 Py_DECREF(self);
7168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 }
7170 str2 = PyUnicode_FromObject(replobj);
7171 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007172 Py_DECREF(self);
7173 Py_DECREF(str1);
7174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 }
Tim Petersced69f82003-09-16 20:30:58 +00007176 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007177 (PyUnicodeObject *)str1,
7178 (PyUnicodeObject *)str2,
7179 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180 Py_DECREF(self);
7181 Py_DECREF(str1);
7182 Py_DECREF(str2);
7183 return result;
7184}
7185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007186PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007187 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188\n\
7189Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007190old replaced by new. If the optional argument count is\n\
7191given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
7193static PyObject*
7194unicode_replace(PyUnicodeObject *self, PyObject *args)
7195{
7196 PyUnicodeObject *str1;
7197 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007198 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 PyObject *result;
7200
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 return NULL;
7203 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7204 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007207 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007208 Py_DECREF(str1);
7209 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212 result = replace(self, str1, str2, maxcount);
7213
7214 Py_DECREF(str1);
7215 Py_DECREF(str2);
7216 return result;
7217}
7218
7219static
7220PyObject *unicode_repr(PyObject *unicode)
7221{
7222 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007223 PyUnicode_GET_SIZE(unicode),
7224 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225}
7226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007227PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007228 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229\n\
7230Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007231such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232arguments start and end are interpreted as in slice notation.\n\
7233\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
7236static PyObject *
7237unicode_rfind(PyUnicodeObject *self, PyObject *args)
7238{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007239 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007240 Py_ssize_t start;
7241 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007242 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243
Facundo Batista57d56692007-11-16 18:04:14 +00007244 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007247 result = stringlib_rfind_slice(
7248 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7249 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7250 start, end
7251 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
7253 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007254
7255 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256}
7257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007258PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007259 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
7263static PyObject *
7264unicode_rindex(PyUnicodeObject *self, PyObject *args)
7265{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007266 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007267 Py_ssize_t start;
7268 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007269 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
Facundo Batista57d56692007-11-16 18:04:14 +00007271 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007274 result = stringlib_rfind_slice(
7275 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7276 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7277 start, end
7278 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279
7280 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007281
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 if (result < 0) {
7283 PyErr_SetString(PyExc_ValueError, "substring not found");
7284 return NULL;
7285 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287}
7288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007289PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007290 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007292Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007293done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295static PyObject *
7296unicode_rjust(PyUnicodeObject *self, PyObject *args)
7297{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007298 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007299 Py_UNICODE fillchar = ' ';
7300
Martin v. Löwis412fb672006-04-13 06:34:32 +00007301 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 return NULL;
7303
Tim Peters7a29bd52001-09-12 03:03:31 +00007304 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 Py_INCREF(self);
7306 return (PyObject*) self;
7307 }
7308
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007309 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310}
7311
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007313unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314{
7315 /* standard clamping */
7316 if (start < 0)
7317 start = 0;
7318 if (end < 0)
7319 end = 0;
7320 if (end > self->length)
7321 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007322 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 /* full slice, return original string */
7324 Py_INCREF(self);
7325 return (PyObject*) self;
7326 }
7327 if (start > end)
7328 start = end;
7329 /* copy slice */
7330 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007331 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332}
7333
7334PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007335 PyObject *sep,
7336 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337{
7338 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007339
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 s = PyUnicode_FromObject(s);
7341 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007342 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007343 if (sep != NULL) {
7344 sep = PyUnicode_FromObject(sep);
7345 if (sep == NULL) {
7346 Py_DECREF(s);
7347 return NULL;
7348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 }
7350
7351 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7352
7353 Py_DECREF(s);
7354 Py_XDECREF(sep);
7355 return result;
7356}
7357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007358PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007359 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360\n\
7361Return a list of the words in S, using sep as the\n\
7362delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007363splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007364whitespace string is a separator and empty strings are\n\
7365removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366
7367static PyObject*
7368unicode_split(PyUnicodeObject *self, PyObject *args)
7369{
7370 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007371 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
Martin v. Löwis18e16552006-02-15 17:27:45 +00007373 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 return NULL;
7375
7376 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007377 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007379 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007381 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382}
7383
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007384PyObject *
7385PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7386{
7387 PyObject* str_obj;
7388 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007389 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007390
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007391 str_obj = PyUnicode_FromObject(str_in);
7392 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007393 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007394 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007395 if (!sep_obj) {
7396 Py_DECREF(str_obj);
7397 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007398 }
7399
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007400 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007401 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7402 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7403 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007404
Fredrik Lundhb9479482006-05-26 17:22:38 +00007405 Py_DECREF(sep_obj);
7406 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007407
7408 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007409}
7410
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007411
7412PyObject *
7413PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7414{
7415 PyObject* str_obj;
7416 PyObject* sep_obj;
7417 PyObject* out;
7418
7419 str_obj = PyUnicode_FromObject(str_in);
7420 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007421 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007422 sep_obj = PyUnicode_FromObject(sep_in);
7423 if (!sep_obj) {
7424 Py_DECREF(str_obj);
7425 return NULL;
7426 }
7427
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007428 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007429 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7430 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7431 );
7432
7433 Py_DECREF(sep_obj);
7434 Py_DECREF(str_obj);
7435
7436 return out;
7437}
7438
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007439PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007440 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007441\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007442Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007443the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007444found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007445
7446static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007447unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007448{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007449 return PyUnicode_Partition((PyObject *)self, separator);
7450}
7451
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007452PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007453 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007454\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007455Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007456the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007457separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007458
7459static PyObject*
7460unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7461{
7462 return PyUnicode_RPartition((PyObject *)self, separator);
7463}
7464
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007465PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007466 PyObject *sep,
7467 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007468{
7469 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007470
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007471 s = PyUnicode_FromObject(s);
7472 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007473 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007474 if (sep != NULL) {
7475 sep = PyUnicode_FromObject(sep);
7476 if (sep == NULL) {
7477 Py_DECREF(s);
7478 return NULL;
7479 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007480 }
7481
7482 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7483
7484 Py_DECREF(s);
7485 Py_XDECREF(sep);
7486 return result;
7487}
7488
7489PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007490 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007491\n\
7492Return a list of the words in S, using sep as the\n\
7493delimiter string, starting at the end of the string and\n\
7494working to the front. If maxsplit is given, at most maxsplit\n\
7495splits are done. If sep is not specified, any whitespace string\n\
7496is a separator.");
7497
7498static PyObject*
7499unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7500{
7501 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007503
Martin v. Löwis18e16552006-02-15 17:27:45 +00007504 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007505 return NULL;
7506
7507 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007508 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007509 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007510 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007511 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007512 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007513}
7514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007515PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007516 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517\n\
7518Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007519Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521
7522static PyObject*
7523unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7524{
Guido van Rossum86662912000-04-11 15:38:46 +00007525 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526
Guido van Rossum86662912000-04-11 15:38:46 +00007527 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 return NULL;
7529
Guido van Rossum86662912000-04-11 15:38:46 +00007530 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531}
7532
7533static
7534PyObject *unicode_str(PyUnicodeObject *self)
7535{
Fred Drakee4315f52000-05-09 19:53:39 +00007536 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537}
7538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007539PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007540 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541\n\
7542Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007543and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
7545static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007546unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 return fixup(self, fixswapcase);
7549}
7550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007552 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553\n\
7554Return a copy of the string S, where all characters have been mapped\n\
7555through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007556Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7557Unmapped characters are left untouched. Characters mapped to None\n\
7558are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559
7560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007561unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562{
Tim Petersced69f82003-09-16 20:30:58 +00007563 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007564 self->length,
7565 table,
7566 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567}
7568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007570 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007572Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
7574static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007575unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 return fixup(self, fixupper);
7578}
7579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007580PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007581 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582\n\
Georg Brandl98064072008-09-09 19:26:00 +00007583Pad a numeric string S with zeros on the left, to fill a field\n\
7584of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585
7586static PyObject *
7587unicode_zfill(PyUnicodeObject *self, PyObject *args)
7588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007589 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 PyUnicodeObject *u;
7591
Martin v. Löwis18e16552006-02-15 17:27:45 +00007592 Py_ssize_t width;
7593 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 return NULL;
7595
7596 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007597 if (PyUnicode_CheckExact(self)) {
7598 Py_INCREF(self);
7599 return (PyObject*) self;
7600 }
7601 else
7602 return PyUnicode_FromUnicode(
7603 PyUnicode_AS_UNICODE(self),
7604 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007605 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 }
7607
7608 fill = width - self->length;
7609
7610 u = pad(self, fill, 0, '0');
7611
Walter Dörwald068325e2002-04-15 13:36:47 +00007612 if (u == NULL)
7613 return NULL;
7614
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 if (u->str[fill] == '+' || u->str[fill] == '-') {
7616 /* move sign to beginning of string */
7617 u->str[0] = u->str[fill];
7618 u->str[fill] = '0';
7619 }
7620
7621 return (PyObject*) u;
7622}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
#if 0
/* Compiled out.  When enabled, returns the current value of numfree
   as a Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007632PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007633 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007635Return True if S starts with the specified prefix, False otherwise.\n\
7636With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007637With optional end, stop comparing S at that position.\n\
7638prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
7640static PyObject *
7641unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007642 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643{
Georg Brandl24250812006-06-09 18:45:48 +00007644 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007646 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007647 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007648 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
Georg Brandl24250812006-06-09 18:45:48 +00007650 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007651 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7652 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007653 if (PyTuple_Check(subobj)) {
7654 Py_ssize_t i;
7655 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7656 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007657 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007658 if (substring == NULL)
7659 return NULL;
7660 result = tailmatch(self, substring, start, end, -1);
7661 Py_DECREF(substring);
7662 if (result) {
7663 Py_RETURN_TRUE;
7664 }
7665 }
7666 /* nothing matched */
7667 Py_RETURN_FALSE;
7668 }
7669 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007671 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007672 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007674 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675}
7676
7677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007679 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007681Return True if S ends with the specified suffix, False otherwise.\n\
7682With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007683With optional end, stop comparing S at that position.\n\
7684suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject *
7687unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007688 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Georg Brandl24250812006-06-09 18:45:48 +00007690 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007693 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007694 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
Georg Brandl24250812006-06-09 18:45:48 +00007696 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007697 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7698 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007699 if (PyTuple_Check(subobj)) {
7700 Py_ssize_t i;
7701 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7702 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007703 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007704 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007705 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007706 result = tailmatch(self, substring, start, end, +1);
7707 Py_DECREF(substring);
7708 if (result) {
7709 Py_RETURN_TRUE;
7710 }
7711 }
7712 Py_RETURN_FALSE;
7713 }
7714 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
Georg Brandl24250812006-06-09 18:45:48 +00007718 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007720 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721}
7722
7723
Eric Smitha9f7d622008-02-17 19:46:49 +00007724/* Implements do_string_format, which is unicode because of stringlib */
7725#include "stringlib/string_format.h"
7726
7727PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007728 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007729\n\
7730");
7731
Eric Smithdc13b792008-05-30 18:10:04 +00007732static PyObject *
7733unicode__format__(PyObject *self, PyObject *args)
7734{
7735 PyObject *format_spec;
7736 PyObject *result = NULL;
7737 PyObject *tmp = NULL;
7738
7739 /* If 2.x, convert format_spec to the same type as value */
7740 /* This is to allow things like u''.format('') */
7741 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7742 goto done;
7743 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7744 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007745 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007746 goto done;
7747 }
7748 tmp = PyObject_Unicode(format_spec);
7749 if (tmp == NULL)
7750 goto done;
7751 format_spec = tmp;
7752
7753 result = _PyUnicode_FormatAdvanced(self,
7754 PyUnicode_AS_UNICODE(format_spec),
7755 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007756 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007757 Py_XDECREF(tmp);
7758 return result;
7759}
7760
Eric Smitha9f7d622008-02-17 19:46:49 +00007761PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007762 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007763\n\
7764");
7765
Robert Schuppenies901c9972008-06-10 10:10:31 +00007766static PyObject *
7767unicode__sizeof__(PyUnicodeObject *v)
7768{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007769 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7770 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007771}
7772
7773PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007774 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007775\n\
7776");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007777
7778static PyObject *
7779unicode_getnewargs(PyUnicodeObject *v)
7780{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007781 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007782}
7783
7784
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785static PyMethodDef unicode_methods[] = {
7786
7787 /* Order is according to common usage: often used methods should
7788 appear first, since lookup is done sequentially. */
7789
Benjamin Peterson332d7212009-09-18 21:14:55 +00007790 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007791 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7792 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007793 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007794 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7795 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7796 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7797 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7798 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7799 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7800 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007801 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7803 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7804 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007805 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007806 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007807/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7808 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7809 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7810 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007811 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007812 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007813 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007814 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007815 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7816 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7817 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7818 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7819 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7820 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7821 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7822 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7823 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7824 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7825 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7826 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7827 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7828 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007829 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007830 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7831 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7832 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7833 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007834 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007835#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007836 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837#endif
7838
7839#if 0
7840 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007841 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842#endif
7843
Benjamin Peterson857ce152009-01-31 16:29:18 +00007844 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 {NULL, NULL}
7846};
7847
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007848static PyObject *
7849unicode_mod(PyObject *v, PyObject *w)
7850{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007851 if (!PyUnicode_Check(v)) {
7852 Py_INCREF(Py_NotImplemented);
7853 return Py_NotImplemented;
7854 }
7855 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007856}
7857
7858static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007859 0, /*nb_add*/
7860 0, /*nb_subtract*/
7861 0, /*nb_multiply*/
7862 0, /*nb_divide*/
7863 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007864};
7865
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007867 (lenfunc) unicode_length, /* sq_length */
7868 PyUnicode_Concat, /* sq_concat */
7869 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7870 (ssizeargfunc) unicode_getitem, /* sq_item */
7871 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7872 0, /* sq_ass_item */
7873 0, /* sq_ass_slice */
7874 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875};
7876
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007877static PyObject*
7878unicode_subscript(PyUnicodeObject* self, PyObject* item)
7879{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007880 if (PyIndex_Check(item)) {
7881 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007882 if (i == -1 && PyErr_Occurred())
7883 return NULL;
7884 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007885 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007886 return unicode_getitem(self, i);
7887 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007888 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007889 Py_UNICODE* source_buf;
7890 Py_UNICODE* result_buf;
7891 PyObject* result;
7892
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007893 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007894 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007895 return NULL;
7896 }
7897
7898 if (slicelength <= 0) {
7899 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007900 } else if (start == 0 && step == 1 && slicelength == self->length &&
7901 PyUnicode_CheckExact(self)) {
7902 Py_INCREF(self);
7903 return (PyObject *)self;
7904 } else if (step == 1) {
7905 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007906 } else {
7907 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007908 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7909 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007910
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007911 if (result_buf == NULL)
7912 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007913
7914 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7915 result_buf[i] = source_buf[cur];
7916 }
Tim Petersced69f82003-09-16 20:30:58 +00007917
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007918 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007919 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007920 return result;
7921 }
7922 } else {
7923 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7924 return NULL;
7925 }
7926}
7927
7928static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007929 (lenfunc)unicode_length, /* mp_length */
7930 (binaryfunc)unicode_subscript, /* mp_subscript */
7931 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007932};
7933
Martin v. Löwis18e16552006-02-15 17:27:45 +00007934static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007936 Py_ssize_t index,
7937 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938{
7939 if (index != 0) {
7940 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007941 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 return -1;
7943 }
7944 *ptr = (void *) self->str;
7945 return PyUnicode_GET_DATA_SIZE(self);
7946}
7947
Martin v. Löwis18e16552006-02-15 17:27:45 +00007948static Py_ssize_t
7949unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007950 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951{
7952 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007953 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 return -1;
7955}
7956
7957static int
7958unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007959 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960{
7961 if (lenp)
7962 *lenp = PyUnicode_GET_DATA_SIZE(self);
7963 return 1;
7964}
7965
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007966static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007968 Py_ssize_t index,
7969 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970{
7971 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007972
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 if (index != 0) {
7974 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 return -1;
7977 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007978 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007980 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007981 *ptr = (void *) PyString_AS_STRING(str);
7982 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983}
7984
7985/* Helpers for PyUnicode_Format() */
7986
7987static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007988getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007990 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007992 (*p_argidx)++;
7993 if (arglen < 0)
7994 return args;
7995 else
7996 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
7998 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007999 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 return NULL;
8001}
8002
/* Conversion flag bits, set by PyUnicode_Format() while scanning a format
   specifier. */
#define F_LJUST (1 << 0)    /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
Martin v. Löwis18e16552006-02-15 17:27:45 +00008009static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008010strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008012 register Py_ssize_t i;
8013 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008015 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 return len;
8018}
8019
Neal Norwitzfc76d632006-01-10 06:03:13 +00008020static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008021longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8022{
Tim Peters15231542006-02-16 01:08:01 +00008023 Py_ssize_t result;
8024
Neal Norwitzfc76d632006-01-10 06:03:13 +00008025 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008026 result = strtounicode(buffer, (char *)buffer);
8027 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008028}
8029
Guido van Rossum078151d2002-08-11 04:24:12 +00008030/* XXX To save some code duplication, formatfloat/long/int could have been
8031 shared with stringobject.c, converting from 8-bit to Unicode after the
8032 formatting is done. */
8033
Mark Dickinson18cfada2009-11-23 18:46:41 +00008034/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8035
8036static PyObject *
8037formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008039 char *p;
8040 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008042
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 x = PyFloat_AsDouble(v);
8044 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008045 return NULL;
8046
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008048 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008049
Mark Dickinson18cfada2009-11-23 18:46:41 +00008050 p = PyOS_double_to_string(x, type, prec,
8051 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8052 if (p == NULL)
8053 return NULL;
8054 result = PyUnicode_FromStringAndSize(p, strlen(p));
8055 PyMem_Free(p);
8056 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057}
8058
Tim Peters38fd5b62000-09-21 05:43:11 +00008059static PyObject*
8060formatlong(PyObject *val, int flags, int prec, int type)
8061{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008062 char *buf;
8063 int i, len;
8064 PyObject *str; /* temporary string object. */
8065 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008066
Benjamin Peterson857ce152009-01-31 16:29:18 +00008067 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8068 if (!str)
8069 return NULL;
8070 result = _PyUnicode_New(len);
8071 if (!result) {
8072 Py_DECREF(str);
8073 return NULL;
8074 }
8075 for (i = 0; i < len; i++)
8076 result->str[i] = buf[i];
8077 result->str[len] = 0;
8078 Py_DECREF(str);
8079 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008080}
8081
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082static int
8083formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008084 size_t buflen,
8085 int flags,
8086 int prec,
8087 int type,
8088 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008090 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008091 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8092 * + 1 + 1
8093 * = 24
8094 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008095 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008096 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 long x;
8098
8099 x = PyInt_AsLong(v);
8100 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008101 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008102 if (x < 0 && type == 'u') {
8103 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008104 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008105 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8106 sign = "-";
8107 else
8108 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008110 prec = 1;
8111
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008112 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8113 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008114 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008115 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008116 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008117 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008118 return -1;
8119 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008120
8121 if ((flags & F_ALT) &&
8122 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008123 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008124 * of issues that cause pain:
8125 * - when 0 is being converted, the C standard leaves off
8126 * the '0x' or '0X', which is inconsistent with other
8127 * %#x/%#X conversions and inconsistent with Python's
8128 * hex() function
8129 * - there are platforms that violate the standard and
8130 * convert 0 with the '0x' or '0X'
8131 * (Metrowerks, Compaq Tru64)
8132 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008133 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008134 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008135 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008136 * We can achieve the desired consistency by inserting our
8137 * own '0x' or '0X' prefix, and substituting %x/%X in place
8138 * of %#x/%#X.
8139 *
8140 * Note that this is the same approach as used in
8141 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008142 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008143 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8144 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008145 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008146 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008147 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8148 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008149 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008150 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008151 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008152 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008153 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008154 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155}
8156
8157static int
8158formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008159 size_t buflen,
8160 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161{
Ezio Melotti32125152010-02-25 17:36:04 +00008162 PyObject *unistr;
8163 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008164 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008165 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008166 if (PyUnicode_GET_SIZE(v) != 1)
8167 goto onError;
8168 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008171 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008172 if (PyString_GET_SIZE(v) != 1)
8173 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008174 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8175 with a UnicodeDecodeError if 'char' is not decodable with the
8176 default encoding (usually ASCII, but it might be something else) */
8177 str = PyString_AS_STRING(v);
8178 if ((unsigned char)str[0] > 0x7F) {
8179 /* the char is not ASCII; try to decode the string using the
8180 default encoding and return -1 to let the UnicodeDecodeError
8181 be raised if the string can't be decoded */
8182 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8183 if (unistr == NULL)
8184 return -1;
8185 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8186 Py_DECREF(unistr);
8187 }
8188 else
8189 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191
8192 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008193 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008195 x = PyInt_AsLong(v);
8196 if (x == -1 && PyErr_Occurred())
8197 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008198#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008199 if (x < 0 || x > 0x10ffff) {
8200 PyErr_SetString(PyExc_OverflowError,
8201 "%c arg not in range(0x110000) "
8202 "(wide Python build)");
8203 return -1;
8204 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008205#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008206 if (x < 0 || x > 0xffff) {
8207 PyErr_SetString(PyExc_OverflowError,
8208 "%c arg not in range(0x10000) "
8209 "(narrow Python build)");
8210 return -1;
8211 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008212#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008213 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214 }
8215 buf[1] = '\0';
8216 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008217
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008218 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008219 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008221 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222}
8223
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008224/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8225
Mark Dickinson18cfada2009-11-23 18:46:41 +00008226 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008227 chars are formatted. XXX This is a magic number. Each formatting
8228 routine does bounds checking to ensure no overflow, but a better
8229 solution may be to malloc a buffer of appropriate size for each
8230 format. For now, the current solution is sufficient.
8231*/
/* The replacement list is fully parenthesized so the cast stays attached
   to the constant wherever the macro is expanded. */
#define FORMATBUFLEN ((size_t)120)
8233
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008235 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236{
8237 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008238 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 int args_owned = 0;
8240 PyUnicodeObject *result = NULL;
8241 PyObject *dict = NULL;
8242 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008243
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008245 PyErr_BadInternalCall();
8246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 }
8248 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008249 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 fmt = PyUnicode_AS_UNICODE(uformat);
8252 fmtcnt = PyUnicode_GET_SIZE(uformat);
8253
8254 reslen = rescnt = fmtcnt + 100;
8255 result = _PyUnicode_New(reslen);
8256 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008257 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 res = PyUnicode_AS_UNICODE(result);
8259
8260 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008261 arglen = PyTuple_Size(args);
8262 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263 }
8264 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008265 arglen = -1;
8266 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 }
Christian Heimese93237d2007-12-19 02:37:44 +00008268 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008269 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008270 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271
8272 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008273 if (*fmt != '%') {
8274 if (--rescnt < 0) {
8275 rescnt = fmtcnt + 100;
8276 reslen += rescnt;
8277 if (_PyUnicode_Resize(&result, reslen) < 0)
8278 goto onError;
8279 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8280 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008281 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008282 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008283 }
8284 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008285 /* Got a format specifier */
8286 int flags = 0;
8287 Py_ssize_t width = -1;
8288 int prec = -1;
8289 Py_UNICODE c = '\0';
8290 Py_UNICODE fill;
8291 int isnumok;
8292 PyObject *v = NULL;
8293 PyObject *temp = NULL;
8294 Py_UNICODE *pbuf;
8295 Py_UNICODE sign;
8296 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008297 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008298
8299 fmt++;
8300 if (*fmt == '(') {
8301 Py_UNICODE *keystart;
8302 Py_ssize_t keylen;
8303 PyObject *key;
8304 int pcount = 1;
8305
8306 if (dict == NULL) {
8307 PyErr_SetString(PyExc_TypeError,
8308 "format requires a mapping");
8309 goto onError;
8310 }
8311 ++fmt;
8312 --fmtcnt;
8313 keystart = fmt;
8314 /* Skip over balanced parentheses */
8315 while (pcount > 0 && --fmtcnt >= 0) {
8316 if (*fmt == ')')
8317 --pcount;
8318 else if (*fmt == '(')
8319 ++pcount;
8320 fmt++;
8321 }
8322 keylen = fmt - keystart - 1;
8323 if (fmtcnt < 0 || pcount > 0) {
8324 PyErr_SetString(PyExc_ValueError,
8325 "incomplete format key");
8326 goto onError;
8327 }
8328#if 0
8329 /* keys are converted to strings using UTF-8 and
8330 then looked up since Python uses strings to hold
8331 variables names etc. in its namespaces and we
8332 wouldn't want to break common idioms. */
8333 key = PyUnicode_EncodeUTF8(keystart,
8334 keylen,
8335 NULL);
8336#else
8337 key = PyUnicode_FromUnicode(keystart, keylen);
8338#endif
8339 if (key == NULL)
8340 goto onError;
8341 if (args_owned) {
8342 Py_DECREF(args);
8343 args_owned = 0;
8344 }
8345 args = PyObject_GetItem(dict, key);
8346 Py_DECREF(key);
8347 if (args == NULL) {
8348 goto onError;
8349 }
8350 args_owned = 1;
8351 arglen = -1;
8352 argidx = -2;
8353 }
8354 while (--fmtcnt >= 0) {
8355 switch (c = *fmt++) {
8356 case '-': flags |= F_LJUST; continue;
8357 case '+': flags |= F_SIGN; continue;
8358 case ' ': flags |= F_BLANK; continue;
8359 case '#': flags |= F_ALT; continue;
8360 case '0': flags |= F_ZERO; continue;
8361 }
8362 break;
8363 }
8364 if (c == '*') {
8365 v = getnextarg(args, arglen, &argidx);
8366 if (v == NULL)
8367 goto onError;
8368 if (!PyInt_Check(v)) {
8369 PyErr_SetString(PyExc_TypeError,
8370 "* wants int");
8371 goto onError;
8372 }
8373 width = PyInt_AsLong(v);
8374 if (width < 0) {
8375 flags |= F_LJUST;
8376 width = -width;
8377 }
8378 if (--fmtcnt >= 0)
8379 c = *fmt++;
8380 }
8381 else if (c >= '0' && c <= '9') {
8382 width = c - '0';
8383 while (--fmtcnt >= 0) {
8384 c = *fmt++;
8385 if (c < '0' || c > '9')
8386 break;
8387 if ((width*10) / 10 != width) {
8388 PyErr_SetString(PyExc_ValueError,
8389 "width too big");
8390 goto onError;
8391 }
8392 width = width*10 + (c - '0');
8393 }
8394 }
8395 if (c == '.') {
8396 prec = 0;
8397 if (--fmtcnt >= 0)
8398 c = *fmt++;
8399 if (c == '*') {
8400 v = getnextarg(args, arglen, &argidx);
8401 if (v == NULL)
8402 goto onError;
8403 if (!PyInt_Check(v)) {
8404 PyErr_SetString(PyExc_TypeError,
8405 "* wants int");
8406 goto onError;
8407 }
8408 prec = PyInt_AsLong(v);
8409 if (prec < 0)
8410 prec = 0;
8411 if (--fmtcnt >= 0)
8412 c = *fmt++;
8413 }
8414 else if (c >= '0' && c <= '9') {
8415 prec = c - '0';
8416 while (--fmtcnt >= 0) {
8417 c = Py_CHARMASK(*fmt++);
8418 if (c < '0' || c > '9')
8419 break;
8420 if ((prec*10) / 10 != prec) {
8421 PyErr_SetString(PyExc_ValueError,
8422 "prec too big");
8423 goto onError;
8424 }
8425 prec = prec*10 + (c - '0');
8426 }
8427 }
8428 } /* prec */
8429 if (fmtcnt >= 0) {
8430 if (c == 'h' || c == 'l' || c == 'L') {
8431 if (--fmtcnt >= 0)
8432 c = *fmt++;
8433 }
8434 }
8435 if (fmtcnt < 0) {
8436 PyErr_SetString(PyExc_ValueError,
8437 "incomplete format");
8438 goto onError;
8439 }
8440 if (c != '%') {
8441 v = getnextarg(args, arglen, &argidx);
8442 if (v == NULL)
8443 goto onError;
8444 }
8445 sign = 0;
8446 fill = ' ';
8447 switch (c) {
8448
8449 case '%':
8450 pbuf = formatbuf;
8451 /* presume that buffer length is at least 1 */
8452 pbuf[0] = '%';
8453 len = 1;
8454 break;
8455
8456 case 's':
8457 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008458 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008459 temp = v;
8460 Py_INCREF(temp);
8461 }
8462 else {
8463 PyObject *unicode;
8464 if (c == 's')
8465 temp = PyObject_Unicode(v);
8466 else
8467 temp = PyObject_Repr(v);
8468 if (temp == NULL)
8469 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008470 if (PyUnicode_Check(temp))
8471 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008472 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008473 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008474 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8475 PyString_GET_SIZE(temp),
8476 NULL,
8477 "strict");
8478 Py_DECREF(temp);
8479 temp = unicode;
8480 if (temp == NULL)
8481 goto onError;
8482 }
8483 else {
8484 Py_DECREF(temp);
8485 PyErr_SetString(PyExc_TypeError,
8486 "%s argument has non-string str()");
8487 goto onError;
8488 }
8489 }
8490 pbuf = PyUnicode_AS_UNICODE(temp);
8491 len = PyUnicode_GET_SIZE(temp);
8492 if (prec >= 0 && len > prec)
8493 len = prec;
8494 break;
8495
8496 case 'i':
8497 case 'd':
8498 case 'u':
8499 case 'o':
8500 case 'x':
8501 case 'X':
8502 if (c == 'i')
8503 c = 'd';
8504 isnumok = 0;
8505 if (PyNumber_Check(v)) {
8506 PyObject *iobj=NULL;
8507
8508 if (PyInt_Check(v) || (PyLong_Check(v))) {
8509 iobj = v;
8510 Py_INCREF(iobj);
8511 }
8512 else {
8513 iobj = PyNumber_Int(v);
8514 if (iobj==NULL) iobj = PyNumber_Long(v);
8515 }
8516 if (iobj!=NULL) {
8517 if (PyInt_Check(iobj)) {
8518 isnumok = 1;
8519 pbuf = formatbuf;
8520 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8521 flags, prec, c, iobj);
8522 Py_DECREF(iobj);
8523 if (len < 0)
8524 goto onError;
8525 sign = 1;
8526 }
8527 else if (PyLong_Check(iobj)) {
8528 isnumok = 1;
8529 temp = formatlong(iobj, flags, prec, c);
8530 Py_DECREF(iobj);
8531 if (!temp)
8532 goto onError;
8533 pbuf = PyUnicode_AS_UNICODE(temp);
8534 len = PyUnicode_GET_SIZE(temp);
8535 sign = 1;
8536 }
8537 else {
8538 Py_DECREF(iobj);
8539 }
8540 }
8541 }
8542 if (!isnumok) {
8543 PyErr_Format(PyExc_TypeError,
8544 "%%%c format: a number is required, "
8545 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8546 goto onError;
8547 }
8548 if (flags & F_ZERO)
8549 fill = '0';
8550 break;
8551
8552 case 'e':
8553 case 'E':
8554 case 'f':
8555 case 'F':
8556 case 'g':
8557 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008558 temp = formatfloat(v, flags, prec, c);
8559 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008560 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008561 pbuf = PyUnicode_AS_UNICODE(temp);
8562 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008563 sign = 1;
8564 if (flags & F_ZERO)
8565 fill = '0';
8566 break;
8567
8568 case 'c':
8569 pbuf = formatbuf;
8570 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8571 if (len < 0)
8572 goto onError;
8573 break;
8574
8575 default:
8576 PyErr_Format(PyExc_ValueError,
8577 "unsupported format character '%c' (0x%x) "
8578 "at index %zd",
8579 (31<=c && c<=126) ? (char)c : '?',
8580 (int)c,
8581 (Py_ssize_t)(fmt - 1 -
8582 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008583 goto onError;
8584 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008585 if (sign) {
8586 if (*pbuf == '-' || *pbuf == '+') {
8587 sign = *pbuf++;
8588 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008589 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008590 else if (flags & F_SIGN)
8591 sign = '+';
8592 else if (flags & F_BLANK)
8593 sign = ' ';
8594 else
8595 sign = 0;
8596 }
8597 if (width < len)
8598 width = len;
8599 if (rescnt - (sign != 0) < width) {
8600 reslen -= rescnt;
8601 rescnt = width + fmtcnt + 100;
8602 reslen += rescnt;
8603 if (reslen < 0) {
8604 Py_XDECREF(temp);
8605 PyErr_NoMemory();
8606 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008607 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008608 if (_PyUnicode_Resize(&result, reslen) < 0) {
8609 Py_XDECREF(temp);
8610 goto onError;
8611 }
8612 res = PyUnicode_AS_UNICODE(result)
8613 + reslen - rescnt;
8614 }
8615 if (sign) {
8616 if (fill != ' ')
8617 *res++ = sign;
8618 rescnt--;
8619 if (width > len)
8620 width--;
8621 }
8622 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8623 assert(pbuf[0] == '0');
8624 assert(pbuf[1] == c);
8625 if (fill != ' ') {
8626 *res++ = *pbuf++;
8627 *res++ = *pbuf++;
8628 }
8629 rescnt -= 2;
8630 width -= 2;
8631 if (width < 0)
8632 width = 0;
8633 len -= 2;
8634 }
8635 if (width > len && !(flags & F_LJUST)) {
8636 do {
8637 --rescnt;
8638 *res++ = fill;
8639 } while (--width > len);
8640 }
8641 if (fill == ' ') {
8642 if (sign)
8643 *res++ = sign;
8644 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8645 assert(pbuf[0] == '0');
8646 assert(pbuf[1] == c);
8647 *res++ = *pbuf++;
8648 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008649 }
8650 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008651 Py_UNICODE_COPY(res, pbuf, len);
8652 res += len;
8653 rescnt -= len;
8654 while (--width >= len) {
8655 --rescnt;
8656 *res++ = ' ';
8657 }
8658 if (dict && (argidx < arglen) && c != '%') {
8659 PyErr_SetString(PyExc_TypeError,
8660 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008661 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008662 goto onError;
8663 }
8664 Py_XDECREF(temp);
8665 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 } /* until end */
8667 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008668 PyErr_SetString(PyExc_TypeError,
8669 "not all arguments converted during string formatting");
8670 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
8672
Thomas Woutersa96affe2006-03-12 00:29:36 +00008673 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008674 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008676 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 }
8678 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 return (PyObject *)result;
8680
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 Py_XDECREF(result);
8683 Py_DECREF(uformat);
8684 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008685 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 }
8687 return NULL;
8688}
8689
8690static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 (readbufferproc) unicode_buffer_getreadbuf,
8692 (writebufferproc) unicode_buffer_getwritebuf,
8693 (segcountproc) unicode_buffer_getsegcount,
8694 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695};
8696
Jeremy Hylton938ace62002-07-17 16:30:39 +00008697static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008698unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8699
Tim Peters6d6c1a32001-08-02 04:15:00 +00008700static PyObject *
8701unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8702{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008703 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008704 static char *kwlist[] = {"string", "encoding", "errors", 0};
8705 char *encoding = NULL;
8706 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008707
Benjamin Peterson857ce152009-01-31 16:29:18 +00008708 if (type != &PyUnicode_Type)
8709 return unicode_subtype_new(type, args, kwds);
8710 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008711 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008712 return NULL;
8713 if (x == NULL)
8714 return (PyObject *)_PyUnicode_New(0);
8715 if (encoding == NULL && errors == NULL)
8716 return PyObject_Unicode(x);
8717 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008718 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008719}
8720
Guido van Rossume023fe02001-08-30 03:12:59 +00008721static PyObject *
8722unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8723{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008724 PyUnicodeObject *tmp, *pnew;
8725 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008726
Benjamin Peterson857ce152009-01-31 16:29:18 +00008727 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8728 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8729 if (tmp == NULL)
8730 return NULL;
8731 assert(PyUnicode_Check(tmp));
8732 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8733 if (pnew == NULL) {
8734 Py_DECREF(tmp);
8735 return NULL;
8736 }
8737 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8738 if (pnew->str == NULL) {
8739 _Py_ForgetReference((PyObject *)pnew);
8740 PyObject_Del(pnew);
8741 Py_DECREF(tmp);
8742 return PyErr_NoMemory();
8743 }
8744 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8745 pnew->length = n;
8746 pnew->hash = tmp->hash;
8747 Py_DECREF(tmp);
8748 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008749}
8750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008751PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008752 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008753\n\
8754Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008755encoding defaults to the current default string encoding.\n\
8756errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008757
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008759 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008760 "unicode", /* tp_name */
8761 sizeof(PyUnicodeObject), /* tp_size */
8762 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008764 (destructor)unicode_dealloc, /* tp_dealloc */
8765 0, /* tp_print */
8766 0, /* tp_getattr */
8767 0, /* tp_setattr */
8768 0, /* tp_compare */
8769 unicode_repr, /* tp_repr */
8770 &unicode_as_number, /* tp_as_number */
8771 &unicode_as_sequence, /* tp_as_sequence */
8772 &unicode_as_mapping, /* tp_as_mapping */
8773 (hashfunc) unicode_hash, /* tp_hash*/
8774 0, /* tp_call*/
8775 (reprfunc) unicode_str, /* tp_str */
8776 PyObject_GenericGetAttr, /* tp_getattro */
8777 0, /* tp_setattro */
8778 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008779 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008780 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008781 unicode_doc, /* tp_doc */
8782 0, /* tp_traverse */
8783 0, /* tp_clear */
8784 PyUnicode_RichCompare, /* tp_richcompare */
8785 0, /* tp_weaklistoffset */
8786 0, /* tp_iter */
8787 0, /* tp_iternext */
8788 unicode_methods, /* tp_methods */
8789 0, /* tp_members */
8790 0, /* tp_getset */
8791 &PyBaseString_Type, /* tp_base */
8792 0, /* tp_dict */
8793 0, /* tp_descr_get */
8794 0, /* tp_descr_set */
8795 0, /* tp_dictoffset */
8796 0, /* tp_init */
8797 0, /* tp_alloc */
8798 unicode_new, /* tp_new */
8799 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800};
8801
8802/* Initialize the Unicode implementation */
8803
Thomas Wouters78890102000-07-22 19:25:51 +00008804void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008806 int i;
8807
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008808 /* XXX - move this array to unicodectype.c ? */
8809 Py_UNICODE linebreak[] = {
8810 0x000A, /* LINE FEED */
8811 0x000D, /* CARRIAGE RETURN */
8812 0x001C, /* FILE SEPARATOR */
8813 0x001D, /* GROUP SEPARATOR */
8814 0x001E, /* RECORD SEPARATOR */
8815 0x0085, /* NEXT LINE */
8816 0x2028, /* LINE SEPARATOR */
8817 0x2029, /* PARAGRAPH SEPARATOR */
8818 };
8819
Fred Drakee4315f52000-05-09 19:53:39 +00008820 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008821 free_list = NULL;
8822 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008824 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008825 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008826
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008827 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008828 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008829 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008830 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008831 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008832
8833 /* initialize the linebreak bloom filter */
8834 bloom_linebreak = make_bloom_mask(
8835 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8836 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008837
8838 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839}
8840
8841/* Finalize the Unicode implementation */
8842
Christian Heimes3b718a72008-02-14 12:47:33 +00008843int
8844PyUnicode_ClearFreeList(void)
8845{
8846 int freelist_size = numfree;
8847 PyUnicodeObject *u;
8848
8849 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008850 PyUnicodeObject *v = u;
8851 u = *(PyUnicodeObject **)u;
8852 if (v->str)
8853 PyObject_DEL(v->str);
8854 Py_XDECREF(v->defenc);
8855 PyObject_Del(v);
8856 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008857 }
8858 free_list = NULL;
8859 assert(numfree == 0);
8860 return freelist_size;
8861}
8862
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863void
Thomas Wouters78890102000-07-22 19:25:51 +00008864_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008866 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008868 Py_XDECREF(unicode_empty);
8869 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008870
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008871 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008872 if (unicode_latin1[i]) {
8873 Py_DECREF(unicode_latin1[i]);
8874 unicode_latin1[i] = NULL;
8875 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008876 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008877 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008879
Anthony Baxterac6bd462006-04-13 02:06:09 +00008880#ifdef __cplusplus
8881}
8882#endif