blob: 0e378a5a40855ee8d6d10967b148b802d3251fd1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list.  unicode_dealloc() only puts
   an object on the free list while numfree is below this value. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the object's length, i.e. it is
   counted in Py_UNICODE units (see unicode_dealloc()).

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized word of each free object holds the next free object
   (see _PyUnicode_New() and unicode_dealloc()).  numfree counts the
   objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  A slot stays NULL until PyUnicode_FromUnicode() or
   PyUnicode_FromStringAndSize() first creates that character. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by code point (0x00 - 0x7F); a nonzero entry
   marks a whitespace character.  One row per 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
     *   0x000B LINE TABULATION, 0x000C FORM FEED,
     *   0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
     *   0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks: indexed by code point (0x00 - 0x7F), a nonzero
   entry marks a line break character.  The table is only ever read
   (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
     *   0x000A LINE FEED, 0x000B LINE TABULATION,
     *   0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each unicode character as the bit index
   (5, 6 or 7 bits, i.e. log2(BLOOM_WIDTH)). */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits actually used: the largest of
   32/64/128 that fits into an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM yields nonzero when the bit for ch is set in mask.  A set bit
   only means "possibly present" -- different characters share bits.
   Both arguments are parenthesized so that arbitrary expressions
   (e.g. BLOOM(a | b, ch)) expand correctly. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask
   is used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Nonzero when chr is a member of set: the bloom mask is consulted
   first so that the linear unicode_member() scan only runs on a
   possible hit.  The expansion is fully parenthesized so the macro can
   be negated or combined with other operators safely.  Note that chr
   is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000297 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000300 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Andrew Dalkee0df7622006-05-27 11:04:36 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
Neal Norwitze7d8be82008-07-31 17:17:14 +0000315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
318 }
319
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000329 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
332 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000333 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000334 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000337 }
338 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 }
340 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000341 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 if (unicode == NULL)
344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 }
348
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000349 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000350 PyErr_NoMemory();
351 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000352 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000353 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
359 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000360 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000362 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000378 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000380 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
385 }
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
389 }
390 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 }
395 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400}
401
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000402static
403int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404{
405 register PyUnicodeObject *v;
406
407 /* Argument checks */
408 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 PyErr_BadInternalCall();
410 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000412 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000414 PyErr_BadInternalCall();
415 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000421 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 }
432
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
436}
437
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000438int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
439{
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000456 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000457
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
468 }
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
471 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
Tim Petersced69f82003-09-16 20:30:58 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
477
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000480 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481
482 return (PyObject *)unicode;
483}
484
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000485PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
486{
487 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000488
Benjamin Peterson857ce152009-01-31 16:29:18 +0000489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000491 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 return NULL;
493 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000494
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
500
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000505 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000506
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
517 }
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
520 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000521
522 return PyUnicode_DecodeUTF8(u, size, NULL);
523 }
524
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
528
529 return (PyObject *)unicode;
530}
531
532PyObject *PyUnicode_FromString(const char *u)
533{
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
538 }
539
540 return PyUnicode_FromStringAndSize(u, size);
541}
542
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543#ifdef HAVE_WCHAR_H
544
Mark Dickinson6b265f12009-03-18 16:07:26 +0000545#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546# define CONVERT_WCHAR_TO_SURROGATES
547#endif
548
549#ifdef CONVERT_WCHAR_TO_SURROGATES
550
551/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
553
554PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
556{
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
561
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
573 }
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
578
579 /* Copy the wchar_t data into the new object */
580 {
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
589 }
590 else
591 *u++ = *w++;
592 }
593 }
594 return (PyObject *)unicode;
595}
596
597#else
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000600 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601{
602 PyUnicodeObject *unicode;
603
604 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 PyErr_BadInternalCall();
606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 }
608
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614#ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000616#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624#endif
625
626 return (PyObject *)unicode;
627}
628
Mark Dickinson6b265f12009-03-18 16:07:26 +0000629#endif /* CONVERT_WCHAR_TO_SURROGATES */
630
631#undef CONVERT_WCHAR_TO_SURROGATES
632
/* Build a printf-style conversion specification in fmt.

   The result is "%", then (only when width is nonzero) an optional "0"
   flag and the width, then "." and the precision when precision is
   nonzero, then a length modifier ("l" if longflag is set, else
   PY_FORMAT_SIZE_T if size_tflag is set), then the conversion
   character c, and a terminating NUL.

   zeropad is ignored when width is 0, and a precision of 0 cannot be
   expressed.  The function performs no bounds checking: the caller
   must supply a fmt buffer large enough for the result. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        /* sprintf returns the number of characters written */
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        /* PY_FORMAT_SIZE_T is a string literal, so it must only be
           read through a pointer to const. */
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}
654
/* Append the NUL-terminated char string `string` to the output at s,
   advancing s past the copied characters.  Relies on the variables
   `copy` and `s` of the enclosing function.  Wrapped in
   do { } while (0) so that "appendstring(x);" is a single statement
   and stays safe inside if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
656
657PyObject *
658PyUnicode_FromFormatV(const char *format, va_list vargs)
659{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000680
681#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000682 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000683#else
684#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000687 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000688#endif
689#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000697 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
703 if (*f == 's')
704 ++callcount;
705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000706 }
707 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
714 }
715 callresult = callresults;
716 }
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
729 */
730 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
755 {
756 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000757 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000764 break;
765 }
766 case 'U':
767 {
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
772 }
773 case 'V':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
784 }
785 case 'S':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
797 }
798 case 'R':
799 {
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
810 }
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
817 */
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
829 }
830 } else
831 n++;
832 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000833 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
839 }
840 realbuffer = abuffer;
841 }
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000851
Benjamin Peterson857ce152009-01-31 16:29:18 +0000852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000854
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
870 }
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
876 }
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
881 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000882
Benjamin Peterson857ce152009-01-31 16:29:18 +0000883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
918 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000928 break;
929 }
930 case 'U':
931 {
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
937 }
938 case 'V':
939 {
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
948 }
949 break;
950 }
951 case 'S':
952 case 'R':
953 {
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
968 }
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
978 }
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
987 }
988 } else
989 *s++ = *f;
990 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000992 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1005 }
1006 PyObject_Free(callresults);
1007 }
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011}
1012
1013#undef appendstring
1014
1015PyObject *
1016PyUnicode_FromFormat(const char *format, ...)
1017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 PyObject* ret;
1019 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001020
1021#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001025#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001029}
1030
Martin v. Löwis18e16552006-02-15 17:27:45 +00001031Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001032 wchar_t *w,
1033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyErr_BadInternalCall();
1037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039
1040 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044#ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046#else
1047 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 }
1054#endif
1055
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060}
1061
1062#endif
1063
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064PyObject *PyUnicode_FromOrdinal(int ordinal)
1065{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001066 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067
1068#ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074 }
1075#else
1076 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#endif
1083
Hye-Shik Chang40574832004-04-06 07:24:51 +00001084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001086}
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_FromObject(register PyObject *obj)
1089{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001092 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 Py_INCREF(obj);
1094 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 }
1096 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103}
1104
1105PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 const char *encoding,
1107 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001110 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 PyErr_BadInternalCall();
1115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118#if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001123
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001126
1127 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 return PyObject_Unicode(obj);
1135 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136#else
1137 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#endif
1143
1144 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001145 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 }
Christian Heimes3497f942008-05-26 12:29:14 +00001149 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 }
Tim Petersced69f82003-09-16 20:30:58 +00001165
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001173
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 return v;
1175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184{
1185 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186
1187 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001189
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001211 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245 return NULL;
1246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252{
1253 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1261}
1262
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1266{
1267 PyObject *v;
1268
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1272 }
1273
1274 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001283 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284 return NULL;
1285}
1286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
Fred Drakee4315f52000-05-09 19:53:39 +00001297
Tim Petersced69f82003-09-16 20:30:58 +00001298 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001300
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001307#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001319 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001321 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001322 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 Py_DECREF(v);
1324 goto onError;
1325 }
1326 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 return NULL;
1330}
1331
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001333 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001334{
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_AS_UNICODE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return NULL;
1355}
1356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358{
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1362 }
1363 return PyUnicode_GET_SIZE(unicode);
1364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 return -1;
1367}
1368
Thomas Wouters78890102000-07-22 19:25:51 +00001369const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001370{
1371 return unicode_default_encoding;
1372}
1373
1374int PyUnicode_SetDefaultEncoding(const char *encoding)
1375{
1376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001377
Fred Drakee4315f52000-05-09 19:53:39 +00001378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 encoding,
1386 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001387 return 0;
1388
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return -1;
1391}
1392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393/* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001395 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 and adjust various state variables.
1397 return 0 on success, -1 on error
1398*/
1399
1400static
1401int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 int res = -1;
1417
1418 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 }
1423
1424 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001425 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 }
1430 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 }
1438
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 }
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001450 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 }
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_XDECREF(restuple);
1479 return res;
1480}
1481
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482/* --- UTF-7 Codec -------------------------------------------------------- */
1483
Antoine Pitrou653dece2009-05-04 18:32:32 +00001484/* See RFC2152 for details. We encode conservatively and decode liberally. */
1485
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The ASCII ranges are spelled out instead of calling isalnum():
 * isalnum() depends on the current locale (so it may accept non-ASCII
 * bytes) and is undefined for values that are neither EOF nor
 * representable as unsigned char, which Py_UNICODE values can be. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1513
1514/* The UTF-7 encoder treats ASCII characters differently according to
1515 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1516 * the above). See RFC2152. This array identifies these different
1517 * sets:
1518 * 0 : "Set D"
1519 * alphanumeric and '(),-./:?
1520 * 1 : "Set O"
1521 * !"#$%&*;<=>@[]^_`{|}
1522 * 2 : "whitespace"
1523 * ht nl cr sp
1524 * 3 : special (must be base64 encoded)
1525 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1526 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001527
/* Category of each ASCII code (0-127) for the UTF-7 encoder:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special.  The table is
   only ever read, so it is declared const. */
static const char utf7_category[128] = {
/*  nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/*  dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/*  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1547
Antoine Pitrou653dece2009-05-04 18:32:32 +00001548/* ENCODE_DIRECT: this character should be encoded as itself. The
1549 * answer depends on whether we are encoding set O as itself, and also
1550 * on whether we are encoding whitespace as itself. RFC2152 makes it
1551 * clear that the answers to these questions vary between
1552 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001553
/* True if c (1..127) may be written to the output unchanged: always
   for category 0, for category 2 only when directWS is true, and for
   category 1 only when directO is true.  Both flag arguments are
   parenthesized so that compound expressions (e.g. "a || b") keep
   their meaning.  c is evaluated several times. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001561 Py_ssize_t size,
1562 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565}
1566
Antoine Pitrou653dece2009-05-04 18:32:32 +00001567/* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
1573
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001574PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
1587 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001605 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 e = s + size;
1607
1608 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610
Antoine Pitrou653dece2009-05-04 18:32:32 +00001611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625#ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628#else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631#endif
1632 surrogate = 0;
1633 }
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1638 }
1639 }
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1643 }
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1647 }
1648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Lookup table indexed by the first byte of a UTF-8 sequence; it is
   only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001900 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1925
1926 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 s++;
1932 continue;
1933 }
1934
1935 n = utf8_code_length[ch];
1936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001946 goto utf8Error;
1947 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
1950 switch (n) {
1951
1952 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001953 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001968 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001982 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001990 endinpos = startinpos + 1;
1991
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001998 goto utf8Error;
1999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2020 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 goto utf8Error;
2022 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
2043 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 }
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 goto onError;
2061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return (PyObject *)unicode;
2065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002066 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_DECREF(unicode);
2070 return NULL;
2071}
2072
Tim Peters602f7402002-04-27 18:03:26 +00002073/* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002077*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002078PyObject *
2079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002080 Py_ssize_t size,
2081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082{
Tim Peters602f7402002-04-27 18:03:26 +00002083#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002084
Martin v. Löwis18e16552006-02-15 17:27:45 +00002085 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002089 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002090 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 assert(s != NULL);
2093 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094
Tim Peters602f7402002-04-27 18:03:26 +00002095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2099 */
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2103 }
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002109 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002110 if (v == NULL)
2111 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002112 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002113 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002114
Tim Peters602f7402002-04-27 18:03:26 +00002115 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002117
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002119 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127 else {
Tim Peters602f7402002-04-27 18:03:26 +00002128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002137 i++;
2138 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (v == NULL) {
2157 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002158 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
2162 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002170
Tim Peters602f7402002-04-27 18:03:26 +00002171#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172}
2173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183}
2184
Walter Dörwald6e390802007-08-17 16:41:28 +00002185/* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187PyObject *
2188PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002192{
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194}
2195
2196PyObject *
2197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002202{
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002210 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002211 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002212#else
2213 const int pairs = 0;
2214#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002215 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002216 int bo = 0; /* assume native ordering by default */
2217 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002218 /* Offsets from q for retrieving bytes in the right order. */
2219#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220 int iorder[] = {0, 1, 2, 3};
2221#else
2222 int iorder[] = {3, 2, 1, 0};
2223#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002224 PyObject *errorHandler = NULL;
2225 PyObject *exc = NULL;
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002226
Walter Dörwald6e390802007-08-17 16:41:28 +00002227 q = (unsigned char *)s;
2228 e = q + size;
2229
2230 if (byteorder)
2231 bo = *byteorder;
2232
2233 /* Check for BOM marks (U+FEFF) in the input and adjust current
2234 byte order setting accordingly. In native mode, the leading BOM
2235 mark is skipped, in all other modes, it is copied to the output
2236 stream as-is (giving a ZWNBSP character). */
2237 if (bo == 0) {
2238 if (size >= 4) {
2239 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002240 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = -1;
2245 }
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = 1;
2249 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002250#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002251 if (bom == 0x0000FEFF) {
2252 q += 4;
2253 bo = 1;
2254 }
2255 else if (bom == 0xFFFE0000) {
2256 q += 4;
2257 bo = -1;
2258 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002261 }
2262
2263 if (bo == -1) {
2264 /* force LE */
2265 iorder[0] = 0;
2266 iorder[1] = 1;
2267 iorder[2] = 2;
2268 iorder[3] = 3;
2269 }
2270 else if (bo == 1) {
2271 /* force BE */
2272 iorder[0] = 3;
2273 iorder[1] = 2;
2274 iorder[2] = 1;
2275 iorder[3] = 0;
2276 }
2277
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002278 /* On narrow builds we split characters outside the BMP into two
2279 codepoints => count how much extra space we need. */
2280#ifndef Py_UNICODE_WIDE
2281 for (qq = q; qq < e; qq += 4)
2282 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2283 pairs++;
2284#endif
2285
2286 /* This might be one to much, because of a BOM */
2287 unicode = _PyUnicode_New((size+3)/4+pairs);
2288 if (!unicode)
2289 return NULL;
2290 if (size == 0)
2291 return (PyObject *)unicode;
2292
2293 /* Unpack UTF-32 encoded data */
2294 p = unicode->str;
2295
Walter Dörwald6e390802007-08-17 16:41:28 +00002296 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2308 }
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002311
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (ch >= 0x110000)
2313 {
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2318 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002319#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002320 if (ch >= 0x10000)
2321 {
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324 }
2325 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002326#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002336 &unicode, &outpos, &p))
2337 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002338 }
2339
2340 if (byteorder)
2341 *byteorder = bo;
2342
2343 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002345
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2349
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002354 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2359}
2360
2361PyObject *
2362PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002366{
2367 PyObject *v;
2368 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002369 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002370#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002371 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#else
2373 const int pairs = 0;
2374#endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378#else
2379 int iorder[] = {3, 2, 1, 0};
2380#endif
2381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382#define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002389 } while(0)
2390
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393#ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002398#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (v == NULL)
2405 return NULL;
2406
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002407 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 if (size == 0)
2411 return v;
2412
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2419 }
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2426 }
2427
2428 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002429 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002430#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2437 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002438 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002439#endif
2440 STORECHAR(ch);
2441 }
2442 return v;
2443#undef STORECHAR
2444}
2445
2446PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447{
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2451 }
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2455 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002456}
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458/* --- UTF-16 Codec ------------------------------------------------------- */
2459
Tim Peters772747b2001-08-09 22:21:55 +00002460PyObject *
2461PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465{
Walter Dörwald69652032004-09-07 20:24:22 +00002466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467}
2468
2469PyObject *
2470PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002485 /* Offsets from q for retrieving byte pairs in the right order. */
2486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488#else
2489 int ihi = 0, ilo = 1;
2490#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2501
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002504 q = (unsigned char *)s;
2505 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002508 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2521 }
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2530 }
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002535#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538
Tim Peters772747b2001-08-09 22:21:55 +00002539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2543 }
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2548 }
2549
2550 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2562 }
2563 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564
Benjamin Peterson857ce152009-01-31 16:29:18 +00002565 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2570 }
2571
2572 /* UTF-16 code pair: */
2573 if (q >= e) {
2574 errmsg = "unexpected end of data";
2575 startinpos = (((const char *)q)-2)-starts;
2576 endinpos = ((const char *)e)-starts;
2577 goto utf16Error;
2578 }
2579 if (0xD800 <= ch && ch <= 0xDBFF) {
2580 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2581 q += 2;
2582 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002583#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002584 *p++ = ch;
2585 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589 continue;
2590 }
2591 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002592 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002593 startinpos = (((const char *)q)-4)-starts;
2594 endinpos = startinpos+2;
2595 goto utf16Error;
2596 }
2597
Benjamin Peterson857ce152009-01-31 16:29:18 +00002598 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002599 errmsg = "illegal encoding";
2600 startinpos = (((const char *)q)-2)-starts;
2601 endinpos = startinpos+2;
2602 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002603
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002604 utf16Error:
2605 outpos = p-PyUnicode_AS_UNICODE(unicode);
2606 if (unicode_decode_call_errorhandler(
2607 errors, &errorHandler,
2608 "utf16", errmsg,
2609 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2610 &unicode, &outpos, &p))
2611 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 }
2613
2614 if (byteorder)
2615 *byteorder = bo;
2616
Walter Dörwald69652032004-09-07 20:24:22 +00002617 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002618 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002619
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002621 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 goto onError;
2623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 return (PyObject *)unicode;
2627
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002628 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 Py_XDECREF(errorHandler);
2631 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 return NULL;
2633}
2634
Tim Peters772747b2001-08-09 22:21:55 +00002635PyObject *
2636PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002637 Py_ssize_t size,
2638 const char *errors,
2639 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640{
2641 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002642 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002643 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002644#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002645 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002646#else
2647 const int pairs = 0;
2648#endif
Tim Peters772747b2001-08-09 22:21:55 +00002649 /* Offsets from p for storing byte pairs in the right order. */
2650#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int ihi = 1, ilo = 0;
2652#else
2653 int ihi = 0, ilo = 1;
2654#endif
2655
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002656#define STORECHAR(CH) \
2657 do { \
2658 p[ihi] = ((CH) >> 8) & 0xff; \
2659 p[ilo] = (CH) & 0xff; \
2660 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002661 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002663#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002664 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002665 if (s[i] >= 0x10000)
2666 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002667#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002668 /* 2 * (size + pairs + (byteorder == 0)) */
2669 if (size > PY_SSIZE_T_MAX ||
2670 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002671 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002672 nsize = size + pairs + (byteorder == 0);
2673 bytesize = nsize * 2;
2674 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002675 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002676 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 if (v == NULL)
2678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002680 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002682 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002683 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002684 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002685
2686 if (byteorder == -1) {
2687 /* force LE */
2688 ihi = 1;
2689 ilo = 0;
2690 }
2691 else if (byteorder == 1) {
2692 /* force BE */
2693 ihi = 0;
2694 ilo = 1;
2695 }
2696
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002697 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002698 Py_UNICODE ch = *s++;
2699 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002700#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 if (ch >= 0x10000) {
2702 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2703 ch = 0xD800 | ((ch-0x10000) >> 10);
2704 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002705#endif
Tim Peters772747b2001-08-09 22:21:55 +00002706 STORECHAR(ch);
2707 if (ch2)
2708 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002711#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712}
2713
2714PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2715{
2716 if (!PyUnicode_Check(unicode)) {
2717 PyErr_BadArgument();
2718 return NULL;
2719 }
2720 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002721 PyUnicode_GET_SIZE(unicode),
2722 NULL,
2723 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724}
2725
2726/* --- Unicode Escape Codec ----------------------------------------------- */
2727
Fredrik Lundh06d12682001-01-24 07:59:11 +00002728static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002729
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002731 Py_ssize_t size,
2732 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002735 Py_ssize_t startinpos;
2736 Py_ssize_t endinpos;
2737 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002742 char* message;
2743 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 PyObject *errorHandler = NULL;
2745 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 /* Escaped strings will always be longer than the resulting
2748 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 length after conversion to the true value.
2750 (but if the error callback returns a long replacement string
2751 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 v = _PyUnicode_New(size);
2753 if (v == NULL)
2754 goto onError;
2755 if (size == 0)
2756 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002760
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 while (s < end) {
2762 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002763 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765
2766 /* Non-escape characters are interpreted as Unicode ordinals */
2767 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002768 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 continue;
2770 }
2771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 /* \ - Escapes */
2774 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002775 c = *s++;
2776 if (s > end)
2777 c = '\0'; /* Invalid after \ */
2778 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002780 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 case '\n': break;
2782 case '\\': *p++ = '\\'; break;
2783 case '\'': *p++ = '\''; break;
2784 case '\"': *p++ = '\"'; break;
2785 case 'b': *p++ = '\b'; break;
2786 case 'f': *p++ = '\014'; break; /* FF */
2787 case 't': *p++ = '\t'; break;
2788 case 'n': *p++ = '\n'; break;
2789 case 'r': *p++ = '\r'; break;
2790 case 'v': *p++ = '\013'; break; /* VT */
2791 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2792
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002793 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 case '0': case '1': case '2': case '3':
2795 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002796 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002797 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002798 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002799 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 break;
2804
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002805 /* hex escapes */
2806 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808 digits = 2;
2809 message = "truncated \\xXX escape";
2810 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002812 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 digits = 4;
2815 message = "truncated \\uXXXX escape";
2816 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002818 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002819 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820 digits = 8;
2821 message = "truncated \\UXXXXXXXX escape";
2822 hexescape:
2823 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (s+digits>end) {
2826 endinpos = size;
2827 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002828 errors, &errorHandler,
2829 "unicodeescape", "end of string in escape sequence",
2830 starts, size, &startinpos, &endinpos, &exc, &s,
2831 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 goto onError;
2833 goto nextByte;
2834 }
2835 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002836 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002837 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 endinpos = (s+i+1)-starts;
2839 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002840 errors, &errorHandler,
2841 "unicodeescape", message,
2842 starts, size, &startinpos, &endinpos, &exc, &s,
2843 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002846 }
2847 chr = (chr<<4) & ~0xF;
2848 if (c >= '0' && c <= '9')
2849 chr += c - '0';
2850 else if (c >= 'a' && c <= 'f')
2851 chr += 10 + c - 'a';
2852 else
2853 chr += 10 + c - 'A';
2854 }
2855 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002856 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 /* _decoding_error will have already written into the
2858 target buffer. */
2859 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002860 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002861 /* when we get here, chr is a 32-bit unicode character */
2862 if (chr <= 0xffff)
2863 /* UCS-2 character */
2864 *p++ = (Py_UNICODE) chr;
2865 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002866 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002867 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002868#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002869 *p++ = chr;
2870#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002871 chr -= 0x10000L;
2872 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002873 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002874#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002875 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002876 endinpos = s-starts;
2877 outpos = p-PyUnicode_AS_UNICODE(v);
2878 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002879 errors, &errorHandler,
2880 "unicodeescape", "illegal Unicode character",
2881 starts, size, &startinpos, &endinpos, &exc, &s,
2882 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002883 goto onError;
2884 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 break;
2886
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002887 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 case 'N':
2889 message = "malformed \\N character escape";
2890 if (ucnhash_CAPI == NULL) {
2891 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002892 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002893 if (ucnhash_CAPI == NULL)
2894 goto ucnhashError;
2895 }
2896 if (*s == '{') {
2897 const char *start = s+1;
2898 /* look for the closing brace */
2899 while (*s != '}' && s < end)
2900 s++;
2901 if (s > start && s < end && *s == '}') {
2902 /* found a name. look it up in the unicode database */
2903 message = "unknown Unicode character name";
2904 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002905 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002906 goto store;
2907 }
2908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 endinpos = s-starts;
2910 outpos = p-PyUnicode_AS_UNICODE(v);
2911 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002912 errors, &errorHandler,
2913 "unicodeescape", message,
2914 starts, size, &startinpos, &endinpos, &exc, &s,
2915 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002917 break;
2918
2919 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002920 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 message = "\\ at end of string";
2922 s--;
2923 endinpos = s-starts;
2924 outpos = p-PyUnicode_AS_UNICODE(v);
2925 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002926 errors, &errorHandler,
2927 "unicodeescape", message,
2928 starts, size, &startinpos, &endinpos, &exc, &s,
2929 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002930 goto onError;
2931 }
2932 else {
2933 *p++ = '\\';
2934 *p++ = (unsigned char)s[-1];
2935 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002936 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002938 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002941 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002943 Py_XDECREF(errorHandler);
2944 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002946
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002947 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002948 PyErr_SetString(
2949 PyExc_UnicodeError,
2950 "\\N escapes not supported (can't load unicodedata module)"
2951 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002952 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 Py_XDECREF(errorHandler);
2954 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002955 return NULL;
2956
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002957 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 Py_XDECREF(errorHandler);
2960 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 return NULL;
2962}
2963
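/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   (This describes unicodeescape_string() further below; findchar() is a
   small helper that unicodeescape_string() uses to pick the quote
   character.)

*/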
2970
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002971Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002972 Py_ssize_t size,
2973 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002974{
2975 /* like wcschr, but doesn't stop at NULL characters */
2976
2977 while (size-- > 0) {
2978 if (*s == ch)
2979 return s;
2980 s++;
2981 }
2982
2983 return NULL;
2984}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002985
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986static
2987PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002988 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 int quotes)
2990{
2991 PyObject *repr;
2992 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002994 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002995#ifdef Py_UNICODE_WIDE
2996 const Py_ssize_t expandsize = 10;
2997#else
2998 const Py_ssize_t expandsize = 6;
2999#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000
Neal Norwitz17753ec2006-08-21 22:21:19 +00003001 /* XXX(nnorwitz): rather than over-allocating, it would be
3002 better to choose a different scheme. Perhaps scan the
3003 first N-chars of the string and allocate based on that size.
3004 */
3005 /* Initial allocation is based on the longest-possible unichr
3006 escape.
3007
3008 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3009 unichr, so in this case it's the longest unichr escape. In
3010 narrow (UTF-16) builds this is five chars per source unichr
3011 since there are two unichrs in the surrogate pair, so in narrow
3012 (UTF-16) builds it's not the longest unichr escape.
3013
3014 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3015 so in the narrow (UTF-16) build case it's the longest unichr
3016 escape.
3017 */
3018
Neal Norwitze7d8be82008-07-31 17:17:14 +00003019 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003020 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003021
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003022 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003023 2
3024 + expandsize*size
3025 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 if (repr == NULL)
3027 return NULL;
3028
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003029 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030
3031 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003033 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 !findchar(s, size, '"')) ? '"' : '\'';
3035 }
3036 while (size-- > 0) {
3037 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003038
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003039 /* Escape quotes and backslashes */
3040 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003041 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 *p++ = '\\';
3043 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003045 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003046
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003047#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003048 /* Map 21-bit characters to '\U00xxxxxx' */
3049 else if (ch >= 0x10000) {
3050 *p++ = '\\';
3051 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003052 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3058 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003059 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003060 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003061 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003062#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003063 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3064 else if (ch >= 0xD800 && ch < 0xDC00) {
3065 Py_UNICODE ch2;
3066 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003067
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003068 ch2 = *s++;
3069 size--;
3070 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3071 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3072 *p++ = '\\';
3073 *p++ = 'U';
3074 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3080 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3081 *p++ = hexdigit[ucs & 0x0000000F];
3082 continue;
3083 }
3084 /* Fall through: isolated surrogates are copied as-is */
3085 s--;
3086 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003087 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003088#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003091 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 *p++ = '\\';
3093 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003094 *p++ = hexdigit[(ch >> 12) & 0x000F];
3095 *p++ = hexdigit[(ch >> 8) & 0x000F];
3096 *p++ = hexdigit[(ch >> 4) & 0x000F];
3097 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003099
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003100 /* Map special whitespace to '\t', \n', '\r' */
3101 else if (ch == '\t') {
3102 *p++ = '\\';
3103 *p++ = 't';
3104 }
3105 else if (ch == '\n') {
3106 *p++ = '\\';
3107 *p++ = 'n';
3108 }
3109 else if (ch == '\r') {
3110 *p++ = '\\';
3111 *p++ = 'r';
3112 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003113
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003114 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003115 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003117 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003118 *p++ = hexdigit[(ch >> 4) & 0x000F];
3119 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003120 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003121
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 /* Copy everything else as-is */
3123 else
3124 *p++ = (char) ch;
3125 }
3126 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003127 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128
3129 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003130 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 return repr;
3133}
3134
3135PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003136 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137{
3138 return unicodeescape_string(s, size, 0);
3139}
3140
3141PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3142{
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 return NULL;
3146 }
3147 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003148 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149}
3150
3151/* --- Raw Unicode Escape Codec ------------------------------------------- */
3152
3153PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003154 Py_ssize_t size,
3155 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003158 Py_ssize_t startinpos;
3159 Py_ssize_t endinpos;
3160 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 const char *end;
3164 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 PyObject *errorHandler = NULL;
3166 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003167
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 /* Escaped strings will always be longer than the resulting
3169 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 length after conversion to the true value. (But decoding error
3171 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 v = _PyUnicode_New(size);
3173 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003176 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 end = s + size;
3179 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003180 unsigned char c;
3181 Py_UCS4 x;
3182 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003183 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003185 /* Non-escape characters are interpreted as Unicode ordinals */
3186 if (*s != '\\') {
3187 *p++ = (unsigned char)*s++;
3188 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003189 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003190 startinpos = s-starts;
3191
3192 /* \u-escapes are only interpreted iff the number of leading
3193 backslashes if odd */
3194 bs = s;
3195 for (;s < end;) {
3196 if (*s != '\\')
3197 break;
3198 *p++ = (unsigned char)*s++;
3199 }
3200 if (((s - bs) & 1) == 0 ||
3201 s >= end ||
3202 (*s != 'u' && *s != 'U')) {
3203 continue;
3204 }
3205 p--;
3206 count = *s=='u' ? 4 : 8;
3207 s++;
3208
3209 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3210 outpos = p-PyUnicode_AS_UNICODE(v);
3211 for (x = 0, i = 0; i < count; ++i, ++s) {
3212 c = (unsigned char)*s;
3213 if (!isxdigit(c)) {
3214 endinpos = s-starts;
3215 if (unicode_decode_call_errorhandler(
3216 errors, &errorHandler,
3217 "rawunicodeescape", "truncated \\uXXXX",
3218 starts, size, &startinpos, &endinpos, &exc, &s,
3219 &v, &outpos, &p))
3220 goto onError;
3221 goto nextByte;
3222 }
3223 x = (x<<4) & ~0xF;
3224 if (c >= '0' && c <= '9')
3225 x += c - '0';
3226 else if (c >= 'a' && c <= 'f')
3227 x += 10 + c - 'a';
3228 else
3229 x += 10 + c - 'A';
3230 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003231 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 /* UCS-2 character */
3233 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003234 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003235 /* UCS-4 character. Either store directly, or as
3236 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003237#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003239#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003240 x -= 0x10000L;
3241 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3242 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003243#endif
3244 } else {
3245 endinpos = s-starts;
3246 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003247 if (unicode_decode_call_errorhandler(
3248 errors, &errorHandler,
3249 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003250 starts, size, &startinpos, &endinpos, &exc, &s,
3251 &v, &outpos, &p))
3252 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003253 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003254 nextByte:
3255 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003257 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259 Py_XDECREF(errorHandler);
3260 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003262
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003263 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 Py_XDECREF(errorHandler);
3266 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 return NULL;
3268}
3269
3270PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003271 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272{
3273 PyObject *repr;
3274 char *p;
3275 char *q;
3276
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003277 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003279 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003281 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003282#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003283
Neal Norwitze7d8be82008-07-31 17:17:14 +00003284 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003285 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003286
Neal Norwitze7d8be82008-07-31 17:17:14 +00003287 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 if (repr == NULL)
3289 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003290 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003291 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003293 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 while (size-- > 0) {
3295 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003296#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003297 /* Map 32-bit characters to '\Uxxxxxxxx' */
3298 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003299 *p++ = '\\';
3300 *p++ = 'U';
3301 *p++ = hexdigit[(ch >> 28) & 0xf];
3302 *p++ = hexdigit[(ch >> 24) & 0xf];
3303 *p++ = hexdigit[(ch >> 20) & 0xf];
3304 *p++ = hexdigit[(ch >> 16) & 0xf];
3305 *p++ = hexdigit[(ch >> 12) & 0xf];
3306 *p++ = hexdigit[(ch >> 8) & 0xf];
3307 *p++ = hexdigit[(ch >> 4) & 0xf];
3308 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003309 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003310 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003311#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003312 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3313 if (ch >= 0xD800 && ch < 0xDC00) {
3314 Py_UNICODE ch2;
3315 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003316
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003317 ch2 = *s++;
3318 size--;
3319 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3320 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3321 *p++ = '\\';
3322 *p++ = 'U';
3323 *p++ = hexdigit[(ucs >> 28) & 0xf];
3324 *p++ = hexdigit[(ucs >> 24) & 0xf];
3325 *p++ = hexdigit[(ucs >> 20) & 0xf];
3326 *p++ = hexdigit[(ucs >> 16) & 0xf];
3327 *p++ = hexdigit[(ucs >> 12) & 0xf];
3328 *p++ = hexdigit[(ucs >> 8) & 0xf];
3329 *p++ = hexdigit[(ucs >> 4) & 0xf];
3330 *p++ = hexdigit[ucs & 0xf];
3331 continue;
3332 }
3333 /* Fall through: isolated surrogates are copied as-is */
3334 s--;
3335 size++;
3336 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003337#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003338 /* Map 16-bit characters to '\uxxxx' */
3339 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 *p++ = '\\';
3341 *p++ = 'u';
3342 *p++ = hexdigit[(ch >> 12) & 0xf];
3343 *p++ = hexdigit[(ch >> 8) & 0xf];
3344 *p++ = hexdigit[(ch >> 4) & 0xf];
3345 *p++ = hexdigit[ch & 15];
3346 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003347 /* Copy everything else as-is */
3348 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 *p++ = (char) ch;
3350 }
3351 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003352 if (_PyString_Resize(&repr, p - q))
3353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 return repr;
3355}
3356
3357PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3358{
3359 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360 PyErr_BadArgument();
3361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 }
3363 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365}
3366
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003367/* --- Unicode Internal Codec ------------------------------------------- */
3368
3369PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003370 Py_ssize_t size,
3371 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003372{
3373 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003374 Py_ssize_t startinpos;
3375 Py_ssize_t endinpos;
3376 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003377 PyUnicodeObject *v;
3378 Py_UNICODE *p;
3379 const char *end;
3380 const char *reason;
3381 PyObject *errorHandler = NULL;
3382 PyObject *exc = NULL;
3383
Neal Norwitzd43069c2006-01-08 01:12:10 +00003384#ifdef Py_UNICODE_WIDE
3385 Py_UNICODE unimax = PyUnicode_GetMax();
3386#endif
3387
Armin Rigo7ccbca92006-10-04 12:17:45 +00003388 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3390 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003392 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003393 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003394 p = PyUnicode_AS_UNICODE(v);
3395 end = s + size;
3396
3397 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003398 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003399 /* We have to sanity check the raw data, otherwise doom looms for
3400 some malformed UCS-4 data. */
3401 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003402#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003405 end-s < Py_UNICODE_SIZE
3406 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003407 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003408 startinpos = s - starts;
3409 if (end-s < Py_UNICODE_SIZE) {
3410 endinpos = end-starts;
3411 reason = "truncated input";
3412 }
3413 else {
3414 endinpos = s - starts + Py_UNICODE_SIZE;
3415 reason = "illegal code point (> 0x10FFFF)";
3416 }
3417 outpos = p - PyUnicode_AS_UNICODE(v);
3418 if (unicode_decode_call_errorhandler(
3419 errors, &errorHandler,
3420 "unicode_internal", reason,
3421 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003422 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003423 goto onError;
3424 }
3425 }
3426 else {
3427 p++;
3428 s += Py_UNICODE_SIZE;
3429 }
3430 }
3431
Martin v. Löwis412fb672006-04-13 06:34:32 +00003432 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003433 goto onError;
3434 Py_XDECREF(errorHandler);
3435 Py_XDECREF(exc);
3436 return (PyObject *)v;
3437
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003438 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003439 Py_XDECREF(v);
3440 Py_XDECREF(errorHandler);
3441 Py_XDECREF(exc);
3442 return NULL;
3443}
3444
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445/* --- Latin-1 Codec ------------------------------------------------------ */
3446
3447PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003448 Py_ssize_t size,
3449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450{
3451 PyUnicodeObject *v;
3452 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003453
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003455 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003456 Py_UNICODE r = *(unsigned char*)s;
3457 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003458 }
3459
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 v = _PyUnicode_New(size);
3461 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 p = PyUnicode_AS_UNICODE(v);
3466 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003467 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003469
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003470 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 Py_XDECREF(v);
3472 return NULL;
3473}
3474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475/* create or adjust a UnicodeEncodeError */
3476static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003477 const char *encoding,
3478 const Py_UNICODE *unicode, Py_ssize_t size,
3479 Py_ssize_t startpos, Py_ssize_t endpos,
3480 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003483 *exceptionObject = PyUnicodeEncodeError_Create(
3484 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 }
3486 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003487 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3488 goto onError;
3489 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3490 goto onError;
3491 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3492 goto onError;
3493 return;
3494 onError:
3495 Py_DECREF(*exceptionObject);
3496 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 }
3498}
3499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500/* raises a UnicodeEncodeError */
3501static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003502 const char *encoding,
3503 const Py_UNICODE *unicode, Py_ssize_t size,
3504 Py_ssize_t startpos, Py_ssize_t endpos,
3505 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506{
3507 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003510 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511}
3512
3513/* error handling callback helper:
3514 build arguments, call the callback and check the arguments,
3515 put the result into newpos and return the replacement string, which
3516 has to be freed by the caller */
3517static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003518 PyObject **errorHandler,
3519 const char *encoding, const char *reason,
3520 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3521 Py_ssize_t startpos, Py_ssize_t endpos,
3522 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525
3526 PyObject *restuple;
3527 PyObject *resunicode;
3528
3529 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003532 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 }
3534
3535 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539
3540 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003541 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003545 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003546 Py_DECREF(restuple);
3547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 }
3549 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 &resunicode, newpos)) {
3551 Py_DECREF(restuple);
3552 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 }
3554 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003555 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003556 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003557 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3558 Py_DECREF(restuple);
3559 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003560 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 Py_INCREF(resunicode);
3562 Py_DECREF(restuple);
3563 return resunicode;
3564}
3565
3566static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003567 Py_ssize_t size,
3568 const char *errors,
3569 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570{
3571 /* output object */
3572 PyObject *res;
3573 /* pointers to the beginning and end+1 of input */
3574 const Py_UNICODE *startp = p;
3575 const Py_UNICODE *endp = p + size;
3576 /* pointer to the beginning of the unencodable characters */
3577 /* const Py_UNICODE *badp = NULL; */
3578 /* pointer into the output */
3579 char *str;
3580 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003581 Py_ssize_t respos = 0;
3582 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003583 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3584 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 PyObject *errorHandler = NULL;
3586 PyObject *exc = NULL;
3587 /* the following variable is used for caching string comparisons
3588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3589 int known_errorHandler = -1;
3590
3591 /* allocate enough for a simple encoding without
3592 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003593 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 if (res == NULL)
3595 goto onError;
3596 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003597 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003598 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 ressize = size;
3600
3601 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003602 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003604 /* can we encode this? */
3605 if (c<limit) {
3606 /* no overflow check, because we know that the space is enough */
3607 *str++ = (char)c;
3608 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003609 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003610 else {
3611 Py_ssize_t unicodepos = p-startp;
3612 Py_ssize_t requiredsize;
3613 PyObject *repunicode;
3614 Py_ssize_t repsize;
3615 Py_ssize_t newpos;
3616 Py_ssize_t respos;
3617 Py_UNICODE *uni2;
3618 /* startpos for collecting unencodable chars */
3619 const Py_UNICODE *collstart = p;
3620 const Py_UNICODE *collend = p;
3621 /* find all unecodable characters */
3622 while ((collend < endp) && ((*collend)>=limit))
3623 ++collend;
3624 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3625 if (known_errorHandler==-1) {
3626 if ((errors==NULL) || (!strcmp(errors, "strict")))
3627 known_errorHandler = 1;
3628 else if (!strcmp(errors, "replace"))
3629 known_errorHandler = 2;
3630 else if (!strcmp(errors, "ignore"))
3631 known_errorHandler = 3;
3632 else if (!strcmp(errors, "xmlcharrefreplace"))
3633 known_errorHandler = 4;
3634 else
3635 known_errorHandler = 0;
3636 }
3637 switch (known_errorHandler) {
3638 case 1: /* strict */
3639 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3640 goto onError;
3641 case 2: /* replace */
3642 while (collstart++<collend)
3643 *str++ = '?'; /* fall through */
3644 case 3: /* ignore */
3645 p = collend;
3646 break;
3647 case 4: /* xmlcharrefreplace */
3648 respos = str-PyString_AS_STRING(res);
3649 /* determine replacement size (temporarily (mis)uses p) */
3650 for (p = collstart, repsize = 0; p < collend; ++p) {
3651 if (*p<10)
3652 repsize += 2+1+1;
3653 else if (*p<100)
3654 repsize += 2+2+1;
3655 else if (*p<1000)
3656 repsize += 2+3+1;
3657 else if (*p<10000)
3658 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003659#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 else
3661 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003662#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003663 else if (*p<100000)
3664 repsize += 2+5+1;
3665 else if (*p<1000000)
3666 repsize += 2+6+1;
3667 else
3668 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003669#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003670 }
3671 requiredsize = respos+repsize+(endp-collend);
3672 if (requiredsize > ressize) {
3673 if (requiredsize<2*ressize)
3674 requiredsize = 2*ressize;
3675 if (_PyString_Resize(&res, requiredsize))
3676 goto onError;
3677 str = PyString_AS_STRING(res) + respos;
3678 ressize = requiredsize;
3679 }
3680 /* generate replacement (temporarily (mis)uses p) */
3681 for (p = collstart; p < collend; ++p) {
3682 str += sprintf(str, "&#%d;", (int)*p);
3683 }
3684 p = collend;
3685 break;
3686 default:
3687 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3688 encoding, reason, startp, size, &exc,
3689 collstart-startp, collend-startp, &newpos);
3690 if (repunicode == NULL)
3691 goto onError;
3692 /* need more space? (at least enough for what we have+the
3693 replacement+the rest of the string, so we won't have to
3694 check space for encodable characters) */
3695 respos = str-PyString_AS_STRING(res);
3696 repsize = PyUnicode_GET_SIZE(repunicode);
3697 requiredsize = respos+repsize+(endp-collend);
3698 if (requiredsize > ressize) {
3699 if (requiredsize<2*ressize)
3700 requiredsize = 2*ressize;
3701 if (_PyString_Resize(&res, requiredsize)) {
3702 Py_DECREF(repunicode);
3703 goto onError;
3704 }
3705 str = PyString_AS_STRING(res) + respos;
3706 ressize = requiredsize;
3707 }
3708 /* check if there is anything unencodable in the replacement
3709 and copy it to the output */
3710 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3711 c = *uni2;
3712 if (c >= limit) {
3713 raise_encode_exception(&exc, encoding, startp, size,
3714 unicodepos, unicodepos+1, reason);
3715 Py_DECREF(repunicode);
3716 goto onError;
3717 }
3718 *str = (char)c;
3719 }
3720 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003721 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003722 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003723 }
3724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003726 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003728 /* If this falls res will be NULL */
3729 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 Py_XDECREF(errorHandler);
3731 Py_XDECREF(exc);
3732 return res;
3733
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003734 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 Py_XDECREF(res);
3736 Py_XDECREF(errorHandler);
3737 Py_XDECREF(exc);
3738 return NULL;
3739}
3740
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003742 Py_ssize_t size,
3743 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746}
3747
3748PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3749{
3750 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003751 PyErr_BadArgument();
3752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 }
3754 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003755 PyUnicode_GET_SIZE(unicode),
3756 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757}
3758
3759/* --- 7-bit ASCII Codec -------------------------------------------------- */
3760
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003762 Py_ssize_t size,
3763 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 PyUnicodeObject *v;
3767 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003768 Py_ssize_t startinpos;
3769 Py_ssize_t endinpos;
3770 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 const char *e;
3772 PyObject *errorHandler = NULL;
3773 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003776 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003777 Py_UNICODE r = *(unsigned char*)s;
3778 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003779 }
Tim Petersced69f82003-09-16 20:30:58 +00003780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 v = _PyUnicode_New(size);
3782 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003783 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003785 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 e = s + size;
3788 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 register unsigned char c = (unsigned char)*s;
3790 if (c < 128) {
3791 *p++ = c;
3792 ++s;
3793 }
3794 else {
3795 startinpos = s-starts;
3796 endinpos = startinpos + 1;
3797 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3798 if (unicode_decode_call_errorhandler(
3799 errors, &errorHandler,
3800 "ascii", "ordinal not in range(128)",
3801 starts, size, &startinpos, &endinpos, &exc, &s,
3802 &v, &outpos, &p))
3803 goto onError;
3804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003806 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003807 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3808 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003812
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003813 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 Py_XDECREF(errorHandler);
3816 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 return NULL;
3818}
3819
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003821 Py_ssize_t size,
3822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825}
3826
3827PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3828{
3829 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 PyErr_BadArgument();
3831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 }
3833 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 PyUnicode_GET_SIZE(unicode),
3835 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836}
3837
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003838#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003839
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003840/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003841
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003842#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003843#define NEED_RETRY
3844#endif
3845
3846/* XXX This code is limited to "true" double-byte encodings, as
3847 a) it assumes an incomplete character consists of a single byte, and
3848 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003849 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003850
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.  IsDBCSLeadByte() must accept the byte itself, and the
   character preceding it (as located by CharPrev()) must not make it a
   trail byte instead. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        /* CharPrev() did not move: nothing precedes this byte */
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3861
3862/*
3863 * Decode MBCS string into unicode object. If 'final' is set, converts
3864 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3865 */
3866static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003867 const char *s, /* MBCS string */
3868 int size, /* sizeof MBCS string */
3869 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003870{
3871 Py_UNICODE *p;
3872 Py_ssize_t n = 0;
3873 int usize = 0;
3874
3875 assert(size >= 0);
3876
3877 /* Skip trailing lead-byte unless 'final' is set */
3878 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003879 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003880
3881 /* First get the size of the result */
3882 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003883 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3884 if (usize == 0) {
3885 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3886 return -1;
3887 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003888 }
3889
3890 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003891 /* Create unicode object */
3892 *v = _PyUnicode_New(usize);
3893 if (*v == NULL)
3894 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895 }
3896 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003897 /* Extend unicode object */
3898 n = PyUnicode_GET_SIZE(*v);
3899 if (_PyUnicode_Resize(v, n + usize) < 0)
3900 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003901 }
3902
3903 /* Do the conversion */
3904 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003905 p = PyUnicode_AS_UNICODE(*v) + n;
3906 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3907 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3908 return -1;
3909 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003910 }
3911
3912 return size;
3913}
3914
3915PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003916 Py_ssize_t size,
3917 const char *errors,
3918 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003919{
3920 PyUnicodeObject *v = NULL;
3921 int done;
3922
3923 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003924 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003925
3926#ifdef NEED_RETRY
3927 retry:
3928 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930 else
3931#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003932 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933
3934 if (done < 0) {
3935 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937 }
3938
3939 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941
3942#ifdef NEED_RETRY
3943 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003944 s += done;
3945 size -= done;
3946 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003947 }
3948#endif
3949
3950 return (PyObject *)v;
3951}
3952
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003954 Py_ssize_t size,
3955 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003956{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003957 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3958}
3959
3960/*
3961 * Convert unicode into string object (MBCS).
3962 * Returns 0 if succeed, -1 otherwise.
3963 */
3964static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003965 const Py_UNICODE *p, /* unicode */
3966 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003967{
3968 int mbcssize = 0;
3969 Py_ssize_t n = 0;
3970
3971 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003972
3973 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003974 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003975 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3976 if (mbcssize == 0) {
3977 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3978 return -1;
3979 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003980 }
3981
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003983 /* Create string object */
3984 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3985 if (*repr == NULL)
3986 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003987 }
3988 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 /* Extend string object */
3990 n = PyString_Size(*repr);
3991 if (_PyString_Resize(repr, n + mbcssize) < 0)
3992 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003993 }
3994
3995 /* Do the conversion */
3996 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003997 char *s = PyString_AS_STRING(*repr) + n;
3998 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3999 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4000 return -1;
4001 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004002 }
4003
4004 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004005}
4006
4007PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004008 Py_ssize_t size,
4009 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004010{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004011 PyObject *repr = NULL;
4012 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004013
Martin v. Löwisd8251432006-06-14 05:21:04 +00004014#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004016 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018 else
4019#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004021
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004023 Py_XDECREF(repr);
4024 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004025 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026
4027#ifdef NEED_RETRY
4028 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004029 p += INT_MAX;
4030 size -= INT_MAX;
4031 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004032 }
4033#endif
4034
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004035 return repr;
4036}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004037
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004038PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4039{
4040 if (!PyUnicode_Check(unicode)) {
4041 PyErr_BadArgument();
4042 return NULL;
4043 }
4044 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004045 PyUnicode_GET_SIZE(unicode),
4046 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004047}
4048
Martin v. Löwisd8251432006-06-14 05:21:04 +00004049#undef NEED_RETRY
4050
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004051#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053/* --- Character Mapping Codec -------------------------------------------- */
4054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004056 Py_ssize_t size,
4057 PyObject *mapping,
4058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004061 Py_ssize_t startinpos;
4062 Py_ssize_t endinpos;
4063 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 PyUnicodeObject *v;
4066 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 PyObject *errorHandler = NULL;
4069 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004070 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004072
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 /* Default to Latin-1 */
4074 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
4077 v = _PyUnicode_New(size);
4078 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004081 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004084 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 mapstring = PyUnicode_AS_UNICODE(mapping);
4086 maplen = PyUnicode_GET_SIZE(mapping);
4087 while (s < e) {
4088 unsigned char ch = *s;
4089 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 if (ch < maplen)
4092 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004094 if (x == 0xfffe) {
4095 /* undefined mapping */
4096 outpos = p-PyUnicode_AS_UNICODE(v);
4097 startinpos = s-starts;
4098 endinpos = startinpos+1;
4099 if (unicode_decode_call_errorhandler(
4100 errors, &errorHandler,
4101 "charmap", "character maps to <undefined>",
4102 starts, size, &startinpos, &endinpos, &exc, &s,
4103 &v, &outpos, &p)) {
4104 goto onError;
4105 }
4106 continue;
4107 }
4108 *p++ = x;
4109 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004110 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004111 }
4112 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004113 while (s < e) {
4114 unsigned char ch = *s;
4115 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004116
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4118 w = PyInt_FromLong((long)ch);
4119 if (w == NULL)
4120 goto onError;
4121 x = PyObject_GetItem(mapping, w);
4122 Py_DECREF(w);
4123 if (x == NULL) {
4124 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4125 /* No mapping found means: mapping is undefined. */
4126 PyErr_Clear();
4127 x = Py_None;
4128 Py_INCREF(x);
4129 } else
4130 goto onError;
4131 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004132
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004133 /* Apply mapping */
4134 if (PyInt_Check(x)) {
4135 long value = PyInt_AS_LONG(x);
4136 if (value < 0 || value > 65535) {
4137 PyErr_SetString(PyExc_TypeError,
4138 "character mapping must be in range(65536)");
4139 Py_DECREF(x);
4140 goto onError;
4141 }
4142 *p++ = (Py_UNICODE)value;
4143 }
4144 else if (x == Py_None) {
4145 /* undefined mapping */
4146 outpos = p-PyUnicode_AS_UNICODE(v);
4147 startinpos = s-starts;
4148 endinpos = startinpos+1;
4149 if (unicode_decode_call_errorhandler(
4150 errors, &errorHandler,
4151 "charmap", "character maps to <undefined>",
4152 starts, size, &startinpos, &endinpos, &exc, &s,
4153 &v, &outpos, &p)) {
4154 Py_DECREF(x);
4155 goto onError;
4156 }
4157 Py_DECREF(x);
4158 continue;
4159 }
4160 else if (PyUnicode_Check(x)) {
4161 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004162
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004163 if (targetsize == 1)
4164 /* 1-1 mapping */
4165 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004166
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004167 else if (targetsize > 1) {
4168 /* 1-n mapping */
4169 if (targetsize > extrachars) {
4170 /* resize first */
4171 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4172 Py_ssize_t needed = (targetsize - extrachars) + \
4173 (targetsize << 2);
4174 extrachars += needed;
4175 /* XXX overflow detection missing */
4176 if (_PyUnicode_Resize(&v,
4177 PyUnicode_GET_SIZE(v) + needed) < 0) {
4178 Py_DECREF(x);
4179 goto onError;
4180 }
4181 p = PyUnicode_AS_UNICODE(v) + oldpos;
4182 }
4183 Py_UNICODE_COPY(p,
4184 PyUnicode_AS_UNICODE(x),
4185 targetsize);
4186 p += targetsize;
4187 extrachars -= targetsize;
4188 }
4189 /* 1-0 mapping: skip the character */
4190 }
4191 else {
4192 /* wrong return value */
4193 PyErr_SetString(PyExc_TypeError,
4194 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004195 Py_DECREF(x);
4196 goto onError;
4197 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004198 Py_DECREF(x);
4199 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 }
4202 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004203 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4204 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 Py_XDECREF(errorHandler);
4206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004208
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004209 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 Py_XDECREF(errorHandler);
4211 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 Py_XDECREF(v);
4213 return NULL;
4214}
4215
Martin v. Löwis3f767792006-06-04 19:36:28 +00004216/* Charmap encoding: the lookup table */
4217
4218struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004219 PyObject_HEAD
4220 unsigned char level1[32];
4221 int count2, count3;
4222 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004223};
4224
4225static PyObject*
4226encoding_map_size(PyObject *obj, PyObject* args)
4227{
4228 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004229 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004230 128*map->count3);
4231}
4232
4233static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004234 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004235 PyDoc_STR("Return the size (in bytes) of this object") },
4236 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004237};
4238
4239static void
4240encoding_map_dealloc(PyObject* o)
4241{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004242 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004243}
4244
4245static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004246 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004247 "EncodingMap", /*tp_name*/
4248 sizeof(struct encoding_map), /*tp_basicsize*/
4249 0, /*tp_itemsize*/
4250 /* methods */
4251 encoding_map_dealloc, /*tp_dealloc*/
4252 0, /*tp_print*/
4253 0, /*tp_getattr*/
4254 0, /*tp_setattr*/
4255 0, /*tp_compare*/
4256 0, /*tp_repr*/
4257 0, /*tp_as_number*/
4258 0, /*tp_as_sequence*/
4259 0, /*tp_as_mapping*/
4260 0, /*tp_hash*/
4261 0, /*tp_call*/
4262 0, /*tp_str*/
4263 0, /*tp_getattro*/
4264 0, /*tp_setattro*/
4265 0, /*tp_as_buffer*/
4266 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4267 0, /*tp_doc*/
4268 0, /*tp_traverse*/
4269 0, /*tp_clear*/
4270 0, /*tp_richcompare*/
4271 0, /*tp_weaklistoffset*/
4272 0, /*tp_iter*/
4273 0, /*tp_iternext*/
4274 encoding_map_methods, /*tp_methods*/
4275 0, /*tp_members*/
4276 0, /*tp_getset*/
4277 0, /*tp_base*/
4278 0, /*tp_dict*/
4279 0, /*tp_descr_get*/
4280 0, /*tp_descr_set*/
4281 0, /*tp_dictoffset*/
4282 0, /*tp_init*/
4283 0, /*tp_alloc*/
4284 0, /*tp_new*/
4285 0, /*tp_free*/
4286 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004287};
4288
4289PyObject*
4290PyUnicode_BuildEncodingMap(PyObject* string)
4291{
4292 Py_UNICODE *decode;
4293 PyObject *result;
4294 struct encoding_map *mresult;
4295 int i;
4296 int need_dict = 0;
4297 unsigned char level1[32];
4298 unsigned char level2[512];
4299 unsigned char *mlevel1, *mlevel2, *mlevel3;
4300 int count2 = 0, count3 = 0;
4301
4302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4303 PyErr_BadArgument();
4304 return NULL;
4305 }
4306 decode = PyUnicode_AS_UNICODE(string);
4307 memset(level1, 0xFF, sizeof level1);
4308 memset(level2, 0xFF, sizeof level2);
4309
4310 /* If there isn't a one-to-one mapping of NULL to \0,
4311 or if there are non-BMP characters, we need to use
4312 a mapping dictionary. */
4313 if (decode[0] != 0)
4314 need_dict = 1;
4315 for (i = 1; i < 256; i++) {
4316 int l1, l2;
4317 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004318#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004320#endif
4321 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004322 need_dict = 1;
4323 break;
4324 }
4325 if (decode[i] == 0xFFFE)
4326 /* unmapped character */
4327 continue;
4328 l1 = decode[i] >> 11;
4329 l2 = decode[i] >> 7;
4330 if (level1[l1] == 0xFF)
4331 level1[l1] = count2++;
4332 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004333 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004334 }
4335
4336 if (count2 >= 0xFF || count3 >= 0xFF)
4337 need_dict = 1;
4338
4339 if (need_dict) {
4340 PyObject *result = PyDict_New();
4341 PyObject *key, *value;
4342 if (!result)
4343 return NULL;
4344 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004345 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004346 key = PyInt_FromLong(decode[i]);
4347 value = PyInt_FromLong(i);
4348 if (!key || !value)
4349 goto failed1;
4350 if (PyDict_SetItem(result, key, value) == -1)
4351 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004352 Py_DECREF(key);
4353 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004354 }
4355 return result;
4356 failed1:
4357 Py_XDECREF(key);
4358 Py_XDECREF(value);
4359 Py_DECREF(result);
4360 return NULL;
4361 }
4362
4363 /* Create a three-level trie */
4364 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4365 16*count2 + 128*count3 - 1);
4366 if (!result)
4367 return PyErr_NoMemory();
4368 PyObject_Init(result, &EncodingMapType);
4369 mresult = (struct encoding_map*)result;
4370 mresult->count2 = count2;
4371 mresult->count3 = count3;
4372 mlevel1 = mresult->level1;
4373 mlevel2 = mresult->level23;
4374 mlevel3 = mresult->level23 + 16*count2;
4375 memcpy(mlevel1, level1, 32);
4376 memset(mlevel2, 0xFF, 16*count2);
4377 memset(mlevel3, 0, 128*count3);
4378 count3 = 0;
4379 for (i = 1; i < 256; i++) {
4380 int o1, o2, o3, i2, i3;
4381 if (decode[i] == 0xFFFE)
4382 /* unmapped character */
4383 continue;
4384 o1 = decode[i]>>11;
4385 o2 = (decode[i]>>7) & 0xF;
4386 i2 = 16*mlevel1[o1] + o2;
4387 if (mlevel2[i2] == 0xFF)
4388 mlevel2[i2] = count3++;
4389 o3 = decode[i] & 0x7F;
4390 i3 = 128*mlevel2[i2] + o3;
4391 mlevel3[i3] = i;
4392 }
4393 return result;
4394}
4395
4396static int
4397encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4398{
4399 struct encoding_map *map = (struct encoding_map*)mapping;
4400 int l1 = c>>11;
4401 int l2 = (c>>7) & 0xF;
4402 int l3 = c & 0x7F;
4403 int i;
4404
4405#ifdef Py_UNICODE_WIDE
4406 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004407 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 }
4409#endif
4410 if (c == 0)
4411 return 0;
4412 /* level 1*/
4413 i = map->level1[l1];
4414 if (i == 0xFF) {
4415 return -1;
4416 }
4417 /* level 2*/
4418 i = map->level23[16*i+l2];
4419 if (i == 0xFF) {
4420 return -1;
4421 }
4422 /* level 3 */
4423 i = map->level23[16*map->count2 + 128*i + l3];
4424 if (i == 0) {
4425 return -1;
4426 }
4427 return i;
4428}
4429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430/* Lookup the character ch in the mapping. If the character
4431 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004432 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject *w = PyInt_FromLong((long)c);
4436 PyObject *x;
4437
4438 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004439 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 x = PyObject_GetItem(mapping, w);
4441 Py_DECREF(w);
4442 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004443 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4444 /* No mapping found means: mapping is undefined. */
4445 PyErr_Clear();
4446 x = Py_None;
4447 Py_INCREF(x);
4448 return x;
4449 } else
4450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004452 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004453 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004455 long value = PyInt_AS_LONG(x);
4456 if (value < 0 || value > 255) {
4457 PyErr_SetString(PyExc_TypeError,
4458 "character mapping must be in range(256)");
4459 Py_DECREF(x);
4460 return NULL;
4461 }
4462 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004464 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004465 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004467 /* wrong return value */
4468 PyErr_SetString(PyExc_TypeError,
4469 "character mapping must return integer, None or str");
4470 Py_DECREF(x);
4471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 }
4473}
4474
Martin v. Löwis3f767792006-06-04 19:36:28 +00004475static int
4476charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4477{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4479 /* exponentially overallocate to minimize reallocations */
4480 if (requiredsize < 2*outsize)
4481 requiredsize = 2*outsize;
4482 if (_PyString_Resize(outobj, requiredsize)) {
4483 return 0;
4484 }
4485 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004486}
4487
/* Result codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491/* lookup the character, put the result in the output string and adjust
4492 various state variables. Reallocate the output string if not enough
4493 space is available. Return a new reference to the object that
4494 was put in the output buffer, or Py_None, if the mapping was undefined
4495 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004496 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004498charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004499 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004501 PyObject *rep;
4502 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504
Christian Heimese93237d2007-12-19 02:37:44 +00004505 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004506 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004507 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004508 if (res == -1)
4509 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004510 if (outsize<requiredsize)
4511 if (!charmapencode_resize(outobj, outpos, requiredsize))
4512 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004513 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004514 outstart[(*outpos)++] = (char)res;
4515 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004516 }
4517
4518 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004520 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004521 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004522 Py_DECREF(rep);
4523 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004524 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004525 if (PyInt_Check(rep)) {
4526 Py_ssize_t requiredsize = *outpos+1;
4527 if (outsize<requiredsize)
4528 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4529 Py_DECREF(rep);
4530 return enc_EXCEPTION;
4531 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004532 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004534 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004535 else {
4536 const char *repchars = PyString_AS_STRING(rep);
4537 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4538 Py_ssize_t requiredsize = *outpos+repsize;
4539 if (outsize<requiredsize)
4540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4541 Py_DECREF(rep);
4542 return enc_EXCEPTION;
4543 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004544 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004545 memcpy(outstart + *outpos, repchars, repsize);
4546 *outpos += repsize;
4547 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 }
Georg Brandl9f167602006-06-04 21:46:16 +00004549 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004550 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551}
4552
4553/* handle an error in PyUnicode_EncodeCharmap
4554 Return 0 on success, -1 on error */
4555static
4556int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004559 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004560 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561{
4562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t repsize;
4564 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 Py_UNICODE *uni2;
4566 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t collstartpos = *inpos;
4568 Py_ssize_t collendpos = *inpos+1;
4569 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 char *encoding = "charmap";
4571 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004572 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 /* find all unencodable characters */
4575 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004576 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004577 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004578 int res = encoding_map_lookup(p[collendpos], mapping);
4579 if (res != -1)
4580 break;
4581 ++collendpos;
4582 continue;
4583 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004584
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004585 rep = charmapencode_lookup(p[collendpos], mapping);
4586 if (rep==NULL)
4587 return -1;
4588 else if (rep!=Py_None) {
4589 Py_DECREF(rep);
4590 break;
4591 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004592 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004593 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 }
4595 /* cache callback name lookup
4596 * (if not done yet, i.e. it's the first error) */
4597 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004598 if ((errors==NULL) || (!strcmp(errors, "strict")))
4599 *known_errorHandler = 1;
4600 else if (!strcmp(errors, "replace"))
4601 *known_errorHandler = 2;
4602 else if (!strcmp(errors, "ignore"))
4603 *known_errorHandler = 3;
4604 else if (!strcmp(errors, "xmlcharrefreplace"))
4605 *known_errorHandler = 4;
4606 else
4607 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
4609 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004610 case 1: /* strict */
4611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4612 return -1;
4613 case 2: /* replace */
4614 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004615 x = charmapencode_output('?', mapping, res, respos);
4616 if (x==enc_EXCEPTION) {
4617 return -1;
4618 }
4619 else if (x==enc_FAILED) {
4620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4621 return -1;
4622 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004623 }
4624 /* fall through */
4625 case 3: /* ignore */
4626 *inpos = collendpos;
4627 break;
4628 case 4: /* xmlcharrefreplace */
4629 /* generate replacement (temporarily (mis)uses p) */
4630 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004631 char buffer[2+29+1+1];
4632 char *cp;
4633 sprintf(buffer, "&#%d;", (int)p[collpos]);
4634 for (cp = buffer; *cp; ++cp) {
4635 x = charmapencode_output(*cp, mapping, res, respos);
4636 if (x==enc_EXCEPTION)
4637 return -1;
4638 else if (x==enc_FAILED) {
4639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4640 return -1;
4641 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004642 }
4643 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004644 *inpos = collendpos;
4645 break;
4646 default:
4647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004648 encoding, reason, p, size, exceptionObject,
4649 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004650 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004651 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004652 /* generate replacement */
4653 repsize = PyUnicode_GET_SIZE(repunicode);
4654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004655 x = charmapencode_output(*uni2, mapping, res, respos);
4656 if (x==enc_EXCEPTION) {
4657 return -1;
4658 }
4659 else if (x==enc_FAILED) {
4660 Py_DECREF(repunicode);
4661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4662 return -1;
4663 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004664 }
4665 *inpos = newpos;
4666 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 }
4668 return 0;
4669}
4670
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004672 Py_ssize_t size,
4673 PyObject *mapping,
4674 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 /* output object */
4677 PyObject *res = NULL;
4678 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 PyObject *errorHandler = NULL;
4683 PyObject *exc = NULL;
4684 /* the following variable is used for caching string comparisons
4685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4686 * 3=ignore, 4=xmlcharrefreplace */
4687 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
4689 /* Default to Latin-1 */
4690 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004691 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 /* allocate enough for a simple encoding without
4694 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004695 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 if (res == NULL)
4697 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004698 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004702 /* try to encode it */
4703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4704 if (x==enc_EXCEPTION) /* error */
4705 goto onError;
4706 if (x==enc_FAILED) { /* unencodable character */
4707 if (charmap_encoding_error(p, size, &inpos, mapping,
4708 &exc,
4709 &known_errorHandler, &errorHandler, errors,
4710 &res, &respos)) {
4711 goto onError;
4712 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004713 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004714 else
4715 /* done with this character => adjust input position */
4716 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004720 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004721 if (_PyString_Resize(&res, respos))
4722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 }
4724 Py_XDECREF(exc);
4725 Py_XDECREF(errorHandler);
4726 return res;
4727
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 Py_XDECREF(res);
4730 Py_XDECREF(exc);
4731 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 return NULL;
4733}
4734
4735PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004736 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737{
4738 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004739 PyErr_BadArgument();
4740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 }
4742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 PyUnicode_GET_SIZE(unicode),
4744 mapping,
4745 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746}
4747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748/* create or adjust a UnicodeTranslateError */
4749static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004750 const Py_UNICODE *unicode, Py_ssize_t size,
4751 Py_ssize_t startpos, Py_ssize_t endpos,
4752 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004755 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004756 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 }
4758 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4760 goto onError;
4761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4762 goto onError;
4763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4764 goto onError;
4765 return;
4766 onError:
4767 Py_DECREF(*exceptionObject);
4768 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 }
4770}
4771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772/* raises a UnicodeTranslateError */
4773static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004774 const Py_UNICODE *unicode, Py_ssize_t size,
4775 Py_ssize_t startpos, Py_ssize_t endpos,
4776 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777{
4778 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004779 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004781 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782}
4783
4784/* error handling callback helper:
4785 build arguments, call the callback and check the arguments,
4786 put the result into newpos and return the replacement string, which
4787 has to be freed by the caller */
4788static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004789 PyObject **errorHandler,
4790 const char *reason,
4791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4792 Py_ssize_t startpos, Py_ssize_t endpos,
4793 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796
Martin v. Löwis412fb672006-04-13 06:34:32 +00004797 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 PyObject *restuple;
4799 PyObject *resunicode;
4800
4801 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004802 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004804 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 }
4806
4807 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811
4812 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004817 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004818 Py_DECREF(restuple);
4819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 }
4821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 &resunicode, &i_newpos)) {
4823 Py_DECREF(restuple);
4824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004826 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004828 else
4829 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004830 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4832 Py_DECREF(restuple);
4833 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 Py_INCREF(resunicode);
4836 Py_DECREF(restuple);
4837 return resunicode;
4838}
4839
4840/* Lookup the character ch in the mapping and put the result in result,
4841 which must be decrefed by the caller.
4842 Return 0 on success, -1 on error */
4843static
4844int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4845{
4846 PyObject *w = PyInt_FromLong((long)c);
4847 PyObject *x;
4848
4849 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 x = PyObject_GetItem(mapping, w);
4852 Py_DECREF(w);
4853 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4855 /* No mapping found means: use 1:1 mapping. */
4856 PyErr_Clear();
4857 *result = NULL;
4858 return 0;
4859 } else
4860 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 }
4862 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004863 *result = x;
4864 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 }
4866 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004867 long value = PyInt_AS_LONG(x);
4868 long max = PyUnicode_GetMax();
4869 if (value < 0 || value > max) {
4870 PyErr_Format(PyExc_TypeError,
4871 "character mapping must be in range(0x%lx)", max+1);
4872 Py_DECREF(x);
4873 return -1;
4874 }
4875 *result = x;
4876 return 0;
4877 }
4878 else if (PyUnicode_Check(x)) {
4879 *result = x;
4880 return 0;
4881 }
4882 else {
4883 /* wrong return value */
4884 PyErr_SetString(PyExc_TypeError,
4885 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004886 Py_DECREF(x);
4887 return -1;
4888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889}
4890/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 if not reallocate and adjust various state variables.
4892 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893static
Walter Dörwald4894c302003-10-24 14:25:28 +00004894int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004898 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004899 /* remember old output position */
4900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4901 /* exponentially overallocate to minimize reallocations */
4902 if (requiredsize < 2 * oldsize)
4903 requiredsize = 2 * oldsize;
4904 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4905 return -1;
4906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 }
4908 return 0;
4909}
4910/* lookup the character, put the result in the output string and adjust
4911 various state variables. Return a new reference to the object that
4912 was put in the output buffer in *result, or Py_None, if the mapping was
4913 undefined (in which case no character was written).
4914 The called must decref result.
4915 Return 0 on success, -1 on error. */
4916static
Walter Dörwald4894c302003-10-24 14:25:28 +00004917int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4919 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920{
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004924 /* not found => default to 1:1 mapping */
4925 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 }
4927 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004930 /* no overflow check, because we know that the space is enough */
4931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 }
4933 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4935 if (repsize==1) {
4936 /* no overflow check, because we know that the space is enough */
4937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4938 }
4939 else if (repsize!=0) {
4940 /* more than one character */
4941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4942 (insize - (curinp-startinp)) +
4943 repsize - 1;
4944 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4945 return -1;
4946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4947 *outp += repsize;
4948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 }
4950 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004951 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 return 0;
4953}
4954
4955PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004956 Py_ssize_t size,
4957 PyObject *mapping,
4958 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 /* output object */
4961 PyObject *res = NULL;
4962 /* pointers to the beginning and end+1 of input */
4963 const Py_UNICODE *startp = p;
4964 const Py_UNICODE *endp = p + size;
4965 /* pointer into the output */
4966 Py_UNICODE *str;
4967 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 char *reason = "character maps to <undefined>";
4970 PyObject *errorHandler = NULL;
4971 PyObject *exc = NULL;
4972 /* the following variable is used for caching string comparisons
4973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4974 * 3=ignore, 4=xmlcharrefreplace */
4975 int known_errorHandler = -1;
4976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004978 PyErr_BadArgument();
4979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981
4982 /* allocate enough for a simple 1:1 translation without
4983 replacements, if we need more, we'll resize */
4984 res = PyUnicode_FromUnicode(NULL, size);
4985 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004992 /* try to encode it */
4993 PyObject *x = NULL;
4994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4995 Py_XDECREF(x);
4996 goto onError;
4997 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004998 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004999 if (x!=Py_None) /* it worked => adjust input pointer */
5000 ++p;
5001 else { /* untranslatable character */
5002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5003 Py_ssize_t repsize;
5004 Py_ssize_t newpos;
5005 Py_UNICODE *uni2;
5006 /* startpos for collecting untranslatable chars */
5007 const Py_UNICODE *collstart = p;
5008 const Py_UNICODE *collend = p+1;
5009 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 /* find all untranslatable characters */
5012 while (collend < endp) {
5013 if (charmaptranslate_lookup(*collend, mapping, &x))
5014 goto onError;
5015 Py_XDECREF(x);
5016 if (x!=Py_None)
5017 break;
5018 ++collend;
5019 }
5020 /* cache callback name lookup
5021 * (if not done yet, i.e. it's the first error) */
5022 if (known_errorHandler==-1) {
5023 if ((errors==NULL) || (!strcmp(errors, "strict")))
5024 known_errorHandler = 1;
5025 else if (!strcmp(errors, "replace"))
5026 known_errorHandler = 2;
5027 else if (!strcmp(errors, "ignore"))
5028 known_errorHandler = 3;
5029 else if (!strcmp(errors, "xmlcharrefreplace"))
5030 known_errorHandler = 4;
5031 else
5032 known_errorHandler = 0;
5033 }
5034 switch (known_errorHandler) {
5035 case 1: /* strict */
5036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005037 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005038 case 2: /* replace */
5039 /* No need to check for space, this is a 1:1 replacement */
5040 for (coll = collstart; coll<collend; ++coll)
5041 *str++ = '?';
5042 /* fall through */
5043 case 3: /* ignore */
5044 p = collend;
5045 break;
5046 case 4: /* xmlcharrefreplace */
5047 /* generate replacement (temporarily (mis)uses p) */
5048 for (p = collstart; p < collend; ++p) {
5049 char buffer[2+29+1+1];
5050 char *cp;
5051 sprintf(buffer, "&#%d;", (int)*p);
5052 if (charmaptranslate_makespace(&res, &str,
5053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5054 goto onError;
5055 for (cp = buffer; *cp; ++cp)
5056 *str++ = *cp;
5057 }
5058 p = collend;
5059 break;
5060 default:
5061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5062 reason, startp, size, &exc,
5063 collstart-startp, collend-startp, &newpos);
5064 if (repunicode == NULL)
5065 goto onError;
5066 /* generate replacement */
5067 repsize = PyUnicode_GET_SIZE(repunicode);
5068 if (charmaptranslate_makespace(&res, &str,
5069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5070 Py_DECREF(repunicode);
5071 goto onError;
5072 }
5073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5074 *str++ = *uni2;
5075 p = startp + newpos;
5076 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005077 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005078 }
5079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 /* Resize if we allocated to much */
5081 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005082 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005083 if (PyUnicode_Resize(&res, respos) < 0)
5084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005085 }
5086 Py_XDECREF(exc);
5087 Py_XDECREF(errorHandler);
5088 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005090 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 Py_XDECREF(res);
5092 Py_XDECREF(exc);
5093 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 return NULL;
5095}
5096
5097PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005098 PyObject *mapping,
5099 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100{
5101 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 str = PyUnicode_FromObject(str);
5104 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005105 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005107 PyUnicode_GET_SIZE(str),
5108 mapping,
5109 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 Py_DECREF(str);
5111 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005112
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005113 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 Py_XDECREF(str);
5115 return NULL;
5116}
Tim Petersced69f82003-09-16 20:30:58 +00005117
Guido van Rossum9e896b32000-04-05 20:11:21 +00005118/* --- Decimal Encoder ---------------------------------------------------- */
5119
5120int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005121 Py_ssize_t length,
5122 char *output,
5123 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005124{
5125 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 PyObject *errorHandler = NULL;
5127 PyObject *exc = NULL;
5128 const char *encoding = "decimal";
5129 const char *reason = "invalid decimal Unicode string";
5130 /* the following variable is used for caching string comparisons
5131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5132 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005133
5134 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005135 PyErr_BadArgument();
5136 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005137 }
5138
5139 p = s;
5140 end = s + length;
5141 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005142 register Py_UNICODE ch = *p;
5143 int decimal;
5144 PyObject *repunicode;
5145 Py_ssize_t repsize;
5146 Py_ssize_t newpos;
5147 Py_UNICODE *uni2;
5148 Py_UNICODE *collstart;
5149 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005150
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005151 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005152 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 ++p;
5154 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005155 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005156 decimal = Py_UNICODE_TODECIMAL(ch);
5157 if (decimal >= 0) {
5158 *output++ = '0' + decimal;
5159 ++p;
5160 continue;
5161 }
5162 if (0 < ch && ch < 256) {
5163 *output++ = (char)ch;
5164 ++p;
5165 continue;
5166 }
5167 /* All other characters are considered unencodable */
5168 collstart = p;
5169 collend = p+1;
5170 while (collend < end) {
5171 if ((0 < *collend && *collend < 256) ||
5172 !Py_UNICODE_ISSPACE(*collend) ||
5173 Py_UNICODE_TODECIMAL(*collend))
5174 break;
5175 }
5176 /* cache callback name lookup
5177 * (if not done yet, i.e. it's the first error) */
5178 if (known_errorHandler==-1) {
5179 if ((errors==NULL) || (!strcmp(errors, "strict")))
5180 known_errorHandler = 1;
5181 else if (!strcmp(errors, "replace"))
5182 known_errorHandler = 2;
5183 else if (!strcmp(errors, "ignore"))
5184 known_errorHandler = 3;
5185 else if (!strcmp(errors, "xmlcharrefreplace"))
5186 known_errorHandler = 4;
5187 else
5188 known_errorHandler = 0;
5189 }
5190 switch (known_errorHandler) {
5191 case 1: /* strict */
5192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5193 goto onError;
5194 case 2: /* replace */
5195 for (p = collstart; p < collend; ++p)
5196 *output++ = '?';
5197 /* fall through */
5198 case 3: /* ignore */
5199 p = collend;
5200 break;
5201 case 4: /* xmlcharrefreplace */
5202 /* generate replacement (temporarily (mis)uses p) */
5203 for (p = collstart; p < collend; ++p)
5204 output += sprintf(output, "&#%d;", (int)*p);
5205 p = collend;
5206 break;
5207 default:
5208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5209 encoding, reason, s, length, &exc,
5210 collstart-s, collend-s, &newpos);
5211 if (repunicode == NULL)
5212 goto onError;
5213 /* generate replacement */
5214 repsize = PyUnicode_GET_SIZE(repunicode);
5215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5216 Py_UNICODE ch = *uni2;
5217 if (Py_UNICODE_ISSPACE(ch))
5218 *output++ = ' ';
5219 else {
5220 decimal = Py_UNICODE_TODECIMAL(ch);
5221 if (decimal >= 0)
5222 *output++ = '0' + decimal;
5223 else if (0 < ch && ch < 256)
5224 *output++ = (char)ch;
5225 else {
5226 Py_DECREF(repunicode);
5227 raise_encode_exception(&exc, encoding,
5228 s, length, collstart-s, collend-s, reason);
5229 goto onError;
5230 }
5231 }
5232 }
5233 p = s + newpos;
5234 Py_DECREF(repunicode);
5235 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005236 }
5237 /* 0-terminate the output string */
5238 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 Py_XDECREF(exc);
5240 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005241 return 0;
5242
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005243 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 Py_XDECREF(exc);
5245 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005246 return -1;
5247}
5248
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249/* --- Helpers ------------------------------------------------------------ */
5250
Eric Smitha9f7d622008-02-17 19:46:49 +00005251#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005252#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005253
5254#include "stringlib/count.h"
5255#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005256#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005257#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005258
/* Helper macro to fix up start/end slice values against a length.

   - end greater than len is clamped to len;
   - a negative end or start has len added once and is clamped to 0 if it
     is still negative;
   - start is NOT clamped against len.

   start and end must be modifiable lvalues; all three arguments are
   evaluated more than once.  The expansion is wrapped in do { } while (0)
   so it forms a single statement (safe inside an unbraced if/else); every
   use must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005273
Martin v. Löwis18e16552006-02-15 17:27:45 +00005274Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005275 PyObject *substr,
5276 Py_ssize_t start,
5277 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005279 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005280 PyUnicodeObject* str_obj;
5281 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005282
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005283 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5284 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005285 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005286 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5287 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005288 Py_DECREF(str_obj);
5289 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 }
Tim Petersced69f82003-09-16 20:30:58 +00005291
Antoine Pitrou64672132010-01-13 07:55:48 +00005292 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005293 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005294 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5295 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005296 );
5297
5298 Py_DECREF(sub_obj);
5299 Py_DECREF(str_obj);
5300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 return result;
5302}
5303
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005305 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005306 Py_ssize_t start,
5307 Py_ssize_t end,
5308 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005311
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005312 str = PyUnicode_FromObject(str);
5313 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005314 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005315 sub = PyUnicode_FromObject(sub);
5316 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005317 Py_DECREF(str);
5318 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 }
Tim Petersced69f82003-09-16 20:30:58 +00005320
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005321 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005322 result = stringlib_find_slice(
5323 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5324 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5325 start, end
5326 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005327 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005328 result = stringlib_rfind_slice(
5329 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5330 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5331 start, end
5332 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005333
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005334 Py_DECREF(str);
5335 Py_DECREF(sub);
5336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 return result;
5338}
5339
Tim Petersced69f82003-09-16 20:30:58 +00005340static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005342 PyUnicodeObject *substring,
5343 Py_ssize_t start,
5344 Py_ssize_t end,
5345 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 if (substring->length == 0)
5348 return 1;
5349
Antoine Pitrou64672132010-01-13 07:55:48 +00005350 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 end -= substring->length;
5352 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005353 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354
5355 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005356 if (Py_UNICODE_MATCH(self, end, substring))
5357 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 } else {
5359 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005360 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 }
5362
5363 return 0;
5364}
5365
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005367 PyObject *substr,
5368 Py_ssize_t start,
5369 Py_ssize_t end,
5370 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 str = PyUnicode_FromObject(str);
5375 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 substr = PyUnicode_FromObject(substr);
5378 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005379 Py_DECREF(str);
5380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 }
Tim Petersced69f82003-09-16 20:30:58 +00005382
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 (PyUnicodeObject *)substr,
5385 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 Py_DECREF(str);
5387 Py_DECREF(substr);
5388 return result;
5389}
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391/* Apply fixfct filter to the Unicode object self and return a
5392 reference to the modified object */
5393
Tim Petersced69f82003-09-16 20:30:58 +00005394static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005396 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397{
5398
5399 PyUnicodeObject *u;
5400
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005401 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005403 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005404
5405 Py_UNICODE_COPY(u->str, self->str, self->length);
5406
Tim Peters7a29bd52001-09-12 03:03:31 +00005407 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005408 /* fixfct should return TRUE if it modified the buffer. If
5409 FALSE, return a reference to the original buffer instead
5410 (to save space, not time) */
5411 Py_INCREF(self);
5412 Py_DECREF(u);
5413 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 }
5415 return (PyObject*) u;
5416}
5417
Tim Petersced69f82003-09-16 20:30:58 +00005418static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419int fixupper(PyUnicodeObject *self)
5420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005421 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 Py_UNICODE *s = self->str;
5423 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005426 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005427
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005428 ch = Py_UNICODE_TOUPPER(*s);
5429 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005431 *s = ch;
5432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 s++;
5434 }
5435
5436 return status;
5437}
5438
Tim Petersced69f82003-09-16 20:30:58 +00005439static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440int fixlower(PyUnicodeObject *self)
5441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005442 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 Py_UNICODE *s = self->str;
5444 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005447 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005448
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 ch = Py_UNICODE_TOLOWER(*s);
5450 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005452 *s = ch;
5453 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 s++;
5455 }
5456
5457 return status;
5458}
5459
Tim Petersced69f82003-09-16 20:30:58 +00005460static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461int fixswapcase(PyUnicodeObject *self)
5462{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005463 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 Py_UNICODE *s = self->str;
5465 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 while (len-- > 0) {
5468 if (Py_UNICODE_ISUPPER(*s)) {
5469 *s = Py_UNICODE_TOLOWER(*s);
5470 status = 1;
5471 } else if (Py_UNICODE_ISLOWER(*s)) {
5472 *s = Py_UNICODE_TOUPPER(*s);
5473 status = 1;
5474 }
5475 s++;
5476 }
5477
5478 return status;
5479}
5480
Tim Petersced69f82003-09-16 20:30:58 +00005481static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482int fixcapitalize(PyUnicodeObject *self)
5483{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005484 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005485 Py_UNICODE *s = self->str;
5486 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005487
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005488 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005489 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005490 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005491 *s = Py_UNICODE_TOUPPER(*s);
5492 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005494 s++;
5495 while (--len > 0) {
5496 if (Py_UNICODE_ISUPPER(*s)) {
5497 *s = Py_UNICODE_TOLOWER(*s);
5498 status = 1;
5499 }
5500 s++;
5501 }
5502 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503}
5504
5505static
5506int fixtitle(PyUnicodeObject *self)
5507{
5508 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5509 register Py_UNICODE *e;
5510 int previous_is_cased;
5511
5512 /* Shortcut for single character strings */
5513 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005514 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5515 if (*p != ch) {
5516 *p = ch;
5517 return 1;
5518 }
5519 else
5520 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 }
Tim Petersced69f82003-09-16 20:30:58 +00005522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 e = p + PyUnicode_GET_SIZE(self);
5524 previous_is_cased = 0;
5525 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005526 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005527
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005528 if (previous_is_cased)
5529 *p = Py_UNICODE_TOLOWER(ch);
5530 else
5531 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005532
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005533 if (Py_UNICODE_ISLOWER(ch) ||
5534 Py_UNICODE_ISUPPER(ch) ||
5535 Py_UNICODE_ISTITLE(ch))
5536 previous_is_cased = 1;
5537 else
5538 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
5540 return 1;
5541}
5542
Tim Peters8ce9f162004-08-27 01:49:32 +00005543PyObject *
5544PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005547 const Py_UNICODE blank = ' ';
5548 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005549 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005551 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5552 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005553 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5554 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005555 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005557 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
Tim Peters05eba1f2004-08-27 21:32:02 +00005559 fseq = PySequence_Fast(seq, "");
5560 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005561 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005562 }
5563
Tim Peters91879ab2004-08-27 22:35:44 +00005564 /* Grrrr. A codec may be invoked to convert str objects to
5565 * Unicode, and so it's possible to call back into Python code
5566 * during PyUnicode_FromObject(), and so it's possible for a sick
5567 * codec to change the size of fseq (if seq is a list). Therefore
5568 * we have to keep refetching the size -- can't assume seqlen
5569 * is invariant.
5570 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005571 seqlen = PySequence_Fast_GET_SIZE(fseq);
5572 /* If empty sequence, return u"". */
5573 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005574 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5575 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 }
5577 /* If singleton sequence with an exact Unicode, return that. */
5578 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005579 item = PySequence_Fast_GET_ITEM(fseq, 0);
5580 if (PyUnicode_CheckExact(item)) {
5581 Py_INCREF(item);
5582 res = (PyUnicodeObject *)item;
5583 goto Done;
5584 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005585 }
5586
Tim Peters05eba1f2004-08-27 21:32:02 +00005587 /* At least two items to join, or one that isn't exact Unicode. */
5588 if (seqlen > 1) {
5589 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005590 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005591 sep = &blank;
5592 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005593 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005594 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005595 internal_separator = PyUnicode_FromObject(separator);
5596 if (internal_separator == NULL)
5597 goto onError;
5598 sep = PyUnicode_AS_UNICODE(internal_separator);
5599 seplen = PyUnicode_GET_SIZE(internal_separator);
5600 /* In case PyUnicode_FromObject() mutated seq. */
5601 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005602 }
5603 }
5604
5605 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005606 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005607 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005608 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 res_p = PyUnicode_AS_UNICODE(res);
5610 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005611
Tim Peters05eba1f2004-08-27 21:32:02 +00005612 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 Py_ssize_t itemlen;
5614 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005615
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005616 item = PySequence_Fast_GET_ITEM(fseq, i);
5617 /* Convert item to Unicode. */
5618 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5619 PyErr_Format(PyExc_TypeError,
5620 "sequence item %zd: expected string or Unicode,"
5621 " %.80s found",
5622 i, Py_TYPE(item)->tp_name);
5623 goto onError;
5624 }
5625 item = PyUnicode_FromObject(item);
5626 if (item == NULL)
5627 goto onError;
5628 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005629
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005630 /* In case PyUnicode_FromObject() mutated seq. */
5631 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005632
Tim Peters8ce9f162004-08-27 01:49:32 +00005633 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 itemlen = PyUnicode_GET_SIZE(item);
5635 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005636 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005637 goto Overflow;
5638 if (i < seqlen - 1) {
5639 new_res_used += seplen;
5640 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005641 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005642 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005643 if (new_res_used > res_alloc) {
5644 /* double allocated size until it's big enough */
5645 do {
5646 res_alloc += res_alloc;
5647 if (res_alloc <= 0)
5648 goto Overflow;
5649 } while (new_res_used > res_alloc);
5650 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5651 Py_DECREF(item);
5652 goto onError;
5653 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005654 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005655 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005656
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005657 /* Copy item, and maybe the separator. */
5658 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5659 res_p += itemlen;
5660 if (i < seqlen - 1) {
5661 Py_UNICODE_COPY(res_p, sep, seplen);
5662 res_p += seplen;
5663 }
5664 Py_DECREF(item);
5665 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005667
Tim Peters05eba1f2004-08-27 21:32:02 +00005668 /* Shrink res to match the used area; this probably can't fail,
5669 * but it's cheap to check.
5670 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005671 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005672 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005673
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005674 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005675 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 return (PyObject *)res;
5678
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005681 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005682 Py_DECREF(item);
5683 /* fall through */
5684
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005685 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005687 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005688 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 return NULL;
5690}
5691
Tim Petersced69f82003-09-16 20:30:58 +00005692static
5693PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005694 Py_ssize_t left,
5695 Py_ssize_t right,
5696 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697{
5698 PyUnicodeObject *u;
5699
5700 if (left < 0)
5701 left = 0;
5702 if (right < 0)
5703 right = 0;
5704
Tim Peters7a29bd52001-09-12 03:03:31 +00005705 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 Py_INCREF(self);
5707 return self;
5708 }
5709
Neal Norwitze7d8be82008-07-31 17:17:14 +00005710 if (left > PY_SSIZE_T_MAX - self->length ||
5711 right > PY_SSIZE_T_MAX - (left + self->length)) {
5712 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5713 return NULL;
5714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 u = _PyUnicode_New(left + self->length + right);
5716 if (u) {
5717 if (left)
5718 Py_UNICODE_FILL(u->str, fill, left);
5719 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5720 if (right)
5721 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5722 }
5723
5724 return u;
5725}
5726
Antoine Pitrou64672132010-01-13 07:55:48 +00005727PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
5731 string = PyUnicode_FromObject(string);
5732 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
Antoine Pitrou64672132010-01-13 07:55:48 +00005735 list = stringlib_splitlines(
5736 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5737 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
5739 Py_DECREF(string);
5740 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741}
5742
Tim Petersced69f82003-09-16 20:30:58 +00005743static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005745 PyUnicodeObject *substring,
5746 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005749 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005752 return stringlib_split_whitespace(
5753 (PyObject*) self, self->str, self->length, maxcount
5754 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Antoine Pitrou64672132010-01-13 07:55:48 +00005756 return stringlib_split(
5757 (PyObject*) self, self->str, self->length,
5758 substring->str, substring->length,
5759 maxcount
5760 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761}
5762
Tim Petersced69f82003-09-16 20:30:58 +00005763static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005764PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005765 PyUnicodeObject *substring,
5766 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005769 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005770
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005772 return stringlib_rsplit_whitespace(
5773 (PyObject*) self, self->str, self->length, maxcount
5774 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775
Antoine Pitrou64672132010-01-13 07:55:48 +00005776 return stringlib_rsplit(
5777 (PyObject*) self, self->str, self->length,
5778 substring->str, substring->length,
5779 maxcount
5780 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005781}
5782
5783static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005785 PyUnicodeObject *str1,
5786 PyUnicodeObject *str2,
5787 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788{
5789 PyUnicodeObject *u;
5790
5791 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005792 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005793 else if (maxcount == 0 || self->length == 0)
5794 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Fredrik Lundh347ee272006-05-24 16:35:18 +00005796 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005797 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005798 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005799 if (str1->length == 0)
5800 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005801 if (str1->length == 1) {
5802 /* replace characters */
5803 Py_UNICODE u1, u2;
5804 if (!findchar(self->str, self->length, str1->str[0]))
5805 goto nothing;
5806 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5807 if (!u)
5808 return NULL;
5809 Py_UNICODE_COPY(u->str, self->str, self->length);
5810 u1 = str1->str[0];
5811 u2 = str2->str[0];
5812 for (i = 0; i < u->length; i++)
5813 if (u->str[i] == u1) {
5814 if (--maxcount < 0)
5815 break;
5816 u->str[i] = u2;
5817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005819 i = stringlib_find(
5820 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005822 if (i < 0)
5823 goto nothing;
5824 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5825 if (!u)
5826 return NULL;
5827 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005828
5829 /* change everything in-place, starting with this one */
5830 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5831 i += str1->length;
5832
5833 while ( --maxcount > 0) {
5834 i = stringlib_find(self->str+i, self->length-i,
5835 str1->str, str1->length,
5836 i);
5837 if (i == -1)
5838 break;
5839 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5840 i += str1->length;
5841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005844
Brett Cannona7f13ee2010-05-04 01:16:51 +00005845 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005846 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 Py_UNICODE *p;
5848
5849 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005850 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5851 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005852 if (n == 0)
5853 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005854 /* new_size = self->length + n * (str2->length - str1->length)); */
5855 delta = (str2->length - str1->length);
5856 if (delta == 0) {
5857 new_size = self->length;
5858 } else {
5859 product = n * (str2->length - str1->length);
5860 if ((product / (str2->length - str1->length)) != n) {
5861 PyErr_SetString(PyExc_OverflowError,
5862 "replace string is too long");
5863 return NULL;
5864 }
5865 new_size = self->length + product;
5866 if (new_size < 0) {
5867 PyErr_SetString(PyExc_OverflowError,
5868 "replace string is too long");
5869 return NULL;
5870 }
5871 }
5872 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005873 if (!u)
5874 return NULL;
5875 i = 0;
5876 p = u->str;
5877 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005878 while (n-- > 0) {
5879 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005880 j = stringlib_find(self->str+i, self->length-i,
5881 str1->str, str1->length,
5882 i);
5883 if (j == -1)
5884 break;
5885 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005886 /* copy unchanged part [i:j] */
5887 Py_UNICODE_COPY(p, self->str+i, j-i);
5888 p += j - i;
5889 }
5890 /* copy substitution string */
5891 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005892 Py_UNICODE_COPY(p, str2->str, str2->length);
5893 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005894 }
5895 i = j + str1->length;
5896 }
5897 if (i < self->length)
5898 /* copy tail [i:] */
5899 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005900 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005901 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005902 while (n > 0) {
5903 Py_UNICODE_COPY(p, str2->str, str2->length);
5904 p += str2->length;
5905 if (--n <= 0)
5906 break;
5907 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005909 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 }
5911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005913
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005914 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005915 /* nothing to replace; return original string (when possible) */
5916 if (PyUnicode_CheckExact(self)) {
5917 Py_INCREF(self);
5918 return (PyObject *) self;
5919 }
5920 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921}
5922
5923/* --- Unicode Object Methods --------------------------------------------- */
5924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005925PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005926 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927\n\
5928Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005929characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
5931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005932unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 return fixup(self, fixtitle);
5935}
5936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005937PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005938 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939\n\
5940Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005941have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
5943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005944unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 return fixup(self, fixcapitalize);
5947}
5948
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into words,
   replace every word in the list by its capitalized form, then join the
   list back into one string.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Break the string up into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each list slot for the capitalized version of its word */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *word = PyList_GET_ITEM(words, pos);

        result = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (result == NULL)
            goto done;          /* result is NULL: reported to the caller */
        Py_DECREF(word);
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Glue the capitalized words together again */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5986
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005987/* Argument converter. Coerces to a single unicode character */
5988
5989static int
5990convert_uc(PyObject *obj, void *addr)
5991{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005992 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5993 PyObject *uniobj;
5994 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005995
Benjamin Peterson857ce152009-01-31 16:29:18 +00005996 uniobj = PyUnicode_FromObject(obj);
5997 if (uniobj == NULL) {
5998 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005999 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006000 return 0;
6001 }
6002 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6003 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006004 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006005 Py_DECREF(uniobj);
6006 return 0;
6007 }
6008 unistr = PyUnicode_AS_UNICODE(uniobj);
6009 *fillcharloc = unistr[0];
6010 Py_DECREF(uniobj);
6011 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006012}
6013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006015 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006017Return S centered in a Unicode string of length width. Padding is\n\
6018done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
6020static PyObject *
6021unicode_center(PyUnicodeObject *self, PyObject *args)
6022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006023 Py_ssize_t marg, left;
6024 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006025 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Thomas Woutersde017742006-02-16 19:34:37 +00006027 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 return NULL;
6029
Tim Peters7a29bd52001-09-12 03:03:31 +00006030 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 Py_INCREF(self);
6032 return (PyObject*) self;
6033 }
6034
6035 marg = width - self->length;
6036 left = marg / 2 + (marg & width & 1);
6037
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006038 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039}
6040
Marc-André Lemburge5034372000-08-08 08:04:29 +00006041#if 0
6042
6043/* This code should go into some future Unicode collation support
6044 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006045 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006046
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006047/* speedy UTF-16 code point order comparison */
6048/* gleaned from: */
6049/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6050
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006051static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006052{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006053 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006054 0, 0, 0, 0, 0, 0, 0, 0,
6055 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006056 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006057};
6058
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059static int
6060unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6061{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 Py_UNICODE *s1 = str1->str;
6065 Py_UNICODE *s2 = str2->str;
6066
6067 len1 = str1->length;
6068 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006071 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006072
6073 c1 = *s1++;
6074 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006075
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006076 if (c1 > (1<<11) * 26)
6077 c1 += utf16Fixup[c1>>11];
6078 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006079 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006080 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006081
6082 if (c1 != c2)
6083 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006084
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 }
6087
6088 return (len1 < len2) ? -1 : (len1 != len2);
6089}
6090
Marc-André Lemburge5034372000-08-08 08:04:29 +00006091#else
6092
6093static int
6094unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006097
6098 Py_UNICODE *s1 = str1->str;
6099 Py_UNICODE *s2 = str2->str;
6100
6101 len1 = str1->length;
6102 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006103
Marc-André Lemburge5034372000-08-08 08:04:29 +00006104 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006105 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006106
Fredrik Lundh45714e92001-06-26 16:39:36 +00006107 c1 = *s1++;
6108 c2 = *s2++;
6109
6110 if (c1 != c2)
6111 return (c1 < c2) ? -1 : 1;
6112
Marc-André Lemburge5034372000-08-08 08:04:29 +00006113 len1--; len2--;
6114 }
6115
6116 return (len1 < len2) ? -1 : (len1 != len2);
6117}
6118
6119#endif
6120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006122 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123{
6124 PyUnicodeObject *u = NULL, *v = NULL;
6125 int result;
6126
6127 /* Coerce the two arguments */
6128 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6129 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6132 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006133 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Thomas Wouters7e474022000-07-16 12:04:32 +00006135 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006137 Py_DECREF(u);
6138 Py_DECREF(v);
6139 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 }
6141
6142 result = unicode_compare(u, v);
6143
6144 Py_DECREF(u);
6145 Py_DECREF(v);
6146 return result;
6147
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 Py_XDECREF(u);
6150 Py_XDECREF(v);
6151 return -1;
6152}
6153
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006154PyObject *PyUnicode_RichCompare(PyObject *left,
6155 PyObject *right,
6156 int op)
6157{
6158 int result;
6159
6160 result = PyUnicode_Compare(left, right);
6161 if (result == -1 && PyErr_Occurred())
6162 goto onError;
6163
6164 /* Convert the return value to a Boolean */
6165 switch (op) {
6166 case Py_EQ:
6167 result = (result == 0);
6168 break;
6169 case Py_NE:
6170 result = (result != 0);
6171 break;
6172 case Py_LE:
6173 result = (result <= 0);
6174 break;
6175 case Py_GE:
6176 result = (result >= 0);
6177 break;
6178 case Py_LT:
6179 result = (result == -1);
6180 break;
6181 case Py_GT:
6182 result = (result == 1);
6183 break;
6184 }
6185 return PyBool_FromLong(result);
6186
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006187 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006188
6189 /* Standard case
6190
6191 Type errors mean that PyUnicode_FromObject() could not convert
6192 one of the arguments (usually the right hand side) to Unicode,
6193 ie. we can't handle the comparison request. However, it is
6194 possible that the other object knows a comparison method, which
6195 is why we return Py_NotImplemented to give the other object a
6196 chance.
6197
6198 */
6199 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6200 PyErr_Clear();
6201 Py_INCREF(Py_NotImplemented);
6202 return Py_NotImplemented;
6203 }
6204 if (op != Py_EQ && op != Py_NE)
6205 return NULL;
6206
6207 /* Equality comparison.
6208
6209 This is a special case: we silence any PyExc_UnicodeDecodeError
6210 and instead turn it into a PyErr_UnicodeWarning.
6211
6212 */
6213 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6214 return NULL;
6215 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006216 if (PyErr_Warn(PyExc_UnicodeWarning,
6217 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006218 "Unicode equal comparison "
6219 "failed to convert both arguments to Unicode - "
6220 "interpreting them as being unequal" :
6221 "Unicode unequal comparison "
6222 "failed to convert both arguments to Unicode - "
6223 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006224 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006225 return NULL;
6226 result = (op == Py_NE);
6227 return PyBool_FromLong(result);
6228}
6229
Guido van Rossum403d68b2000-03-13 15:55:09 +00006230int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006231 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006232{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006233 PyObject *str, *sub;
6234 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006235
6236 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006237 sub = PyUnicode_FromObject(element);
6238 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006239 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006240 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006241
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006242 str = PyUnicode_FromObject(container);
6243 if (!str) {
6244 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006245 return -1;
6246 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006247
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006248 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006249
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006250 Py_DECREF(str);
6251 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006253 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006254}
6255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256/* Concat to string or Unicode object giving a new Unicode object. */
6257
6258PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006259 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
6261 PyUnicodeObject *u = NULL, *v = NULL, *w;
6262
6263 /* Coerce the two arguments */
6264 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6265 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006266 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6268 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006269 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
6271 /* Shortcuts */
6272 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006273 Py_DECREF(v);
6274 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 }
6276 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006277 Py_DECREF(u);
6278 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 }
6280
6281 /* Concat the two Unicode strings */
6282 w = _PyUnicode_New(u->length + v->length);
6283 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006284 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 Py_UNICODE_COPY(w->str, u->str, u->length);
6286 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6287
6288 Py_DECREF(u);
6289 Py_DECREF(v);
6290 return (PyObject *)w;
6291
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006292 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 Py_XDECREF(u);
6294 Py_XDECREF(v);
6295 return NULL;
6296}
6297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006299 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006301Return the number of non-overlapping occurrences of substring sub in\n\
6302Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006303interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304
6305static PyObject *
6306unicode_count(PyUnicodeObject *self, PyObject *args)
6307{
6308 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006309 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006310 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 PyObject *result;
6312
Guido van Rossumb8872e62000-05-09 14:14:27 +00006313 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006314 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 return NULL;
6316
6317 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006318 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006320 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006321
Antoine Pitrou64672132010-01-13 07:55:48 +00006322 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006323 result = PyInt_FromSsize_t(
6324 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006325 substring->str, substring->length,
6326 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006327 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
6329 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006330
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 return result;
6332}
6333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006334PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006335 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006337Encodes S using the codec registered for encoding. encoding defaults\n\
6338to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006339handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6341'xmlcharrefreplace' as well as any other name registered with\n\
6342codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006345unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006347 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 char *encoding = NULL;
6349 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006350 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006351
Benjamin Peterson332d7212009-09-18 21:14:55 +00006352 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6353 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006355 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006356 if (v == NULL)
6357 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006358 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006359 PyErr_Format(PyExc_TypeError,
6360 "encoder did not return a string/unicode object "
6361 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006362 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006363 Py_DECREF(v);
6364 return NULL;
6365 }
6366 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006367
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006368 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006369 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006370}
6371
6372PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006373 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006374\n\
6375Decodes S using the codec registered for encoding. encoding defaults\n\
6376to the default encoding. errors may be given to set a different error\n\
6377handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6378a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6379as well as any other name registerd with codecs.register_error that is\n\
6380able to handle UnicodeDecodeErrors.");
6381
6382static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006383unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006385 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006386 char *encoding = NULL;
6387 char *errors = NULL;
6388 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006389
Benjamin Peterson332d7212009-09-18 21:14:55 +00006390 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6391 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006392 return NULL;
6393 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006394 if (v == NULL)
6395 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006396 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397 PyErr_Format(PyExc_TypeError,
6398 "decoder did not return a string/unicode object "
6399 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006400 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006401 Py_DECREF(v);
6402 return NULL;
6403 }
6404 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006405
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006406 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408}
6409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006410PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006411 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412\n\
6413Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
6416static PyObject*
6417unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6418{
6419 Py_UNICODE *e;
6420 Py_UNICODE *p;
6421 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006422 Py_UNICODE *qe;
6423 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 PyUnicodeObject *u;
6425 int tabsize = 8;
6426
6427 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Thomas Wouters7e474022000-07-16 12:04:32 +00006430 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006431 i = 0; /* chars up to and including most recent \n or \r */
6432 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6433 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 for (p = self->str; p < e; p++)
6435 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006436 if (tabsize > 0) {
6437 incr = tabsize - (j % tabsize); /* cannot overflow */
6438 if (j > PY_SSIZE_T_MAX - incr)
6439 goto overflow1;
6440 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006441 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006444 if (j > PY_SSIZE_T_MAX - 1)
6445 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 j++;
6447 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006448 if (i > PY_SSIZE_T_MAX - j)
6449 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006451 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
6453 }
6454
Guido van Rossum5bdff602008-03-11 21:18:06 +00006455 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006456 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006457
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 /* Second pass: create output string and fill it */
6459 u = _PyUnicode_New(i + j);
6460 if (!u)
6461 return NULL;
6462
Guido van Rossum5bdff602008-03-11 21:18:06 +00006463 j = 0; /* same as in first pass */
6464 q = u->str; /* next output char */
6465 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
6467 for (p = self->str; p < e; p++)
6468 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006469 if (tabsize > 0) {
6470 i = tabsize - (j % tabsize);
6471 j += i;
6472 while (i--) {
6473 if (q >= qe)
6474 goto overflow2;
6475 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006476 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006477 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006478 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006479 else {
6480 if (q >= qe)
6481 goto overflow2;
6482 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006483 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 if (*p == '\n' || *p == '\r')
6485 j = 0;
6486 }
6487
6488 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006489
6490 overflow2:
6491 Py_DECREF(u);
6492 overflow1:
6493 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006497PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006498 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499\n\
6500Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006501such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502arguments start and end are interpreted as in slice notation.\n\
6503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
6506static PyObject *
6507unicode_find(PyUnicodeObject *self, PyObject *args)
6508{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006509 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006510 Py_ssize_t start;
6511 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006512 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
Facundo Batista57d56692007-11-16 18:04:14 +00006514 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006517 result = stringlib_find_slice(
6518 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6519 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6520 start, end
6521 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
6523 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006524
6525 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526}
6527
6528static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006529unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530{
6531 if (index < 0 || index >= self->length) {
6532 PyErr_SetString(PyExc_IndexError, "string index out of range");
6533 return NULL;
6534 }
6535
6536 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6537}
6538
6539static long
6540unicode_hash(PyUnicodeObject *self)
6541{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006542 /* Since Unicode objects compare equal to their ASCII string
6543 counterparts, they should use the individual character values
6544 as basis for their hash value. This is needed to assure that
6545 strings and Unicode objects behave in the same way as
6546 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547
Martin v. Löwis18e16552006-02-15 17:27:45 +00006548 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006549 register Py_UNICODE *p;
6550 register long x;
6551
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006553 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006554 len = PyUnicode_GET_SIZE(self);
6555 p = PyUnicode_AS_UNICODE(self);
6556 x = *p << 7;
6557 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006558 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006559 x ^= PyUnicode_GET_SIZE(self);
6560 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006561 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006562 self->hash = x;
6563 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564}
6565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006567 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006569Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
6571static PyObject *
6572unicode_index(PyUnicodeObject *self, PyObject *args)
6573{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006574 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006575 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006576 Py_ssize_t start;
6577 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
Facundo Batista57d56692007-11-16 18:04:14 +00006579 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006582 result = stringlib_find_slice(
6583 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6584 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6585 start, end
6586 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587
6588 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 if (result < 0) {
6591 PyErr_SetString(PyExc_ValueError, "substring not found");
6592 return NULL;
6593 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006594
Martin v. Löwis18e16552006-02-15 17:27:45 +00006595 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596}
6597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006598PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006599 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006601Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
6604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006605unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
6607 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6608 register const Py_UNICODE *e;
6609 int cased;
6610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 /* Shortcut for single character strings */
6612 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006613 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006615 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006616 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006617 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 e = p + PyUnicode_GET_SIZE(self);
6620 cased = 0;
6621 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006622 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006623
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006624 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6625 return PyBool_FromLong(0);
6626 else if (!cased && Py_UNICODE_ISLOWER(ch))
6627 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006629 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006632PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006633 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006635Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
6638static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006639unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640{
6641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6642 register const Py_UNICODE *e;
6643 int cased;
6644
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 /* Shortcut for single character strings */
6646 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006647 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006649 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006650 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006651 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006652
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 e = p + PyUnicode_GET_SIZE(self);
6654 cased = 0;
6655 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006657
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006658 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6659 return PyBool_FromLong(0);
6660 else if (!cased && Py_UNICODE_ISUPPER(ch))
6661 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006663 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006667 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006669Return True if S is a titlecased string and there is at least one\n\
6670character in S, i.e. upper- and titlecase characters may only\n\
6671follow uncased characters and lowercase characters only cased ones.\n\
6672Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673
6674static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006675unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676{
6677 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6678 register const Py_UNICODE *e;
6679 int cased, previous_is_cased;
6680
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 /* Shortcut for single character strings */
6682 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006683 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6684 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006686 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006687 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006688 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006689
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 e = p + PyUnicode_GET_SIZE(self);
6691 cased = 0;
6692 previous_is_cased = 0;
6693 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006694 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006695
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006696 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6697 if (previous_is_cased)
6698 return PyBool_FromLong(0);
6699 previous_is_cased = 1;
6700 cased = 1;
6701 }
6702 else if (Py_UNICODE_ISLOWER(ch)) {
6703 if (!previous_is_cased)
6704 return PyBool_FromLong(0);
6705 previous_is_cased = 1;
6706 cased = 1;
6707 }
6708 else
6709 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006711 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712}
6713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006714PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006715 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006717Return True if all characters in S are whitespace\n\
6718and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719
6720static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006721unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722{
6723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6724 register const Py_UNICODE *e;
6725
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 /* Shortcut for single character strings */
6727 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006728 Py_UNICODE_ISSPACE(*p))
6729 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006731 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006732 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 e = p + PyUnicode_GET_SIZE(self);
6736 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006737 if (!Py_UNICODE_ISSPACE(*p))
6738 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006740 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006743PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006744 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006745\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006746Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006747and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006748
6749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006750unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006751{
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6754
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006755 /* Shortcut for single character strings */
6756 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006757 Py_UNICODE_ISALPHA(*p))
6758 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006759
6760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006761 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006763
6764 e = p + PyUnicode_GET_SIZE(self);
6765 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006766 if (!Py_UNICODE_ISALPHA(*p))
6767 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006768 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006769 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006770}
6771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006772PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006773 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006774\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006775Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006776and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006777
6778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006779unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006780{
6781 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6782 register const Py_UNICODE *e;
6783
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006784 /* Shortcut for single character strings */
6785 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006786 Py_UNICODE_ISALNUM(*p))
6787 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006788
6789 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006790 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792
6793 e = p + PyUnicode_GET_SIZE(self);
6794 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006795 if (!Py_UNICODE_ISALNUM(*p))
6796 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006798 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006799}
6800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006801PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006802 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006804Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006805False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
6807static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006808unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
6810 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6811 register const Py_UNICODE *e;
6812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 /* Shortcut for single character strings */
6814 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006815 Py_UNICODE_ISDECIMAL(*p))
6816 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006819 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 e = p + PyUnicode_GET_SIZE(self);
6823 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006824 if (!Py_UNICODE_ISDECIMAL(*p))
6825 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006827 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828}
6829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006830PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006831 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006833Return True if all characters in S are digits\n\
6834and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
6836static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006837unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838{
6839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840 register const Py_UNICODE *e;
6841
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 /* Shortcut for single character strings */
6843 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006844 Py_UNICODE_ISDIGIT(*p))
6845 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006847 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006848 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006849 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 e = p + PyUnicode_GET_SIZE(self);
6852 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006853 if (!Py_UNICODE_ISDIGIT(*p))
6854 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857}
6858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006860 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006863False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
6865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006866unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
6868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869 register const Py_UNICODE *e;
6870
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 /* Shortcut for single character strings */
6872 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006873 Py_UNICODE_ISNUMERIC(*p))
6874 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006876 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006877 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006878 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 e = p + PyUnicode_GET_SIZE(self);
6881 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006882 if (!Py_UNICODE_ISNUMERIC(*p))
6883 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886}
6887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006888PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006889 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890\n\
6891Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006892iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
6894static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006895unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006897 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898}
6899
Martin v. Löwis18e16552006-02-15 17:27:45 +00006900static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901unicode_length(PyUnicodeObject *self)
6902{
6903 return self->length;
6904}
6905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006907 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006909Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006910done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
6912static PyObject *
6913unicode_ljust(PyUnicodeObject *self, PyObject *args)
6914{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006915 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006916 Py_UNICODE fillchar = ' ';
6917
Martin v. Löwis412fb672006-04-13 06:34:32 +00006918 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 return NULL;
6920
Tim Peters7a29bd52001-09-12 03:03:31 +00006921 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 Py_INCREF(self);
6923 return (PyObject*) self;
6924 }
6925
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006926 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927}
6928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006930 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006932Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
6934static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006935unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 return fixup(self, fixlower);
6938}
6939
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6948
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006949/* externally visible for str.strip(unicode) */
6950PyObject *
6951_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6952{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006953 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6954 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6955 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6956 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6957 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006958
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006959 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006960
Benjamin Peterson857ce152009-01-31 16:29:18 +00006961 i = 0;
6962 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006963 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6964 i++;
6965 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006966 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006967
Benjamin Peterson857ce152009-01-31 16:29:18 +00006968 j = len;
6969 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006970 do {
6971 j--;
6972 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6973 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006974 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006975
Benjamin Peterson857ce152009-01-31 16:29:18 +00006976 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006977 Py_INCREF(self);
6978 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006979 }
6980 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006982}
6983
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984
6985static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006986do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006988 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6989 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990
Benjamin Peterson857ce152009-01-31 16:29:18 +00006991 i = 0;
6992 if (striptype != RIGHTSTRIP) {
6993 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6994 i++;
6995 }
6996 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006997
Benjamin Peterson857ce152009-01-31 16:29:18 +00006998 j = len;
6999 if (striptype != LEFTSTRIP) {
7000 do {
7001 j--;
7002 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7003 j++;
7004 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007005
Benjamin Peterson857ce152009-01-31 16:29:18 +00007006 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7007 Py_INCREF(self);
7008 return (PyObject*)self;
7009 }
7010 else
7011 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012}
7013
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007014
7015static PyObject *
7016do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007018 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007019
Benjamin Peterson857ce152009-01-31 16:29:18 +00007020 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7021 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007022
Benjamin Peterson857ce152009-01-31 16:29:18 +00007023 if (sep != NULL && sep != Py_None) {
7024 if (PyUnicode_Check(sep))
7025 return _PyUnicode_XStrip(self, striptype, sep);
7026 else if (PyString_Check(sep)) {
7027 PyObject *res;
7028 sep = PyUnicode_FromObject(sep);
7029 if (sep==NULL)
7030 return NULL;
7031 res = _PyUnicode_XStrip(self, striptype, sep);
7032 Py_DECREF(sep);
7033 return res;
7034 }
7035 else {
7036 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007037 "%s arg must be None, unicode or str",
7038 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007039 return NULL;
7040 }
7041 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007042
Benjamin Peterson857ce152009-01-31 16:29:18 +00007043 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007044}
7045
7046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007047PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007048 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049\n\
7050Return a copy of the string S with leading and trailing\n\
7051whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007052If chars is given and not None, remove characters in chars instead.\n\
7053If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054
7055static PyObject *
7056unicode_strip(PyUnicodeObject *self, PyObject *args)
7057{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007058 if (PyTuple_GET_SIZE(args) == 0)
7059 return do_strip(self, BOTHSTRIP); /* Common case */
7060 else
7061 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062}
7063
7064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007065PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007066 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067\n\
7068Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007069If chars is given and not None, remove characters in chars instead.\n\
7070If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071
7072static PyObject *
7073unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7074{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007075 if (PyTuple_GET_SIZE(args) == 0)
7076 return do_strip(self, LEFTSTRIP); /* Common case */
7077 else
7078 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079}
7080
7081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007082PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007083 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084\n\
7085Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007086If chars is given and not None, remove characters in chars instead.\n\
7087If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007088
7089static PyObject *
7090unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7091{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007092 if (PyTuple_GET_SIZE(args) == 0)
7093 return do_strip(self, RIGHTSTRIP); /* Common case */
7094 else
7095 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007096}
7097
7098
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007100unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101{
7102 PyUnicodeObject *u;
7103 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007104 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007105 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106
7107 if (len < 0)
7108 len = 0;
7109
Tim Peters7a29bd52001-09-12 03:03:31 +00007110 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 /* no repeat, return original string */
7112 Py_INCREF(str);
7113 return (PyObject*) str;
7114 }
Tim Peters8f422462000-09-09 06:13:41 +00007115
7116 /* ensure # of chars needed doesn't overflow int and # of bytes
7117 * needed doesn't overflow size_t
7118 */
7119 nchars = len * str->length;
7120 if (len && nchars / len != str->length) {
7121 PyErr_SetString(PyExc_OverflowError,
7122 "repeated string is too long");
7123 return NULL;
7124 }
7125 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7126 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7127 PyErr_SetString(PyExc_OverflowError,
7128 "repeated string is too long");
7129 return NULL;
7130 }
7131 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 if (!u)
7133 return NULL;
7134
7135 p = u->str;
7136
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007137 if (str->length == 1 && len > 0) {
7138 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007139 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007140 Py_ssize_t done = 0; /* number of characters copied this far */
7141 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007142 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007143 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007144 }
7145 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007146 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007147 Py_UNICODE_COPY(p+done, p, n);
7148 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007149 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151
7152 return (PyObject*) u;
7153}
7154
7155PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007156 PyObject *subobj,
7157 PyObject *replobj,
7158 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
7160 PyObject *self;
7161 PyObject *str1;
7162 PyObject *str2;
7163 PyObject *result;
7164
7165 self = PyUnicode_FromObject(obj);
7166 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 str1 = PyUnicode_FromObject(subobj);
7169 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007170 Py_DECREF(self);
7171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 }
7173 str2 = PyUnicode_FromObject(replobj);
7174 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007175 Py_DECREF(self);
7176 Py_DECREF(str1);
7177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 }
Tim Petersced69f82003-09-16 20:30:58 +00007179 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007180 (PyUnicodeObject *)str1,
7181 (PyUnicodeObject *)str2,
7182 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 Py_DECREF(self);
7184 Py_DECREF(str1);
7185 Py_DECREF(str2);
7186 return result;
7187}
7188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007189PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007190 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191\n\
7192Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007193old replaced by new. If the optional argument count is\n\
7194given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195
7196static PyObject*
7197unicode_replace(PyUnicodeObject *self, PyObject *args)
7198{
7199 PyUnicodeObject *str1;
7200 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 PyObject *result;
7203
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
7206 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7207 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007210 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007211 Py_DECREF(str1);
7212 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215 result = replace(self, str1, str2, maxcount);
7216
7217 Py_DECREF(str1);
7218 Py_DECREF(str2);
7219 return result;
7220}
7221
7222static
7223PyObject *unicode_repr(PyObject *unicode)
7224{
7225 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007226 PyUnicode_GET_SIZE(unicode),
7227 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228}
7229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007230PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007231 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232\n\
7233Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007234such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235arguments start and end are interpreted as in slice notation.\n\
7236\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007237Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238
7239static PyObject *
7240unicode_rfind(PyUnicodeObject *self, PyObject *args)
7241{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007242 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007243 Py_ssize_t start;
7244 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007245 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
Facundo Batista57d56692007-11-16 18:04:14 +00007247 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007250 result = stringlib_rfind_slice(
7251 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7252 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7253 start, end
7254 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255
7256 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007257
7258 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259}
7260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007262 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007264Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
7266static PyObject *
7267unicode_rindex(PyUnicodeObject *self, PyObject *args)
7268{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007269 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007270 Py_ssize_t start;
7271 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007272 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
Facundo Batista57d56692007-11-16 18:04:14 +00007274 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007277 result = stringlib_rfind_slice(
7278 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7279 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7280 start, end
7281 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
7283 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 if (result < 0) {
7286 PyErr_SetString(PyExc_ValueError, "substring not found");
7287 return NULL;
7288 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007293 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007295Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007296done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject *
7299unicode_rjust(PyUnicodeObject *self, PyObject *args)
7300{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007301 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007302 Py_UNICODE fillchar = ' ';
7303
Martin v. Löwis412fb672006-04-13 06:34:32 +00007304 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 return NULL;
7306
Tim Peters7a29bd52001-09-12 03:03:31 +00007307 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 Py_INCREF(self);
7309 return (PyObject*) self;
7310 }
7311
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007312 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313}
7314
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007316unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317{
7318 /* standard clamping */
7319 if (start < 0)
7320 start = 0;
7321 if (end < 0)
7322 end = 0;
7323 if (end > self->length)
7324 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007325 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 /* full slice, return original string */
7327 Py_INCREF(self);
7328 return (PyObject*) self;
7329 }
7330 if (start > end)
7331 start = end;
7332 /* copy slice */
7333 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007334 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335}
7336
7337PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007338 PyObject *sep,
7339 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340{
7341 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007342
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 s = PyUnicode_FromObject(s);
7344 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007345 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007346 if (sep != NULL) {
7347 sep = PyUnicode_FromObject(sep);
7348 if (sep == NULL) {
7349 Py_DECREF(s);
7350 return NULL;
7351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 }
7353
7354 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7355
7356 Py_DECREF(s);
7357 Py_XDECREF(sep);
7358 return result;
7359}
7360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007362 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363\n\
7364Return a list of the words in S, using sep as the\n\
7365delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007366splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007367whitespace string is a separator and empty strings are\n\
7368removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369
7370static PyObject*
7371unicode_split(PyUnicodeObject *self, PyObject *args)
7372{
7373 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007374 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
Martin v. Löwis18e16552006-02-15 17:27:45 +00007376 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 return NULL;
7378
7379 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007380 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007382 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007384 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385}
7386
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007387PyObject *
7388PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7389{
7390 PyObject* str_obj;
7391 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007392 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007393
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007394 str_obj = PyUnicode_FromObject(str_in);
7395 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007396 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007397 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007398 if (!sep_obj) {
7399 Py_DECREF(str_obj);
7400 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007401 }
7402
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007403 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007404 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7405 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7406 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007407
Fredrik Lundhb9479482006-05-26 17:22:38 +00007408 Py_DECREF(sep_obj);
7409 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007410
7411 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007412}
7413
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007414
7415PyObject *
7416PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7417{
7418 PyObject* str_obj;
7419 PyObject* sep_obj;
7420 PyObject* out;
7421
7422 str_obj = PyUnicode_FromObject(str_in);
7423 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007424 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007425 sep_obj = PyUnicode_FromObject(sep_in);
7426 if (!sep_obj) {
7427 Py_DECREF(str_obj);
7428 return NULL;
7429 }
7430
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007431 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007432 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7433 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7434 );
7435
7436 Py_DECREF(sep_obj);
7437 Py_DECREF(str_obj);
7438
7439 return out;
7440}
7441
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007442PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007443 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007444\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007445Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007446the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007447found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007448
7449static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007450unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007451{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007452 return PyUnicode_Partition((PyObject *)self, separator);
7453}
7454
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007455PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007456 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007457\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007458Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007459the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007460separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007461
7462static PyObject*
7463unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7464{
7465 return PyUnicode_RPartition((PyObject *)self, separator);
7466}
7467
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007468PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007469 PyObject *sep,
7470 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007471{
7472 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007473
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007474 s = PyUnicode_FromObject(s);
7475 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007476 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007477 if (sep != NULL) {
7478 sep = PyUnicode_FromObject(sep);
7479 if (sep == NULL) {
7480 Py_DECREF(s);
7481 return NULL;
7482 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007483 }
7484
7485 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7486
7487 Py_DECREF(s);
7488 Py_XDECREF(sep);
7489 return result;
7490}
7491
7492PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007493 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007494\n\
7495Return a list of the words in S, using sep as the\n\
7496delimiter string, starting at the end of the string and\n\
7497working to the front. If maxsplit is given, at most maxsplit\n\
7498splits are done. If sep is not specified, any whitespace string\n\
7499is a separator.");
7500
7501static PyObject*
7502unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7503{
7504 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007506
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007508 return NULL;
7509
7510 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007511 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007512 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007513 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007514 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007515 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007516}
7517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007519 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520\n\
7521Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007522Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
7525static PyObject*
7526unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7527{
Guido van Rossum86662912000-04-11 15:38:46 +00007528 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
Guido van Rossum86662912000-04-11 15:38:46 +00007530 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 return NULL;
7532
Guido van Rossum86662912000-04-11 15:38:46 +00007533 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534}
7535
7536static
7537PyObject *unicode_str(PyUnicodeObject *self)
7538{
Fred Drakee4315f52000-05-09 19:53:39 +00007539 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540}
7541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007542PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007543 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544\n\
7545Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007546and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547
7548static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007549unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 return fixup(self, fixswapcase);
7552}
7553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007555 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556\n\
7557Return a copy of the string S, where all characters have been mapped\n\
7558through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007559Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7560Unmapped characters are left untouched. Characters mapped to None\n\
7561are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
7563static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007564unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565{
Tim Petersced69f82003-09-16 20:30:58 +00007566 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007567 self->length,
7568 table,
7569 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570}
7571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007572PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007573 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007575Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007578unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 return fixup(self, fixupper);
7581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007584 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585\n\
Georg Brandl98064072008-09-09 19:26:00 +00007586Pad a numeric string S with zeros on the left, to fill a field\n\
7587of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
7589static PyObject *
7590unicode_zfill(PyUnicodeObject *self, PyObject *args)
7591{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007592 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 PyUnicodeObject *u;
7594
Martin v. Löwis18e16552006-02-15 17:27:45 +00007595 Py_ssize_t width;
7596 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 return NULL;
7598
7599 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007600 if (PyUnicode_CheckExact(self)) {
7601 Py_INCREF(self);
7602 return (PyObject*) self;
7603 }
7604 else
7605 return PyUnicode_FromUnicode(
7606 PyUnicode_AS_UNICODE(self),
7607 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007608 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 }
7610
7611 fill = width - self->length;
7612
7613 u = pad(self, fill, 0, '0');
7614
Walter Dörwald068325e2002-04-15 13:36:47 +00007615 if (u == NULL)
7616 return NULL;
7617
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 if (u->str[fill] == '+' || u->str[fill] == '-') {
7619 /* move sign to beginning of string */
7620 u->str[0] = u->str[fill];
7621 u->str[fill] = '0';
7622 }
7623
7624 return (PyObject*) u;
7625}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
   value of numfree, which is defined outside this block, as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007635PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007636 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007638Return True if S starts with the specified prefix, False otherwise.\n\
7639With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007640With optional end, stop comparing S at that position.\n\
7641prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642
7643static PyObject *
7644unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007645 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646{
Georg Brandl24250812006-06-09 18:45:48 +00007647 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007650 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007651 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
Georg Brandl24250812006-06-09 18:45:48 +00007653 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007654 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7655 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007656 if (PyTuple_Check(subobj)) {
7657 Py_ssize_t i;
7658 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7659 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007660 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007661 if (substring == NULL)
7662 return NULL;
7663 result = tailmatch(self, substring, start, end, -1);
7664 Py_DECREF(substring);
7665 if (result) {
7666 Py_RETURN_TRUE;
7667 }
7668 }
7669 /* nothing matched */
7670 Py_RETURN_FALSE;
7671 }
7672 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007674 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007675 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007677 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
7680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007681PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007682 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007684Return True if S ends with the specified suffix, False otherwise.\n\
7685With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007686With optional end, stop comparing S at that position.\n\
7687suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688
7689static PyObject *
7690unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007691 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692{
Georg Brandl24250812006-06-09 18:45:48 +00007693 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007695 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007696 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007697 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698
Georg Brandl24250812006-06-09 18:45:48 +00007699 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007700 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7701 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007702 if (PyTuple_Check(subobj)) {
7703 Py_ssize_t i;
7704 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7705 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007706 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007707 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007708 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007709 result = tailmatch(self, substring, start, end, +1);
7710 Py_DECREF(substring);
7711 if (result) {
7712 Py_RETURN_TRUE;
7713 }
7714 }
7715 Py_RETURN_FALSE;
7716 }
7717 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
Georg Brandl24250812006-06-09 18:45:48 +00007721 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007723 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724}
7725
7726
Eric Smitha9f7d622008-02-17 19:46:49 +00007727/* Implements do_string_format, which is unicode because of stringlib */
7728#include "stringlib/string_format.h"
7729
7730PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007731 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007732\n\
7733");
7734
Eric Smithdc13b792008-05-30 18:10:04 +00007735static PyObject *
7736unicode__format__(PyObject *self, PyObject *args)
7737{
7738 PyObject *format_spec;
7739 PyObject *result = NULL;
7740 PyObject *tmp = NULL;
7741
7742 /* If 2.x, convert format_spec to the same type as value */
7743 /* This is to allow things like u''.format('') */
7744 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7745 goto done;
7746 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7747 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007748 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007749 goto done;
7750 }
7751 tmp = PyObject_Unicode(format_spec);
7752 if (tmp == NULL)
7753 goto done;
7754 format_spec = tmp;
7755
7756 result = _PyUnicode_FormatAdvanced(self,
7757 PyUnicode_AS_UNICODE(format_spec),
7758 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007759 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007760 Py_XDECREF(tmp);
7761 return result;
7762}
7763
Eric Smitha9f7d622008-02-17 19:46:49 +00007764PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007765 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007766\n\
7767");
7768
Robert Schuppenies901c9972008-06-10 10:10:31 +00007769static PyObject *
7770unicode__sizeof__(PyUnicodeObject *v)
7771{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007772 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7773 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007774}
7775
7776PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007777 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007778\n\
7779");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007780
7781static PyObject *
7782unicode_getnewargs(PyUnicodeObject *v)
7783{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007784 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007785}
7786
7787
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788static PyMethodDef unicode_methods[] = {
7789
7790 /* Order is according to common usage: often used methods should
7791 appear first, since lookup is done sequentially. */
7792
Benjamin Peterson332d7212009-09-18 21:14:55 +00007793 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007794 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7795 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007796 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007797 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7798 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7799 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7800 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7801 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7802 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7803 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007804 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007805 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7806 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7807 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007808 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007809 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007810/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7811 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7812 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7813 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007814 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007815 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007816 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007817 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007818 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7819 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7820 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7821 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7822 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7823 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7824 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7825 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7826 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7827 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7828 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7829 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7830 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7831 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007832 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007833 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7834 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7835 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7836 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007837 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007838#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007839 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840#endif
7841
7842#if 0
7843 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007844 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845#endif
7846
Benjamin Peterson857ce152009-01-31 16:29:18 +00007847 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 {NULL, NULL}
7849};
7850
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007851static PyObject *
7852unicode_mod(PyObject *v, PyObject *w)
7853{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007854 if (!PyUnicode_Check(v)) {
7855 Py_INCREF(Py_NotImplemented);
7856 return Py_NotImplemented;
7857 }
7858 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007859}
7860
7861static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007862 0, /*nb_add*/
7863 0, /*nb_subtract*/
7864 0, /*nb_multiply*/
7865 0, /*nb_divide*/
7866 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007867};
7868
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007870 (lenfunc) unicode_length, /* sq_length */
7871 PyUnicode_Concat, /* sq_concat */
7872 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7873 (ssizeargfunc) unicode_getitem, /* sq_item */
7874 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7875 0, /* sq_ass_item */
7876 0, /* sq_ass_slice */
7877 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878};
7879
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007880static PyObject*
7881unicode_subscript(PyUnicodeObject* self, PyObject* item)
7882{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007883 if (PyIndex_Check(item)) {
7884 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007885 if (i == -1 && PyErr_Occurred())
7886 return NULL;
7887 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007888 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007889 return unicode_getitem(self, i);
7890 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007891 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007892 Py_UNICODE* source_buf;
7893 Py_UNICODE* result_buf;
7894 PyObject* result;
7895
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007896 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007897 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007898 return NULL;
7899 }
7900
7901 if (slicelength <= 0) {
7902 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007903 } else if (start == 0 && step == 1 && slicelength == self->length &&
7904 PyUnicode_CheckExact(self)) {
7905 Py_INCREF(self);
7906 return (PyObject *)self;
7907 } else if (step == 1) {
7908 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007909 } else {
7910 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007911 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7912 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007913
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007914 if (result_buf == NULL)
7915 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007916
7917 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7918 result_buf[i] = source_buf[cur];
7919 }
Tim Petersced69f82003-09-16 20:30:58 +00007920
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007921 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007922 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007923 return result;
7924 }
7925 } else {
7926 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7927 return NULL;
7928 }
7929}
7930
7931static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007932 (lenfunc)unicode_length, /* mp_length */
7933 (binaryfunc)unicode_subscript, /* mp_subscript */
7934 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007935};
7936
Martin v. Löwis18e16552006-02-15 17:27:45 +00007937static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007939 Py_ssize_t index,
7940 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941{
7942 if (index != 0) {
7943 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007944 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 return -1;
7946 }
7947 *ptr = (void *) self->str;
7948 return PyUnicode_GET_DATA_SIZE(self);
7949}
7950
Martin v. Löwis18e16552006-02-15 17:27:45 +00007951static Py_ssize_t
7952unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007953 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
7955 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007956 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 return -1;
7958}
7959
7960static int
7961unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007962 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963{
7964 if (lenp)
7965 *lenp = PyUnicode_GET_DATA_SIZE(self);
7966 return 1;
7967}
7968
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007969static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007971 Py_ssize_t index,
7972 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973{
7974 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 if (index != 0) {
7977 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007978 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 return -1;
7980 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007981 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007983 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007984 *ptr = (void *) PyString_AS_STRING(str);
7985 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986}
7987
7988/* Helpers for PyUnicode_Format() */
7989
7990static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007991getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007993 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007995 (*p_argidx)++;
7996 if (arglen < 0)
7997 return args;
7998 else
7999 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 }
8001 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008002 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 return NULL;
8004}
8005
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST (1<<0)  /* '-' */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011
Martin v. Löwis18e16552006-02-15 17:27:45 +00008012static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008013strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015 register Py_ssize_t i;
8016 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008018 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 return len;
8021}
8022
Neal Norwitzfc76d632006-01-10 06:03:13 +00008023static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008024longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8025{
Tim Peters15231542006-02-16 01:08:01 +00008026 Py_ssize_t result;
8027
Neal Norwitzfc76d632006-01-10 06:03:13 +00008028 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008029 result = strtounicode(buffer, (char *)buffer);
8030 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008031}
8032
Guido van Rossum078151d2002-08-11 04:24:12 +00008033/* XXX To save some code duplication, formatfloat/long/int could have been
8034 shared with stringobject.c, converting from 8-bit to Unicode after the
8035 formatting is done. */
8036
Mark Dickinson18cfada2009-11-23 18:46:41 +00008037/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8038
8039static PyObject *
8040formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008042 char *p;
8043 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008045
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 x = PyFloat_AsDouble(v);
8047 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008048 return NULL;
8049
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008051 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008052
Mark Dickinson18cfada2009-11-23 18:46:41 +00008053 p = PyOS_double_to_string(x, type, prec,
8054 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8055 if (p == NULL)
8056 return NULL;
8057 result = PyUnicode_FromStringAndSize(p, strlen(p));
8058 PyMem_Free(p);
8059 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060}
8061
Tim Peters38fd5b62000-09-21 05:43:11 +00008062static PyObject*
8063formatlong(PyObject *val, int flags, int prec, int type)
8064{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008065 char *buf;
8066 int i, len;
8067 PyObject *str; /* temporary string object. */
8068 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008069
Benjamin Peterson857ce152009-01-31 16:29:18 +00008070 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8071 if (!str)
8072 return NULL;
8073 result = _PyUnicode_New(len);
8074 if (!result) {
8075 Py_DECREF(str);
8076 return NULL;
8077 }
8078 for (i = 0; i < len; i++)
8079 result->str[i] = buf[i];
8080 result->str[len] = 0;
8081 Py_DECREF(str);
8082 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008083}
8084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085static int
8086formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008087 size_t buflen,
8088 int flags,
8089 int prec,
8090 int type,
8091 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008093 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008094 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8095 * + 1 + 1
8096 * = 24
8097 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008098 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008099 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 long x;
8101
8102 x = PyInt_AsLong(v);
8103 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008104 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008105 if (x < 0 && type == 'u') {
8106 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008107 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008108 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8109 sign = "-";
8110 else
8111 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008113 prec = 1;
8114
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008115 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8116 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008117 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008118 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008119 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008120 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008121 return -1;
8122 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008123
8124 if ((flags & F_ALT) &&
8125 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008126 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008127 * of issues that cause pain:
8128 * - when 0 is being converted, the C standard leaves off
8129 * the '0x' or '0X', which is inconsistent with other
8130 * %#x/%#X conversions and inconsistent with Python's
8131 * hex() function
8132 * - there are platforms that violate the standard and
8133 * convert 0 with the '0x' or '0X'
8134 * (Metrowerks, Compaq Tru64)
8135 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008136 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008137 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008138 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008139 * We can achieve the desired consistency by inserting our
8140 * own '0x' or '0X' prefix, and substituting %x/%X in place
8141 * of %#x/%#X.
8142 *
8143 * Note that this is the same approach as used in
8144 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008145 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008146 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8147 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008148 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008149 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008150 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8151 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008152 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008153 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008154 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008155 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008156 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008157 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158}
8159
8160static int
8161formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008162 size_t buflen,
8163 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164{
Ezio Melotti32125152010-02-25 17:36:04 +00008165 PyObject *unistr;
8166 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008167 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008168 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008169 if (PyUnicode_GET_SIZE(v) != 1)
8170 goto onError;
8171 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008174 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008175 if (PyString_GET_SIZE(v) != 1)
8176 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008177 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8178 with a UnicodeDecodeError if 'char' is not decodable with the
8179 default encoding (usually ASCII, but it might be something else) */
8180 str = PyString_AS_STRING(v);
8181 if ((unsigned char)str[0] > 0x7F) {
8182 /* the char is not ASCII; try to decode the string using the
8183 default encoding and return -1 to let the UnicodeDecodeError
8184 be raised if the string can't be decoded */
8185 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8186 if (unistr == NULL)
8187 return -1;
8188 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8189 Py_DECREF(unistr);
8190 }
8191 else
8192 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
8195 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008196 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008198 x = PyInt_AsLong(v);
8199 if (x == -1 && PyErr_Occurred())
8200 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008201#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008202 if (x < 0 || x > 0x10ffff) {
8203 PyErr_SetString(PyExc_OverflowError,
8204 "%c arg not in range(0x110000) "
8205 "(wide Python build)");
8206 return -1;
8207 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008208#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008209 if (x < 0 || x > 0xffff) {
8210 PyErr_SetString(PyExc_OverflowError,
8211 "%c arg not in range(0x10000) "
8212 "(narrow Python build)");
8213 return -1;
8214 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008215#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008216 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 }
8218 buf[1] = '\0';
8219 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008220
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008221 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008222 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008223 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008224 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225}
8226
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE slots in the scratch buffer
   that receives formatted ints and chars.

   XXX The value is a magic number.  Every formatting routine checks
   bounds so the buffer cannot overflow, but allocating a buffer of the
   right size for each conversion might be a better solution.  For now,
   the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
8236
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008238 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
8240 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008241 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 int args_owned = 0;
8243 PyUnicodeObject *result = NULL;
8244 PyObject *dict = NULL;
8245 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008246
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008248 PyErr_BadInternalCall();
8249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 }
8251 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008252 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 fmt = PyUnicode_AS_UNICODE(uformat);
8255 fmtcnt = PyUnicode_GET_SIZE(uformat);
8256
8257 reslen = rescnt = fmtcnt + 100;
8258 result = _PyUnicode_New(reslen);
8259 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 res = PyUnicode_AS_UNICODE(result);
8262
8263 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008264 arglen = PyTuple_Size(args);
8265 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 }
8267 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008268 arglen = -1;
8269 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 }
Christian Heimese93237d2007-12-19 02:37:44 +00008271 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008272 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008273 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274
8275 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008276 if (*fmt != '%') {
8277 if (--rescnt < 0) {
8278 rescnt = fmtcnt + 100;
8279 reslen += rescnt;
8280 if (_PyUnicode_Resize(&result, reslen) < 0)
8281 goto onError;
8282 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8283 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008284 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008285 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008286 }
8287 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008288 /* Got a format specifier */
8289 int flags = 0;
8290 Py_ssize_t width = -1;
8291 int prec = -1;
8292 Py_UNICODE c = '\0';
8293 Py_UNICODE fill;
8294 int isnumok;
8295 PyObject *v = NULL;
8296 PyObject *temp = NULL;
8297 Py_UNICODE *pbuf;
8298 Py_UNICODE sign;
8299 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008300 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008301
8302 fmt++;
8303 if (*fmt == '(') {
8304 Py_UNICODE *keystart;
8305 Py_ssize_t keylen;
8306 PyObject *key;
8307 int pcount = 1;
8308
8309 if (dict == NULL) {
8310 PyErr_SetString(PyExc_TypeError,
8311 "format requires a mapping");
8312 goto onError;
8313 }
8314 ++fmt;
8315 --fmtcnt;
8316 keystart = fmt;
8317 /* Skip over balanced parentheses */
8318 while (pcount > 0 && --fmtcnt >= 0) {
8319 if (*fmt == ')')
8320 --pcount;
8321 else if (*fmt == '(')
8322 ++pcount;
8323 fmt++;
8324 }
8325 keylen = fmt - keystart - 1;
8326 if (fmtcnt < 0 || pcount > 0) {
8327 PyErr_SetString(PyExc_ValueError,
8328 "incomplete format key");
8329 goto onError;
8330 }
8331#if 0
8332 /* keys are converted to strings using UTF-8 and
8333 then looked up since Python uses strings to hold
8334 variables names etc. in its namespaces and we
8335 wouldn't want to break common idioms. */
8336 key = PyUnicode_EncodeUTF8(keystart,
8337 keylen,
8338 NULL);
8339#else
8340 key = PyUnicode_FromUnicode(keystart, keylen);
8341#endif
8342 if (key == NULL)
8343 goto onError;
8344 if (args_owned) {
8345 Py_DECREF(args);
8346 args_owned = 0;
8347 }
8348 args = PyObject_GetItem(dict, key);
8349 Py_DECREF(key);
8350 if (args == NULL) {
8351 goto onError;
8352 }
8353 args_owned = 1;
8354 arglen = -1;
8355 argidx = -2;
8356 }
8357 while (--fmtcnt >= 0) {
8358 switch (c = *fmt++) {
8359 case '-': flags |= F_LJUST; continue;
8360 case '+': flags |= F_SIGN; continue;
8361 case ' ': flags |= F_BLANK; continue;
8362 case '#': flags |= F_ALT; continue;
8363 case '0': flags |= F_ZERO; continue;
8364 }
8365 break;
8366 }
8367 if (c == '*') {
8368 v = getnextarg(args, arglen, &argidx);
8369 if (v == NULL)
8370 goto onError;
8371 if (!PyInt_Check(v)) {
8372 PyErr_SetString(PyExc_TypeError,
8373 "* wants int");
8374 goto onError;
8375 }
8376 width = PyInt_AsLong(v);
8377 if (width < 0) {
8378 flags |= F_LJUST;
8379 width = -width;
8380 }
8381 if (--fmtcnt >= 0)
8382 c = *fmt++;
8383 }
8384 else if (c >= '0' && c <= '9') {
8385 width = c - '0';
8386 while (--fmtcnt >= 0) {
8387 c = *fmt++;
8388 if (c < '0' || c > '9')
8389 break;
8390 if ((width*10) / 10 != width) {
8391 PyErr_SetString(PyExc_ValueError,
8392 "width too big");
8393 goto onError;
8394 }
8395 width = width*10 + (c - '0');
8396 }
8397 }
8398 if (c == '.') {
8399 prec = 0;
8400 if (--fmtcnt >= 0)
8401 c = *fmt++;
8402 if (c == '*') {
8403 v = getnextarg(args, arglen, &argidx);
8404 if (v == NULL)
8405 goto onError;
8406 if (!PyInt_Check(v)) {
8407 PyErr_SetString(PyExc_TypeError,
8408 "* wants int");
8409 goto onError;
8410 }
8411 prec = PyInt_AsLong(v);
8412 if (prec < 0)
8413 prec = 0;
8414 if (--fmtcnt >= 0)
8415 c = *fmt++;
8416 }
8417 else if (c >= '0' && c <= '9') {
8418 prec = c - '0';
8419 while (--fmtcnt >= 0) {
8420 c = Py_CHARMASK(*fmt++);
8421 if (c < '0' || c > '9')
8422 break;
8423 if ((prec*10) / 10 != prec) {
8424 PyErr_SetString(PyExc_ValueError,
8425 "prec too big");
8426 goto onError;
8427 }
8428 prec = prec*10 + (c - '0');
8429 }
8430 }
8431 } /* prec */
8432 if (fmtcnt >= 0) {
8433 if (c == 'h' || c == 'l' || c == 'L') {
8434 if (--fmtcnt >= 0)
8435 c = *fmt++;
8436 }
8437 }
8438 if (fmtcnt < 0) {
8439 PyErr_SetString(PyExc_ValueError,
8440 "incomplete format");
8441 goto onError;
8442 }
8443 if (c != '%') {
8444 v = getnextarg(args, arglen, &argidx);
8445 if (v == NULL)
8446 goto onError;
8447 }
8448 sign = 0;
8449 fill = ' ';
8450 switch (c) {
8451
8452 case '%':
8453 pbuf = formatbuf;
8454 /* presume that buffer length is at least 1 */
8455 pbuf[0] = '%';
8456 len = 1;
8457 break;
8458
8459 case 's':
8460 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008461 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008462 temp = v;
8463 Py_INCREF(temp);
8464 }
8465 else {
8466 PyObject *unicode;
8467 if (c == 's')
8468 temp = PyObject_Unicode(v);
8469 else
8470 temp = PyObject_Repr(v);
8471 if (temp == NULL)
8472 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008473 if (PyUnicode_Check(temp))
8474 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008475 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008476 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008477 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8478 PyString_GET_SIZE(temp),
8479 NULL,
8480 "strict");
8481 Py_DECREF(temp);
8482 temp = unicode;
8483 if (temp == NULL)
8484 goto onError;
8485 }
8486 else {
8487 Py_DECREF(temp);
8488 PyErr_SetString(PyExc_TypeError,
8489 "%s argument has non-string str()");
8490 goto onError;
8491 }
8492 }
8493 pbuf = PyUnicode_AS_UNICODE(temp);
8494 len = PyUnicode_GET_SIZE(temp);
8495 if (prec >= 0 && len > prec)
8496 len = prec;
8497 break;
8498
8499 case 'i':
8500 case 'd':
8501 case 'u':
8502 case 'o':
8503 case 'x':
8504 case 'X':
8505 if (c == 'i')
8506 c = 'd';
8507 isnumok = 0;
8508 if (PyNumber_Check(v)) {
8509 PyObject *iobj=NULL;
8510
8511 if (PyInt_Check(v) || (PyLong_Check(v))) {
8512 iobj = v;
8513 Py_INCREF(iobj);
8514 }
8515 else {
8516 iobj = PyNumber_Int(v);
8517 if (iobj==NULL) iobj = PyNumber_Long(v);
8518 }
8519 if (iobj!=NULL) {
8520 if (PyInt_Check(iobj)) {
8521 isnumok = 1;
8522 pbuf = formatbuf;
8523 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8524 flags, prec, c, iobj);
8525 Py_DECREF(iobj);
8526 if (len < 0)
8527 goto onError;
8528 sign = 1;
8529 }
8530 else if (PyLong_Check(iobj)) {
8531 isnumok = 1;
8532 temp = formatlong(iobj, flags, prec, c);
8533 Py_DECREF(iobj);
8534 if (!temp)
8535 goto onError;
8536 pbuf = PyUnicode_AS_UNICODE(temp);
8537 len = PyUnicode_GET_SIZE(temp);
8538 sign = 1;
8539 }
8540 else {
8541 Py_DECREF(iobj);
8542 }
8543 }
8544 }
8545 if (!isnumok) {
8546 PyErr_Format(PyExc_TypeError,
8547 "%%%c format: a number is required, "
8548 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8549 goto onError;
8550 }
8551 if (flags & F_ZERO)
8552 fill = '0';
8553 break;
8554
8555 case 'e':
8556 case 'E':
8557 case 'f':
8558 case 'F':
8559 case 'g':
8560 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008561 temp = formatfloat(v, flags, prec, c);
8562 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008563 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008564 pbuf = PyUnicode_AS_UNICODE(temp);
8565 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008566 sign = 1;
8567 if (flags & F_ZERO)
8568 fill = '0';
8569 break;
8570
8571 case 'c':
8572 pbuf = formatbuf;
8573 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8574 if (len < 0)
8575 goto onError;
8576 break;
8577
8578 default:
8579 PyErr_Format(PyExc_ValueError,
8580 "unsupported format character '%c' (0x%x) "
8581 "at index %zd",
8582 (31<=c && c<=126) ? (char)c : '?',
8583 (int)c,
8584 (Py_ssize_t)(fmt - 1 -
8585 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008586 goto onError;
8587 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008588 if (sign) {
8589 if (*pbuf == '-' || *pbuf == '+') {
8590 sign = *pbuf++;
8591 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008592 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008593 else if (flags & F_SIGN)
8594 sign = '+';
8595 else if (flags & F_BLANK)
8596 sign = ' ';
8597 else
8598 sign = 0;
8599 }
8600 if (width < len)
8601 width = len;
8602 if (rescnt - (sign != 0) < width) {
8603 reslen -= rescnt;
8604 rescnt = width + fmtcnt + 100;
8605 reslen += rescnt;
8606 if (reslen < 0) {
8607 Py_XDECREF(temp);
8608 PyErr_NoMemory();
8609 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008610 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008611 if (_PyUnicode_Resize(&result, reslen) < 0) {
8612 Py_XDECREF(temp);
8613 goto onError;
8614 }
8615 res = PyUnicode_AS_UNICODE(result)
8616 + reslen - rescnt;
8617 }
8618 if (sign) {
8619 if (fill != ' ')
8620 *res++ = sign;
8621 rescnt--;
8622 if (width > len)
8623 width--;
8624 }
8625 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8626 assert(pbuf[0] == '0');
8627 assert(pbuf[1] == c);
8628 if (fill != ' ') {
8629 *res++ = *pbuf++;
8630 *res++ = *pbuf++;
8631 }
8632 rescnt -= 2;
8633 width -= 2;
8634 if (width < 0)
8635 width = 0;
8636 len -= 2;
8637 }
8638 if (width > len && !(flags & F_LJUST)) {
8639 do {
8640 --rescnt;
8641 *res++ = fill;
8642 } while (--width > len);
8643 }
8644 if (fill == ' ') {
8645 if (sign)
8646 *res++ = sign;
8647 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8648 assert(pbuf[0] == '0');
8649 assert(pbuf[1] == c);
8650 *res++ = *pbuf++;
8651 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008652 }
8653 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008654 Py_UNICODE_COPY(res, pbuf, len);
8655 res += len;
8656 rescnt -= len;
8657 while (--width >= len) {
8658 --rescnt;
8659 *res++ = ' ';
8660 }
8661 if (dict && (argidx < arglen) && c != '%') {
8662 PyErr_SetString(PyExc_TypeError,
8663 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008664 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008665 goto onError;
8666 }
8667 Py_XDECREF(temp);
8668 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 } /* until end */
8670 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008671 PyErr_SetString(PyExc_TypeError,
8672 "not all arguments converted during string formatting");
8673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 }
8675
Thomas Woutersa96affe2006-03-12 00:29:36 +00008676 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008679 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 }
8681 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 return (PyObject *)result;
8683
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008684 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 Py_XDECREF(result);
8686 Py_DECREF(uformat);
8687 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008688 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 }
8690 return NULL;
8691}
8692
8693static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 (readbufferproc) unicode_buffer_getreadbuf,
8695 (writebufferproc) unicode_buffer_getwritebuf,
8696 (segcountproc) unicode_buffer_getsegcount,
8697 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698};
8699
Jeremy Hylton938ace62002-07-17 16:30:39 +00008700static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008701unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8702
Tim Peters6d6c1a32001-08-02 04:15:00 +00008703static PyObject *
8704unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8705{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008706 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008707 static char *kwlist[] = {"string", "encoding", "errors", 0};
8708 char *encoding = NULL;
8709 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008710
Benjamin Peterson857ce152009-01-31 16:29:18 +00008711 if (type != &PyUnicode_Type)
8712 return unicode_subtype_new(type, args, kwds);
8713 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008714 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008715 return NULL;
8716 if (x == NULL)
8717 return (PyObject *)_PyUnicode_New(0);
8718 if (encoding == NULL && errors == NULL)
8719 return PyObject_Unicode(x);
8720 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008721 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008722}
8723
Guido van Rossume023fe02001-08-30 03:12:59 +00008724static PyObject *
8725unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8726{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008727 PyUnicodeObject *tmp, *pnew;
8728 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008729
Benjamin Peterson857ce152009-01-31 16:29:18 +00008730 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8731 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8732 if (tmp == NULL)
8733 return NULL;
8734 assert(PyUnicode_Check(tmp));
8735 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8736 if (pnew == NULL) {
8737 Py_DECREF(tmp);
8738 return NULL;
8739 }
8740 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8741 if (pnew->str == NULL) {
8742 _Py_ForgetReference((PyObject *)pnew);
8743 PyObject_Del(pnew);
8744 Py_DECREF(tmp);
8745 return PyErr_NoMemory();
8746 }
8747 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8748 pnew->length = n;
8749 pnew->hash = tmp->hash;
8750 Py_DECREF(tmp);
8751 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008752}
8753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008754PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008755 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008756\n\
8757Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008758encoding defaults to the current default string encoding.\n\
8759errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008760
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008762 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008763 "unicode", /* tp_name */
8764 sizeof(PyUnicodeObject), /* tp_size */
8765 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008767 (destructor)unicode_dealloc, /* tp_dealloc */
8768 0, /* tp_print */
8769 0, /* tp_getattr */
8770 0, /* tp_setattr */
8771 0, /* tp_compare */
8772 unicode_repr, /* tp_repr */
8773 &unicode_as_number, /* tp_as_number */
8774 &unicode_as_sequence, /* tp_as_sequence */
8775 &unicode_as_mapping, /* tp_as_mapping */
8776 (hashfunc) unicode_hash, /* tp_hash*/
8777 0, /* tp_call*/
8778 (reprfunc) unicode_str, /* tp_str */
8779 PyObject_GenericGetAttr, /* tp_getattro */
8780 0, /* tp_setattro */
8781 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008782 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008783 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008784 unicode_doc, /* tp_doc */
8785 0, /* tp_traverse */
8786 0, /* tp_clear */
8787 PyUnicode_RichCompare, /* tp_richcompare */
8788 0, /* tp_weaklistoffset */
8789 0, /* tp_iter */
8790 0, /* tp_iternext */
8791 unicode_methods, /* tp_methods */
8792 0, /* tp_members */
8793 0, /* tp_getset */
8794 &PyBaseString_Type, /* tp_base */
8795 0, /* tp_dict */
8796 0, /* tp_descr_get */
8797 0, /* tp_descr_set */
8798 0, /* tp_dictoffset */
8799 0, /* tp_init */
8800 0, /* tp_alloc */
8801 unicode_new, /* tp_new */
8802 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803};
8804
8805/* Initialize the Unicode implementation */
8806
Thomas Wouters78890102000-07-22 19:25:51 +00008807void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008809 int i;
8810
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008811 /* XXX - move this array to unicodectype.c ? */
8812 Py_UNICODE linebreak[] = {
8813 0x000A, /* LINE FEED */
8814 0x000D, /* CARRIAGE RETURN */
8815 0x001C, /* FILE SEPARATOR */
8816 0x001D, /* GROUP SEPARATOR */
8817 0x001E, /* RECORD SEPARATOR */
8818 0x0085, /* NEXT LINE */
8819 0x2028, /* LINE SEPARATOR */
8820 0x2029, /* PARAGRAPH SEPARATOR */
8821 };
8822
Fred Drakee4315f52000-05-09 19:53:39 +00008823 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008824 free_list = NULL;
8825 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008827 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008828 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008829
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008830 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008831 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008832 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008833 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008834 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008835
8836 /* initialize the linebreak bloom filter */
8837 bloom_linebreak = make_bloom_mask(
8838 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8839 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008840
8841 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842}
8843
8844/* Finalize the Unicode implementation */
8845
Christian Heimes3b718a72008-02-14 12:47:33 +00008846int
8847PyUnicode_ClearFreeList(void)
8848{
8849 int freelist_size = numfree;
8850 PyUnicodeObject *u;
8851
8852 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008853 PyUnicodeObject *v = u;
8854 u = *(PyUnicodeObject **)u;
8855 if (v->str)
8856 PyObject_DEL(v->str);
8857 Py_XDECREF(v->defenc);
8858 PyObject_Del(v);
8859 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008860 }
8861 free_list = NULL;
8862 assert(numfree == 0);
8863 return freelist_size;
8864}
8865
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866void
Thomas Wouters78890102000-07-22 19:25:51 +00008867_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008869 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008871 Py_XDECREF(unicode_empty);
8872 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008873
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008874 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008875 if (unicode_latin1[i]) {
8876 Py_DECREF(unicode_latin1[i]);
8877 unicode_latin1[i] = NULL;
8878 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008879 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008880 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008882
Anthony Baxterac6bd462006-04-13 02:06:09 +00008883#ifdef __cplusplus
8884}
8885#endif