blob: ae26ab60dceb21d71c90671fdb062e19cef58a01 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of dead Unicode objects that are parked on
   the free list for later reuse. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "keep alive" optimization.

   When an object is moved to the free list, its character buffer is
   kept allocated only if the string length is below this limit; the
   buffers of longer strings are released.  This saves malloc() calls
   for small Unicode objects.

   In the worst case this leaves about PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is flagged as an experimental feature.  If you get core
   dumps when using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Byte order selection: big endian only when the build configuration
   defines WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#endif
#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   The table has one entry per 7-bit code point; an entry is 1 when the
   code point counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Fast detection of the ASCII line break characters.

   The table has one entry per 7-bit code point; an entry is 1 when the
   code point is a line break and 0 otherwise.  The table is only ever
   read, hence it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Membership test with a bloom pre-filter: the exact (linear) search in
   unicode_member() only runs when the mask says chr may be present.
   The expansion is wrapped in parentheses so that it behaves as a single
   operand when combined with !, || or other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000297 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000300 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Andrew Dalkee0df7622006-05-27 11:04:36 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
Neal Norwitze7d8be82008-07-31 17:17:14 +0000315 /* Ensure we won't overflow the size. */
316 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
317 return (PyUnicodeObject *)PyErr_NoMemory();
318 }
319
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000321 if (free_list) {
322 unicode = free_list;
323 free_list = *(PyUnicodeObject **)unicode;
324 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000325 if (unicode->str) {
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000329 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000330 PyObject_DEL(unicode->str);
331 unicode->str = NULL;
332 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000333 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000334 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000335 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
336 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000337 }
338 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 }
340 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000341 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000342 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 if (unicode == NULL)
344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000345 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 }
348
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000349 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000350 PyErr_NoMemory();
351 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000352 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000353 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
358 * that case.
359 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000360 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000362 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000367 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000378 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000380 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000381 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
382 PyObject_DEL(unicode->str);
383 unicode->str = NULL;
384 unicode->length = 0;
385 }
386 if (unicode->defenc) {
387 Py_DECREF(unicode->defenc);
388 unicode->defenc = NULL;
389 }
390 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000391 *(PyUnicodeObject **)unicode = free_list;
392 free_list = unicode;
393 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 }
395 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyObject_DEL(unicode->str);
397 Py_XDECREF(unicode->defenc);
398 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400}
401
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000402static
403int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404{
405 register PyUnicodeObject *v;
406
407 /* Argument checks */
408 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 PyErr_BadInternalCall();
410 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000412 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000413 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000414 PyErr_BadInternalCall();
415 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 }
417
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000421 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000422 (v == unicode_empty || v->length == 1)) {
423 PyUnicodeObject *w = _PyUnicode_New(length);
424 if (w == NULL)
425 return -1;
426 Py_UNICODE_COPY(w->str, v->str,
427 length < v->length ? length : v->length);
428 Py_DECREF(*unicode);
429 *unicode = w;
430 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 }
432
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v, length);
436}
437
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000438int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
439{
440 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
441}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
450 if (u != NULL) {
451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000452 /* Optimization for empty strings */
453 if (size == 0 && unicode_empty != NULL) {
454 Py_INCREF(unicode_empty);
455 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000456 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000457
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size == 1 && *u < 256) {
461 unicode = unicode_latin1[*u];
462 if (!unicode) {
463 unicode = _PyUnicode_New(1);
464 if (!unicode)
465 return NULL;
466 unicode->str[0] = *u;
467 unicode_latin1[*u] = unicode;
468 }
469 Py_INCREF(unicode);
470 return (PyObject *)unicode;
471 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 }
Tim Petersced69f82003-09-16 20:30:58 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 unicode = _PyUnicode_New(size);
475 if (!unicode)
476 return NULL;
477
478 /* Copy the Unicode data into the new object */
479 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000480 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481
482 return (PyObject *)unicode;
483}
484
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000485PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
486{
487 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000488
Benjamin Peterson857ce152009-01-31 16:29:18 +0000489 if (size < 0) {
490 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000491 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 return NULL;
493 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000494
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
499 if (u != NULL) {
500
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000501 /* Optimization for empty strings */
502 if (size == 0 && unicode_empty != NULL) {
503 Py_INCREF(unicode_empty);
504 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000505 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000506
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size == 1 && Py_CHARMASK(*u) < 128) {
510 unicode = unicode_latin1[Py_CHARMASK(*u)];
511 if (!unicode) {
512 unicode = _PyUnicode_New(1);
513 if (!unicode)
514 return NULL;
515 unicode->str[0] = Py_CHARMASK(*u);
516 unicode_latin1[Py_CHARMASK(*u)] = unicode;
517 }
518 Py_INCREF(unicode);
519 return (PyObject *)unicode;
520 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000521
522 return PyUnicode_DecodeUTF8(u, size, NULL);
523 }
524
525 unicode = _PyUnicode_New(size);
526 if (!unicode)
527 return NULL;
528
529 return (PyObject *)unicode;
530}
531
532PyObject *PyUnicode_FromString(const char *u)
533{
534 size_t size = strlen(u);
535 if (size > PY_SSIZE_T_MAX) {
536 PyErr_SetString(PyExc_OverflowError, "input too long");
537 return NULL;
538 }
539
540 return PyUnicode_FromStringAndSize(u, size);
541}
542
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543#ifdef HAVE_WCHAR_H
544
Mark Dickinson6b265f12009-03-18 16:07:26 +0000545#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546# define CONVERT_WCHAR_TO_SURROGATES
547#endif
548
549#ifdef CONVERT_WCHAR_TO_SURROGATES
550
551/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
553
554PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
555 Py_ssize_t size)
556{
557 PyUnicodeObject *unicode;
558 register Py_ssize_t i;
559 Py_ssize_t alloc;
560 const wchar_t *orig_w;
561
562 if (w == NULL) {
563 PyErr_BadInternalCall();
564 return NULL;
565 }
566
567 alloc = size;
568 orig_w = w;
569 for (i = size; i > 0; i--) {
570 if (*w > 0xFFFF)
571 alloc++;
572 w++;
573 }
574 w = orig_w;
575 unicode = _PyUnicode_New(alloc);
576 if (!unicode)
577 return NULL;
578
579 /* Copy the wchar_t data into the new object */
580 {
581 register Py_UNICODE *u;
582 u = PyUnicode_AS_UNICODE(unicode);
583 for (i = size; i > 0; i--) {
584 if (*w > 0xFFFF) {
585 wchar_t ordinal = *w++;
586 ordinal -= 0x10000;
587 *u++ = 0xD800 | (ordinal >> 10);
588 *u++ = 0xDC00 | (ordinal & 0x3FF);
589 }
590 else
591 *u++ = *w++;
592 }
593 }
594 return (PyObject *)unicode;
595}
596
597#else
598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000600 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601{
602 PyUnicodeObject *unicode;
603
604 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 PyErr_BadInternalCall();
606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 }
608
609 unicode = _PyUnicode_New(size);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614#ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000616#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000618 register Py_UNICODE *u;
619 register Py_ssize_t i;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--)
622 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 }
624#endif
625
626 return (PyObject *)unicode;
627}
628
Mark Dickinson6b265f12009-03-18 16:07:26 +0000629#endif /* CONVERT_WCHAR_TO_SURROGATES */
630
631#undef CONVERT_WCHAR_TO_SURROGATES
632
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000633static void
634makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
635{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000636 *fmt++ = '%';
637 if (width) {
638 if (zeropad)
639 *fmt++ = '0';
640 fmt += sprintf(fmt, "%d", width);
641 }
642 if (precision)
643 fmt += sprintf(fmt, ".%d", precision);
644 if (longflag)
645 *fmt++ = 'l';
646 else if (size_tflag) {
647 char *f = PY_FORMAT_SIZE_T;
648 while (*f)
649 *fmt++ = *f++;
650 }
651 *fmt++ = c;
652 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000653}
654
/* Copy the NUL-terminated string to the output position s and advance s
   past the copied characters (the terminator is not copied).  Relies on
   the variables `copy` and `s` being declared in the expanding scope. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
656
657PyObject *
658PyUnicode_FromFormatV(const char *format, va_list vargs)
659{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000660 va_list count;
661 Py_ssize_t callcount = 0;
662 PyObject **callresults = NULL;
663 PyObject **callresult = NULL;
664 Py_ssize_t n = 0;
665 int width = 0;
666 int precision = 0;
667 int zeropad;
668 const char* f;
669 Py_UNICODE *s;
670 PyObject *string;
671 /* used by sprintf */
672 char buffer[21];
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer = NULL;
676 char *realbuffer;
677 Py_ssize_t abuffersize = 0;
678 char fmt[60]; /* should be enough for %0width.precisionld */
679 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000680
681#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000682 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000683#else
684#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000687 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000688#endif
689#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000694 if (*f == '%') {
695 if (*(f+1)=='%')
696 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000697 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000698 ++callcount;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
703 if (*f == 's')
704 ++callcount;
705 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000706 }
707 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000709 if (callcount) {
710 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
711 if (!callresults) {
712 PyErr_NoMemory();
713 return NULL;
714 }
715 callresult = callresults;
716 }
717 /* step 3: figure out how large a buffer we need */
718 for (f = format; *f; f++) {
719 if (*f == '%') {
720 const char* p = f;
721 width = 0;
722 while (isdigit((unsigned)*f))
723 width = (width*10) + *f++ - '0';
724 while (*++f && *f != '%' && !isalpha((unsigned)*f))
725 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000726
Benjamin Peterson857ce152009-01-31 16:29:18 +0000727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
729 */
730 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000731 (f[1] == 'd' || f[1] == 'u'))
732 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000733
Benjamin Peterson857ce152009-01-31 16:29:18 +0000734 switch (*f) {
735 case 'c':
736 (void)va_arg(count, int);
737 /* fall through... */
738 case '%':
739 n++;
740 break;
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
748 if (width < 20)
749 width = 20;
750 n += width;
751 if (abuffersize < width)
752 abuffersize = width;
753 break;
754 case 's':
755 {
756 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000757 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000758 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
759 if (!str)
760 goto fail;
761 n += PyUnicode_GET_SIZE(str);
762 /* Remember the str and switch to the next slot */
763 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000764 break;
765 }
766 case 'U':
767 {
768 PyObject *obj = va_arg(count, PyObject *);
769 assert(obj && PyUnicode_Check(obj));
770 n += PyUnicode_GET_SIZE(obj);
771 break;
772 }
773 case 'V':
774 {
775 PyObject *obj = va_arg(count, PyObject *);
776 const char *str = va_arg(count, const char *);
777 assert(obj || str);
778 assert(!obj || PyUnicode_Check(obj));
779 if (obj)
780 n += PyUnicode_GET_SIZE(obj);
781 else
782 n += strlen(str);
783 break;
784 }
785 case 'S':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *str;
789 assert(obj);
790 str = PyObject_Str(obj);
791 if (!str)
792 goto fail;
793 n += PyUnicode_GET_SIZE(str);
794 /* Remember the str and switch to the next slot */
795 *callresult++ = str;
796 break;
797 }
798 case 'R':
799 {
800 PyObject *obj = va_arg(count, PyObject *);
801 PyObject *repr;
802 assert(obj);
803 repr = PyObject_Repr(obj);
804 if (!repr)
805 goto fail;
806 n += PyUnicode_GET_SIZE(repr);
807 /* Remember the repr and switch to the next slot */
808 *callresult++ = repr;
809 break;
810 }
811 case 'p':
812 (void) va_arg(count, int);
813 /* maximum 64-bit pointer representation:
814 * 0xffffffffffffffff
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
817 */
818 n += 19;
819 break;
820 default:
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
827 n += strlen(p);
828 goto expand;
829 }
830 } else
831 n++;
832 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000833 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000834 if (abuffersize > 20) {
835 abuffer = PyObject_Malloc(abuffersize);
836 if (!abuffer) {
837 PyErr_NoMemory();
838 goto fail;
839 }
840 realbuffer = abuffer;
841 }
842 else
843 realbuffer = buffer;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string = PyUnicode_FromUnicode(NULL, n);
849 if (!string)
850 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000851
Benjamin Peterson857ce152009-01-31 16:29:18 +0000852 s = PyUnicode_AS_UNICODE(string);
853 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000854
Benjamin Peterson857ce152009-01-31 16:29:18 +0000855 for (f = format; *f; f++) {
856 if (*f == '%') {
857 const char* p = f++;
858 int longflag = 0;
859 int size_tflag = 0;
860 zeropad = (*f == '0');
861 /* parse the width.precision part */
862 width = 0;
863 while (isdigit((unsigned)*f))
864 width = (width*10) + *f++ - '0';
865 precision = 0;
866 if (*f == '.') {
867 f++;
868 while (isdigit((unsigned)*f))
869 precision = (precision*10) + *f++ - '0';
870 }
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
874 longflag = 1;
875 ++f;
876 }
877 /* handle the size_t flag. */
878 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
879 size_tflag = 1;
880 ++f;
881 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000882
Benjamin Peterson857ce152009-01-31 16:29:18 +0000883 switch (*f) {
884 case 'c':
885 *s++ = va_arg(vargs, int);
886 break;
887 case 'd':
888 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
889 if (longflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, long));
891 else if (size_tflag)
892 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
893 else
894 sprintf(realbuffer, fmt, va_arg(vargs, int));
895 appendstring(realbuffer);
896 break;
897 case 'u':
898 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
899 if (longflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
901 else if (size_tflag)
902 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
903 else
904 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
905 appendstring(realbuffer);
906 break;
907 case 'i':
908 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
909 sprintf(realbuffer, fmt, va_arg(vargs, int));
910 appendstring(realbuffer);
911 break;
912 case 'x':
913 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
914 sprintf(realbuffer, fmt, va_arg(vargs, int));
915 appendstring(realbuffer);
916 break;
917 case 's':
918 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000919 /* unused, since we already have the result */
920 (void) va_arg(vargs, char *);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
922 PyUnicode_GET_SIZE(*callresult));
923 s += PyUnicode_GET_SIZE(*callresult);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult);
926 /* switch to next unicode()/repr() result */
927 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000928 break;
929 }
930 case 'U':
931 {
932 PyObject *obj = va_arg(vargs, PyObject *);
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 break;
937 }
938 case 'V':
939 {
940 PyObject *obj = va_arg(vargs, PyObject *);
941 const char *str = va_arg(vargs, const char *);
942 if (obj) {
943 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
944 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
945 s += size;
946 } else {
947 appendstring(str);
948 }
949 break;
950 }
951 case 'S':
952 case 'R':
953 {
954 Py_UNICODE *ucopy;
955 Py_ssize_t usize;
956 Py_ssize_t upos;
957 /* unused, since we already have the result */
958 (void) va_arg(vargs, PyObject *);
959 ucopy = PyUnicode_AS_UNICODE(*callresult);
960 usize = PyUnicode_GET_SIZE(*callresult);
961 for (upos = 0; upos<usize;)
962 *s++ = ucopy[upos++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult);
965 /* switch to next unicode()/repr() result */
966 ++callresult;
967 break;
968 }
969 case 'p':
970 sprintf(buffer, "%p", va_arg(vargs, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer[1] == 'X')
973 buffer[1] = 'x';
974 else if (buffer[1] != 'x') {
975 memmove(buffer+2, buffer, strlen(buffer)+1);
976 buffer[0] = '0';
977 buffer[1] = 'x';
978 }
979 appendstring(buffer);
980 break;
981 case '%':
982 *s++ = '%';
983 break;
984 default:
985 appendstring(p);
986 goto end;
987 }
988 } else
989 *s++ = *f;
990 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000992 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000993 if (callresults)
994 PyObject_Free(callresults);
995 if (abuffer)
996 PyObject_Free(abuffer);
997 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
998 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000999 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001000 if (callresults) {
1001 PyObject **callresult2 = callresults;
1002 while (callresult2 < callresult) {
1003 Py_DECREF(*callresult2);
1004 ++callresult2;
1005 }
1006 PyObject_Free(callresults);
1007 }
1008 if (abuffer)
1009 PyObject_Free(abuffer);
1010 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001011}
1012
1013#undef appendstring
1014
1015PyObject *
1016PyUnicode_FromFormat(const char *format, ...)
1017{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001018 PyObject* ret;
1019 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001020
1021#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001025#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001026 ret = PyUnicode_FromFormatV(format, vargs);
1027 va_end(vargs);
1028 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001029}
1030
Martin v. Löwis18e16552006-02-15 17:27:45 +00001031Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001032 wchar_t *w,
1033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyErr_BadInternalCall();
1037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039
1040 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044#ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w, unicode->str, size * sizeof(wchar_t));
1046#else
1047 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001048 register Py_UNICODE *u;
1049 register Py_ssize_t i;
1050 u = PyUnicode_AS_UNICODE(unicode);
1051 for (i = size; i > 0; i--)
1052 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 }
1054#endif
1055
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001056 if (size > PyUnicode_GET_SIZE(unicode))
1057 return PyUnicode_GET_SIZE(unicode);
1058 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060}
1061
1062#endif
1063
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064PyObject *PyUnicode_FromOrdinal(int ordinal)
1065{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001066 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067
1068#ifdef Py_UNICODE_WIDE
1069 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 PyErr_SetString(PyExc_ValueError,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1073 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001074 }
1075#else
1076 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001077 PyErr_SetString(PyExc_ValueError,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1080 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001081 }
1082#endif
1083
Hye-Shik Chang40574832004-04-06 07:24:51 +00001084 s[0] = (Py_UNICODE)ordinal;
1085 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001086}
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_FromObject(register PyObject *obj)
1089{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001092 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 Py_INCREF(obj);
1094 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 }
1096 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1100 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1103}
1104
1105PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001106 const char *encoding,
1107 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001110 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001112
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001114 PyErr_BadInternalCall();
1115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118#if 0
1119 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001120 that no encodings is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1122 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001123
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001126
1127 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 if (PyUnicode_Check(obj)) {
1129 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001130 PyErr_SetString(PyExc_TypeError,
1131 "decoding Unicode is not supported");
1132 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001134 return PyObject_Unicode(obj);
1135 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136#else
1137 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001138 PyErr_SetString(PyExc_TypeError,
1139 "decoding Unicode is not supported");
1140 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142#endif
1143
1144 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001145 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001146 s = PyString_AS_STRING(obj);
1147 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 }
Christian Heimes3497f942008-05-26 12:29:14 +00001149 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError,
1152 "decoding bytearray is not supported");
1153 return NULL;
1154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001155 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError))
1159 PyErr_Format(PyExc_TypeError,
1160 "coercing to Unicode: need string or buffer, "
1161 "%.80s found",
1162 Py_TYPE(obj)->tp_name);
1163 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001164 }
Tim Petersced69f82003-09-16 20:30:58 +00001165
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_INCREF(unicode_empty);
1169 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001172 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001173
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001174 return v;
1175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001176 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 Py_ssize_t size,
1182 const char *encoding,
1183 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184{
1185 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186
1187 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001188 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001189
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001193 else if (strcmp(encoding, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001199 else if (strcmp(encoding, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 /* Decode via the codec registry */
1203 buffer = PyBuffer_FromMemory((void *)s, size);
1204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001211 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245 return NULL;
1246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001249 Py_ssize_t size,
1250 const char *encoding,
1251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252{
1253 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 unicode = PyUnicode_FromUnicode(s, size);
1256 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1259 Py_DECREF(unicode);
1260 return v;
1261}
1262
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1264 const char *encoding,
1265 const char *errors)
1266{
1267 PyObject *v;
1268
1269 if (!PyUnicode_Check(unicode)) {
1270 PyErr_BadArgument();
1271 goto onError;
1272 }
1273
1274 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001276
1277 /* Encode via the codec registry */
1278 v = PyCodec_Encode(unicode, encoding, errors);
1279 if (v == NULL)
1280 goto onError;
1281 return v;
1282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001283 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001284 return NULL;
1285}
1286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
Fred Drakee4315f52000-05-09 19:53:39 +00001297
Tim Petersced69f82003-09-16 20:30:58 +00001298 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001300
1301 /* Shortcuts for common default encodings */
1302 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001303 if (strcmp(encoding, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode);
1305 else if (strcmp(encoding, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001307#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001308 else if (strcmp(encoding, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001319 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001321 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001322 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 Py_DECREF(v);
1324 goto onError;
1325 }
1326 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001327
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 return NULL;
1330}
1331
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001333 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001334{
1335 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1336
1337 if (v)
1338 return v;
1339 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1340 if (v && errors == NULL)
1341 ((PyUnicodeObject *)unicode)->defenc = v;
1342 return v;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1346{
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_BadArgument();
1349 goto onError;
1350 }
1351 return PyUnicode_AS_UNICODE(unicode);
1352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return NULL;
1355}
1356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358{
1359 if (!PyUnicode_Check(unicode)) {
1360 PyErr_BadArgument();
1361 goto onError;
1362 }
1363 return PyUnicode_GET_SIZE(unicode);
1364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 return -1;
1367}
1368
Thomas Wouters78890102000-07-22 19:25:51 +00001369const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001370{
1371 return unicode_default_encoding;
1372}
1373
1374int PyUnicode_SetDefaultEncoding(const char *encoding)
1375{
1376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001377
Fred Drakee4315f52000-05-09 19:53:39 +00001378 /* Make sure the encoding is valid. As side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v = _PyCodec_Lookup(encoding);
1381 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001382 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001383 Py_DECREF(v);
1384 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 encoding,
1386 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001387 return 0;
1388
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return -1;
1391}
1392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393/* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001395 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 and adjust various state variables.
1397 return 0 on success, -1 on error
1398*/
1399
1400static
1401int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001402 const char *encoding, const char *reason,
1403 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1404 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1405 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 PyObject *restuple = NULL;
1410 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1412 Py_ssize_t requiredsize;
1413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 int res = -1;
1417
1418 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001419 *errorHandler = PyCodec_LookupError(errors);
1420 if (*errorHandler == NULL)
1421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 }
1423
1424 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001425 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001426 encoding, input, insize, *startinpos, *endinpos, reason);
1427 if (*exceptionObject == NULL)
1428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 }
1430 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1434 goto onError;
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 }
1438
1439 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 }
1446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001450 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1452 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr = PyUnicode_AS_UNICODE(repunicode);
1460 repsize = PyUnicode_GET_SIZE(repunicode);
1461 requiredsize = *outpos + repsize + insize-newpos;
1462 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001463 if (requiredsize<2*outsize)
1464 requiredsize = 2*outsize;
1465 if (_PyUnicode_Resize(output, requiredsize) < 0)
1466 goto onError;
1467 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001468 }
1469 *endinpos = newpos;
1470 *inptr = input + newpos;
1471 Py_UNICODE_COPY(*outptr, repptr, repsize);
1472 *outptr += repsize;
1473 *outpos += repsize;
1474 /* we made it! */
1475 res = 0;
1476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_XDECREF(restuple);
1479 return res;
1480}
1481
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482/* --- UTF-7 Codec -------------------------------------------------------- */
1483
Antoine Pitrou653dece2009-05-04 18:32:32 +00001484/* See RFC2152 for details. We encode conservatively and decode liberally. */
1485
1486/* Three simple macros defining base-64. */
1487
/* Is c a base-64 character?

   Spelled out as explicit ASCII ranges instead of isalnum(): the
   <ctype.h> classifiers depend on the current locale and are undefined
   for arguments that are neither EOF nor representable as unsigned
   char, whereas the base-64 alphabet is fixed and this macro may be
   handed arbitrary character values.  The argument is evaluated more
   than once, so it must be free of side effects. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')
1492
/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62
   and anything else (i.e. '/') to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'        :    \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26   :    \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52   :    \
     (c) == '+'                 ? 62               :    \
                                  63)
1500
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the alphabet lookup. */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])
1505
/* DECODE_DIRECT: should this byte, met in a UTF-7 string outside a
 * base64 run, be decoded as itself?  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base64
 * sequence. */

#define DECODE_DIRECT(c)                    \
    ((c) < 0x80 && (c) != '+')
1513
1514/* The UTF-7 encoder treats ASCII characters differently according to
1515 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1516 * the above). See RFC2152. This array identifies these different
1517 * sets:
1518 * 0 : "Set D"
1519 * alphanumeric and '(),-./:?
1520 * 1 : "Set O"
1521 * !"#$%&*;<=>@[]^_`{|}
1522 * 2 : "whitespace"
1523 * ht nl cr sp
1524 * 3 : special (must be base64 encoded)
1525 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1526 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001527
/* UTF-7 encoder category of each ASCII code point, indexed by the code
 * point itself: 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be
 * base-64 encoded).  The table is only ever read, so it is const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1547
Antoine Pitrou653dece2009-05-04 18:32:32 +00001548/* ENCODE_DIRECT: this character should be encoded as itself. The
1549 * answer depends on whether we are encoding set O as itself, and also
1550 * on whether we are encoding whitespace as itself. RFC2152 makes it
1551 * clear that the answers to these questions vary between
1552 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001553
/* True when character c may be written to the UTF-7 output as itself.
 *   c        - character value; only 1..127 can ever be direct
 *   directO  - non-zero if Set O characters (category 1) are written directly
 *   directWS - non-zero if whitespace (category 2) is written directly
 * Set D characters (category 0) are always direct.  The flag arguments are
 * parenthesized so that an expression such as `a || b` can be passed
 * safely.  c is evaluated several times, so it must have no side effects. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001561 Py_ssize_t size,
1562 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001564 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1565}
1566
Antoine Pitrou653dece2009-05-04 18:32:32 +00001567/* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1572 * surrogate). */
1573
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001574PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001575 Py_ssize_t size,
1576 const char *errors,
1577 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t startinpos;
1581 Py_ssize_t endinpos;
1582 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *e;
1584 PyUnicodeObject *unicode;
1585 Py_UNICODE *p;
1586 const char *errmsg = "";
1587 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001588 Py_UNICODE *shiftOutStart;
1589 unsigned int base64bits = 0;
1590 unsigned long base64buffer = 0;
1591 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001592 PyObject *errorHandler = NULL;
1593 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 unicode = _PyUnicode_New(size);
1596 if (!unicode)
1597 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001598 if (size == 0) {
1599 if (consumed)
1600 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001602 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001605 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 e = s + size;
1607
1608 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610
Antoine Pitrou653dece2009-05-04 18:32:32 +00001611 if (inShift) { /* in a base-64 section */
1612 if (IS_BASE64(ch)) { /* consume a base-64 character */
1613 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1614 base64bits += 6;
1615 s++;
1616 if (base64bits >= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh = (Py_UNICODE)
1619 (base64buffer >> (base64bits-16));
1620 base64bits -= 16;
1621 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1622 if (surrogate) {
1623 /* expecting a second surrogate */
1624 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1625#ifdef Py_UNICODE_WIDE
1626 *p++ = (((surrogate & 0x3FF)<<10)
1627 | (outCh & 0x3FF)) + 0x10000;
1628#else
1629 *p++ = surrogate;
1630 *p++ = outCh;
1631#endif
1632 surrogate = 0;
1633 }
1634 else {
1635 surrogate = 0;
1636 errmsg = "second surrogate missing";
1637 goto utf7Error;
1638 }
1639 }
1640 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1641 /* first surrogate */
1642 surrogate = outCh;
1643 }
1644 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1645 errmsg = "unexpected second surrogate";
1646 goto utf7Error;
1647 }
1648 else {
1649 *p++ = outCh;
1650 }
1651 }
1652 }
1653 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 inShift = 0;
1655 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001656 if (surrogate) {
1657 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 if (base64bits > 0) { /* left-over bits */
1661 if (base64bits >= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg = "partial character in shift sequence";
1664 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001666 else {
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer != 0) {
1669 errmsg = "non-zero padding bits in shift sequence";
1670 goto utf7Error;
1671 }
1672 }
1673 }
1674 if (ch != '-') {
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 *p++ = ch;
1678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
1681 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001683 s++; /* consume '+' */
1684 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 s++;
1686 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001687 }
1688 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001690 shiftOutStart = p;
1691 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 }
1693 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001694 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 *p++ = ch;
1696 s++;
1697 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001698 else {
1699 startinpos = s-starts;
1700 s++;
1701 errmsg = "unexpected special character";
1702 goto utf7Error;
1703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001704 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001705utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 outpos = p-PyUnicode_AS_UNICODE(unicode);
1707 endinpos = s-starts;
1708 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001709 errors, &errorHandler,
1710 "utf7", errmsg,
1711 starts, size, &startinpos, &endinpos, &exc, &s,
1712 &unicode, &outpos, &p))
1713 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 }
1715
Antoine Pitrou653dece2009-05-04 18:32:32 +00001716 /* end of string */
1717
1718 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1720 if (surrogate ||
1721 (base64bits >= 6) ||
1722 (base64bits > 0 && base64buffer != 0)) {
1723 outpos = p-PyUnicode_AS_UNICODE(unicode);
1724 endinpos = size;
1725 if (unicode_decode_call_errorhandler(
1726 errors, &errorHandler,
1727 "utf7", "unterminated shift sequence",
1728 starts, size, &startinpos, &endinpos, &exc, &s,
1729 &unicode, &outpos, &p))
1730 goto onError;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733
1734 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 if (inShift) {
1737 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001738 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001739 }
1740 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001742 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001743 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 goto onError;
1747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_XDECREF(errorHandler);
1749 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 return (PyObject *)unicode;
1751
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(errorHandler);
1754 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 Py_DECREF(unicode);
1756 return NULL;
1757}
1758
1759
1760PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001761 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001762 int base64SetO,
1763 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765{
1766 PyObject *v;
1767 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001768 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001769 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001771 unsigned int base64bits = 0;
1772 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 char * out;
1774 char * start;
1775
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001776 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001777 return PyErr_NoMemory();
1778
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001780 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781
Antoine Pitrou653dece2009-05-04 18:32:32 +00001782 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001783 if (v == NULL)
1784 return NULL;
1785
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001786 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 for (;i < size; ++i) {
1788 Py_UNICODE ch = s[i];
1789
Antoine Pitrou653dece2009-05-04 18:32:32 +00001790 if (inShift) {
1791 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1792 /* shifting out */
1793 if (base64bits) { /* output remaining bits */
1794 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1795 base64buffer = 0;
1796 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
1798 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch) || ch == '-') {
1802 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 *out++ = (char) ch;
1805 }
1806 else {
1807 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001810 else { /* not in a shift sequence */
1811 if (ch == '+') {
1812 *out++ = '+';
1813 *out++ = '-';
1814 }
1815 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1816 *out++ = (char) ch;
1817 }
1818 else {
1819 *out++ = '+';
1820 inShift = 1;
1821 goto encode_char;
1822 }
1823 }
1824 continue;
1825encode_char:
1826#ifdef Py_UNICODE_WIDE
1827 if (ch >= 0x10000) {
1828 /* code first surrogate */
1829 base64bits += 16;
1830 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1831 while (base64bits >= 6) {
1832 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1833 base64bits -= 6;
1834 }
1835 /* prepare second surrogate */
1836 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1837 }
1838#endif
1839 base64bits += 16;
1840 base64buffer = (base64buffer << 16) | ch;
1841 while (base64bits >= 6) {
1842 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1843 base64bits -= 6;
1844 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001845 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001846 if (base64bits)
1847 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1848 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001849 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001851 if (_PyString_Resize(&v, out - start))
1852 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 return v;
1854}
1855
Antoine Pitrou653dece2009-05-04 18:32:32 +00001856#undef IS_BASE64
1857#undef FROM_BASE64
1858#undef TO_BASE64
1859#undef DECODE_DIRECT
1860#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862/* --- UTF-8 Codec -------------------------------------------------------- */
1863
/* Map a UTF-8 encoded prefix byte to the length of the sequence it starts.
   Zero means illegal prefix.  See RFC 3629 for details.  The table is only
   ever read, so it is const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 Py_ssize_t size,
1888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Walter Dörwald69652032004-09-07 20:24:22 +00001890 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1891}
1892
1893PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 Py_ssize_t size,
1895 const char *errors,
1896 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001900 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t startinpos;
1902 Py_ssize_t endinpos;
1903 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 const char *e;
1905 PyUnicodeObject *unicode;
1906 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 PyObject *errorHandler = NULL;
1909 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
1911 /* Note: size will always be longer than the resulting Unicode
1912 character count */
1913 unicode = _PyUnicode_New(size);
1914 if (!unicode)
1915 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 if (size == 0) {
1917 if (consumed)
1918 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921
1922 /* Unpack UTF-8 encoded data */
1923 p = unicode->str;
1924 e = s + size;
1925
1926 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 s++;
1932 continue;
1933 }
1934
1935 n = utf8_code_length[ch];
1936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 if (consumed)
1939 break;
1940 else {
1941 errmsg = "unexpected end of data";
1942 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001943 endinpos = startinpos+1;
1944 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001946 goto utf8Error;
1947 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
1950 switch (n) {
1951
1952 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001953 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001960 startinpos = s-starts;
1961 endinpos = startinpos+1;
1962 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001968 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001972 assert ((ch > 0x007F) && (ch <= 0x07FF));
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001982 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001983 (s[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s[0] == 0xE0 &&
1985 (unsigned char)s[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001990 endinpos = startinpos + 1;
1991
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1995 be incremented. */
1996 if ((s[1] & 0xC0) == 0x80)
1997 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001998 goto utf8Error;
1999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00002001 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002003 break;
2004
2005 case 4:
2006 if ((s[1] & 0xc0) != 0x80 ||
2007 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002008 (s[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s[0] == 0xF0 &&
2010 (unsigned char)s[1] < 0x90) ||
2011 ((unsigned char)s[0] == 0xF4 &&
2012 (unsigned char)s[1] > 0x8F)) {
2013 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002015 endinpos = startinpos + 1;
2016 if ((s[1] & 0xC0) == 0x80) {
2017 endinpos++;
2018 if ((s[2] & 0xC0) == 0x80)
2019 endinpos++;
2020 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002021 goto utf8Error;
2022 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002023 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002024 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026
Fredrik Lundh8f455852001-06-27 18:59:43 +00002027#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002028 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* translate from 10000..10FFFF to 0..FFFF */
2033 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* high surrogate = top 10 bits added to D800 */
2036 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002038 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002039 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002040#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
2043 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002046 utf8Error:
2047 outpos = p-PyUnicode_AS_UNICODE(unicode);
2048 if (unicode_decode_call_errorhandler(
2049 errors, &errorHandler,
2050 "utf8", errmsg,
2051 starts, size, &startinpos, &endinpos, &exc, &s,
2052 &unicode, &outpos, &p))
2053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 }
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002056 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
2058 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002059 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 goto onError;
2061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(errorHandler);
2063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 return (PyObject *)unicode;
2065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002066 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_XDECREF(errorHandler);
2068 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_DECREF(unicode);
2070 return NULL;
2071}
2072
Tim Peters602f7402002-04-27 18:03:26 +00002073/* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002077*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002078PyObject *
2079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002080 Py_ssize_t size,
2081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082{
Tim Peters602f7402002-04-27 18:03:26 +00002083#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002084
Martin v. Löwis18e16552006-02-15 17:27:45 +00002085 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002086 PyObject *v; /* result string object */
2087 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002088 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002089 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002090 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 assert(s != NULL);
2093 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094
Tim Peters602f7402002-04-27 18:03:26 +00002095 if (size <= MAX_SHORT_UNICHARS) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2099 */
2100 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2101 v = NULL; /* will allocate after we're done */
2102 p = stackbuf;
2103 }
2104 else {
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated = size * 4;
2107 if (nallocated / 4 != size) /* overflow! */
2108 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002109 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002110 if (v == NULL)
2111 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002112 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002113 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002114
Tim Peters602f7402002-04-27 18:03:26 +00002115 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002117
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002119 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002124 *p++ = (char)(0xc0 | (ch >> 6));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002127 else {
Tim Peters602f7402002-04-27 18:03:26 +00002128 /* Encode UCS2 Unicode ordinals */
2129 if (ch < 0x10000) {
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2132 Py_UCS4 ch2 = s[i];
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002136 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002137 i++;
2138 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Tim Peters602f7402002-04-27 18:03:26 +00002140 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002141 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2144 *p++ = (char)(0x80 | (ch & 0x3f));
2145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002147 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002148 /* Encode UCS4 Unicode ordinals */
2149 *p++ = (char)(0xf0 | (ch >> 18));
2150 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2151 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2152 *p++ = (char)(0x80 | (ch & 0x3f));
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002155
Tim Peters602f7402002-04-27 18:03:26 +00002156 if (v == NULL) {
2157 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002158 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002160 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
2162 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002163 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002165 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002166 if (_PyString_Resize(&v, nneeded))
2167 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002170
Tim Peters602f7402002-04-27 18:03:26 +00002171#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172}
2173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2175{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 PyUnicode_GET_SIZE(unicode),
2182 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183}
2184
Walter Dörwald6e390802007-08-17 16:41:28 +00002185/* --- UTF-32 Codec ------------------------------------------------------- */
2186
2187PyObject *
2188PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002192{
2193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2194}
2195
2196PyObject *
2197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 int *byteorder,
2201 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002202{
2203 const char *starts = s;
2204 Py_ssize_t startinpos;
2205 Py_ssize_t endinpos;
2206 Py_ssize_t outpos;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002210 int pairs = 0;
Walter Dörwald6e390802007-08-17 16:41:28 +00002211#else
2212 const int pairs = 0;
2213#endif
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002214 const unsigned char *q, *e, *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002215 int bo = 0; /* assume native ordering by default */
2216 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002217 /* Offsets from q for retrieving bytes in the right order. */
2218#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2219 int iorder[] = {0, 1, 2, 3};
2220#else
2221 int iorder[] = {3, 2, 1, 0};
2222#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002223 PyObject *errorHandler = NULL;
2224 PyObject *exc = NULL;
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002225
Walter Dörwald6e390802007-08-17 16:41:28 +00002226 q = (unsigned char *)s;
2227 e = q + size;
2228
2229 if (byteorder)
2230 bo = *byteorder;
2231
2232 /* Check for BOM marks (U+FEFF) in the input and adjust current
2233 byte order setting accordingly. In native mode, the leading BOM
2234 mark is skipped, in all other modes, it is copied to the output
2235 stream as-is (giving a ZWNBSP character). */
2236 if (bo == 0) {
2237 if (size >= 4) {
2238 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002239 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002240#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002241 if (bom == 0x0000FEFF) {
2242 q += 4;
2243 bo = -1;
2244 }
2245 else if (bom == 0xFFFE0000) {
2246 q += 4;
2247 bo = 1;
2248 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002249#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002250 if (bom == 0x0000FEFF) {
2251 q += 4;
2252 bo = 1;
2253 }
2254 else if (bom == 0xFFFE0000) {
2255 q += 4;
2256 bo = -1;
2257 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002258#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002259 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002260 }
2261
2262 if (bo == -1) {
2263 /* force LE */
2264 iorder[0] = 0;
2265 iorder[1] = 1;
2266 iorder[2] = 2;
2267 iorder[3] = 3;
2268 }
2269 else if (bo == 1) {
2270 /* force BE */
2271 iorder[0] = 3;
2272 iorder[1] = 2;
2273 iorder[2] = 1;
2274 iorder[3] = 0;
2275 }
2276
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002277 /* On narrow builds we split characters outside the BMP into two
2278 codepoints => count how much extra space we need. */
2279#ifndef Py_UNICODE_WIDE
2280 for (qq = q; qq < e; qq += 4)
2281 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2282 pairs++;
2283#endif
2284
2285 /* This might be one to much, because of a BOM */
2286 unicode = _PyUnicode_New((size+3)/4+pairs);
2287 if (!unicode)
2288 return NULL;
2289 if (size == 0)
2290 return (PyObject *)unicode;
2291
2292 /* Unpack UTF-32 encoded data */
2293 p = unicode->str;
2294
Walter Dörwald6e390802007-08-17 16:41:28 +00002295 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002296 Py_UCS4 ch;
2297 /* remaining bytes at the end? (size should be divisible by 4) */
2298 if (e-q<4) {
2299 if (consumed)
2300 break;
2301 errmsg = "truncated data";
2302 startinpos = ((const char *)q)-starts;
2303 endinpos = ((const char *)e)-starts;
2304 goto utf32Error;
2305 /* The remaining input chars are ignored if the callback
2306 chooses to skip the input */
2307 }
2308 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2309 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002310
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002311 if (ch >= 0x110000)
2312 {
2313 errmsg = "codepoint not in range(0x110000)";
2314 startinpos = ((const char *)q)-starts;
2315 endinpos = startinpos+4;
2316 goto utf32Error;
2317 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002318#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002319 if (ch >= 0x10000)
2320 {
2321 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2322 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2323 }
2324 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002325#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002326 *p++ = ch;
2327 q += 4;
2328 continue;
2329 utf32Error:
2330 outpos = p-PyUnicode_AS_UNICODE(unicode);
2331 if (unicode_decode_call_errorhandler(
2332 errors, &errorHandler,
2333 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002334 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002335 &unicode, &outpos, &p))
2336 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002337 }
2338
2339 if (byteorder)
2340 *byteorder = bo;
2341
2342 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002343 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002344
2345 /* Adjust length */
2346 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2347 goto onError;
2348
2349 Py_XDECREF(errorHandler);
2350 Py_XDECREF(exc);
2351 return (PyObject *)unicode;
2352
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002353 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002354 Py_DECREF(unicode);
2355 Py_XDECREF(errorHandler);
2356 Py_XDECREF(exc);
2357 return NULL;
2358}
2359
2360PyObject *
2361PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002362 Py_ssize_t size,
2363 const char *errors,
2364 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002365{
2366 PyObject *v;
2367 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002368 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002369#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002370 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002371#else
2372 const int pairs = 0;
2373#endif
2374 /* Offsets from p for storing byte pairs in the right order. */
2375#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2376 int iorder[] = {0, 1, 2, 3};
2377#else
2378 int iorder[] = {3, 2, 1, 0};
2379#endif
2380
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002381#define STORECHAR(CH) \
2382 do { \
2383 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2384 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2385 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2386 p[iorder[0]] = (CH) & 0xff; \
2387 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002388 } while(0)
2389
2390 /* In narrow builds we can output surrogate pairs as one codepoint,
2391 so we need less space. */
2392#ifndef Py_UNICODE_WIDE
2393 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002394 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2395 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2396 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002397#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002398 nsize = (size - pairs + (byteorder == 0));
2399 bytesize = nsize * 4;
2400 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002401 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002402 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002403 if (v == NULL)
2404 return NULL;
2405
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002406 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002407 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002408 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002409 if (size == 0)
2410 return v;
2411
2412 if (byteorder == -1) {
2413 /* force LE */
2414 iorder[0] = 0;
2415 iorder[1] = 1;
2416 iorder[2] = 2;
2417 iorder[3] = 3;
2418 }
2419 else if (byteorder == 1) {
2420 /* force BE */
2421 iorder[0] = 3;
2422 iorder[1] = 2;
2423 iorder[2] = 1;
2424 iorder[3] = 0;
2425 }
2426
2427 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002428 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002429#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002430 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2431 Py_UCS4 ch2 = *s;
2432 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2433 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2434 s++;
2435 size--;
2436 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002437 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002438#endif
2439 STORECHAR(ch);
2440 }
2441 return v;
2442#undef STORECHAR
2443}
2444
2445PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2446{
2447 if (!PyUnicode_Check(unicode)) {
2448 PyErr_BadArgument();
2449 return NULL;
2450 }
2451 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002452 PyUnicode_GET_SIZE(unicode),
2453 NULL,
2454 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002455}
2456
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457/* --- UTF-16 Codec ------------------------------------------------------- */
2458
Tim Peters772747b2001-08-09 22:21:55 +00002459PyObject *
2460PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002461 Py_ssize_t size,
2462 const char *errors,
2463 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464{
Walter Dörwald69652032004-09-07 20:24:22 +00002465 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2466}
2467
2468PyObject *
2469PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002470 Py_ssize_t size,
2471 const char *errors,
2472 int *byteorder,
2473 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002474{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002476 Py_ssize_t startinpos;
2477 Py_ssize_t endinpos;
2478 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 PyUnicodeObject *unicode;
2480 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002481 const unsigned char *q, *e;
2482 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002483 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002484 /* Offsets from q for retrieving byte pairs in the right order. */
2485#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2486 int ihi = 1, ilo = 0;
2487#else
2488 int ihi = 0, ilo = 1;
2489#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 PyObject *errorHandler = NULL;
2491 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492
2493 /* Note: size will always be longer than the resulting Unicode
2494 character count */
2495 unicode = _PyUnicode_New(size);
2496 if (!unicode)
2497 return NULL;
2498 if (size == 0)
2499 return (PyObject *)unicode;
2500
2501 /* Unpack UTF-16 encoded data */
2502 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002503 q = (unsigned char *)s;
2504 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505
2506 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002507 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002509 /* Check for BOM marks (U+FEFF) in the input and adjust current
2510 byte order setting accordingly. In native mode, the leading BOM
2511 mark is skipped, in all other modes, it is copied to the output
2512 stream as-is (giving a ZWNBSP character). */
2513 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002514 if (size >= 2) {
2515 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002516#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002517 if (bom == 0xFEFF) {
2518 q += 2;
2519 bo = -1;
2520 }
2521 else if (bom == 0xFFFE) {
2522 q += 2;
2523 bo = 1;
2524 }
Tim Petersced69f82003-09-16 20:30:58 +00002525#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002526 if (bom == 0xFEFF) {
2527 q += 2;
2528 bo = 1;
2529 }
2530 else if (bom == 0xFFFE) {
2531 q += 2;
2532 bo = -1;
2533 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002534#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002535 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537
Tim Peters772747b2001-08-09 22:21:55 +00002538 if (bo == -1) {
2539 /* force LE */
2540 ihi = 1;
2541 ilo = 0;
2542 }
2543 else if (bo == 1) {
2544 /* force BE */
2545 ihi = 0;
2546 ilo = 1;
2547 }
2548
2549 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002550 Py_UNICODE ch;
2551 /* remaining bytes at the end? (size should be even) */
2552 if (e-q<2) {
2553 if (consumed)
2554 break;
2555 errmsg = "truncated data";
2556 startinpos = ((const char *)q)-starts;
2557 endinpos = ((const char *)e)-starts;
2558 goto utf16Error;
2559 /* The remaining input chars are ignored if the callback
2560 chooses to skip the input */
2561 }
2562 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563
Benjamin Peterson857ce152009-01-31 16:29:18 +00002564 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002565
2566 if (ch < 0xD800 || ch > 0xDFFF) {
2567 *p++ = ch;
2568 continue;
2569 }
2570
2571 /* UTF-16 code pair: */
2572 if (q >= e) {
2573 errmsg = "unexpected end of data";
2574 startinpos = (((const char *)q)-2)-starts;
2575 endinpos = ((const char *)e)-starts;
2576 goto utf16Error;
2577 }
2578 if (0xD800 <= ch && ch <= 0xDBFF) {
2579 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2580 q += 2;
2581 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002582#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002583 *p++ = ch;
2584 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002585#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002587#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002588 continue;
2589 }
2590 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002591 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002592 startinpos = (((const char *)q)-4)-starts;
2593 endinpos = startinpos+2;
2594 goto utf16Error;
2595 }
2596
Benjamin Peterson857ce152009-01-31 16:29:18 +00002597 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002598 errmsg = "illegal encoding";
2599 startinpos = (((const char *)q)-2)-starts;
2600 endinpos = startinpos+2;
2601 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002602
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002603 utf16Error:
2604 outpos = p-PyUnicode_AS_UNICODE(unicode);
2605 if (unicode_decode_call_errorhandler(
2606 errors, &errorHandler,
2607 "utf16", errmsg,
2608 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2609 &unicode, &outpos, &p))
2610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 }
2612
2613 if (byteorder)
2614 *byteorder = bo;
2615
Walter Dörwald69652032004-09-07 20:24:22 +00002616 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002617 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002618
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002620 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 goto onError;
2622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 Py_XDECREF(errorHandler);
2624 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 return (PyObject *)unicode;
2626
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002627 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 Py_XDECREF(errorHandler);
2630 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 return NULL;
2632}
2633
Tim Peters772747b2001-08-09 22:21:55 +00002634PyObject *
2635PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002636 Py_ssize_t size,
2637 const char *errors,
2638 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639{
2640 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002641 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002642 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002643#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002644 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002645#else
2646 const int pairs = 0;
2647#endif
Tim Peters772747b2001-08-09 22:21:55 +00002648 /* Offsets from p for storing byte pairs in the right order. */
2649#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2650 int ihi = 1, ilo = 0;
2651#else
2652 int ihi = 0, ilo = 1;
2653#endif
2654
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002655#define STORECHAR(CH) \
2656 do { \
2657 p[ihi] = ((CH) >> 8) & 0xff; \
2658 p[ilo] = (CH) & 0xff; \
2659 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002660 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002662#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002663 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002664 if (s[i] >= 0x10000)
2665 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002666#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002667 /* 2 * (size + pairs + (byteorder == 0)) */
2668 if (size > PY_SSIZE_T_MAX ||
2669 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002670 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002671 nsize = size + pairs + (byteorder == 0);
2672 bytesize = nsize * 2;
2673 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002674 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002675 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 if (v == NULL)
2677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002679 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002681 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002682 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002683 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002684
2685 if (byteorder == -1) {
2686 /* force LE */
2687 ihi = 1;
2688 ilo = 0;
2689 }
2690 else if (byteorder == 1) {
2691 /* force BE */
2692 ihi = 0;
2693 ilo = 1;
2694 }
2695
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002696 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002697 Py_UNICODE ch = *s++;
2698 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002699#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002700 if (ch >= 0x10000) {
2701 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2702 ch = 0xD800 | ((ch-0x10000) >> 10);
2703 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002704#endif
Tim Peters772747b2001-08-09 22:21:55 +00002705 STORECHAR(ch);
2706 if (ch2)
2707 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002710#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711}
2712
2713PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2714{
2715 if (!PyUnicode_Check(unicode)) {
2716 PyErr_BadArgument();
2717 return NULL;
2718 }
2719 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002720 PyUnicode_GET_SIZE(unicode),
2721 NULL,
2722 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723}
2724
2725/* --- Unicode Escape Codec ----------------------------------------------- */
2726
Fredrik Lundh06d12682001-01-24 07:59:11 +00002727static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002728
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002730 Py_ssize_t size,
2731 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002734 Py_ssize_t startinpos;
2735 Py_ssize_t endinpos;
2736 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741 char* message;
2742 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 PyObject *errorHandler = NULL;
2744 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002745
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 /* Escaped strings will always be longer than the resulting
2747 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 length after conversion to the true value.
2749 (but if the error callback returns a long replacement string
2750 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 v = _PyUnicode_New(size);
2752 if (v == NULL)
2753 goto onError;
2754 if (size == 0)
2755 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002759
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 while (s < end) {
2761 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002762 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764
2765 /* Non-escape characters are interpreted as Unicode ordinals */
2766 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 continue;
2769 }
2770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002771 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 /* \ - Escapes */
2773 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002774 c = *s++;
2775 if (s > end)
2776 c = '\0'; /* Invalid after \ */
2777 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002779 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 case '\n': break;
2781 case '\\': *p++ = '\\'; break;
2782 case '\'': *p++ = '\''; break;
2783 case '\"': *p++ = '\"'; break;
2784 case 'b': *p++ = '\b'; break;
2785 case 'f': *p++ = '\014'; break; /* FF */
2786 case 't': *p++ = '\t'; break;
2787 case 'n': *p++ = '\n'; break;
2788 case 'r': *p++ = '\r'; break;
2789 case 'v': *p++ = '\013'; break; /* VT */
2790 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2791
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002792 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 case '0': case '1': case '2': case '3':
2794 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002795 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002796 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002797 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002798 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002799 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002801 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 break;
2803
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002804 /* hex escapes */
2805 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002807 digits = 2;
2808 message = "truncated \\xXX escape";
2809 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002811 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002813 digits = 4;
2814 message = "truncated \\uXXXX escape";
2815 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002817 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002818 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002819 digits = 8;
2820 message = "truncated \\UXXXXXXXX escape";
2821 hexescape:
2822 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 outpos = p-PyUnicode_AS_UNICODE(v);
2824 if (s+digits>end) {
2825 endinpos = size;
2826 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002827 errors, &errorHandler,
2828 "unicodeescape", "end of string in escape sequence",
2829 starts, size, &startinpos, &endinpos, &exc, &s,
2830 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 goto onError;
2832 goto nextByte;
2833 }
2834 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002836 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 endinpos = (s+i+1)-starts;
2838 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002839 errors, &errorHandler,
2840 "unicodeescape", message,
2841 starts, size, &startinpos, &endinpos, &exc, &s,
2842 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002843 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002845 }
2846 chr = (chr<<4) & ~0xF;
2847 if (c >= '0' && c <= '9')
2848 chr += c - '0';
2849 else if (c >= 'a' && c <= 'f')
2850 chr += 10 + c - 'a';
2851 else
2852 chr += 10 + c - 'A';
2853 }
2854 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002855 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 /* _decoding_error will have already written into the
2857 target buffer. */
2858 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002859 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002860 /* when we get here, chr is a 32-bit unicode character */
2861 if (chr <= 0xffff)
2862 /* UCS-2 character */
2863 *p++ = (Py_UNICODE) chr;
2864 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002865 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002866 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002867#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002868 *p++ = chr;
2869#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002870 chr -= 0x10000L;
2871 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002872 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002873#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002874 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002875 endinpos = s-starts;
2876 outpos = p-PyUnicode_AS_UNICODE(v);
2877 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002878 errors, &errorHandler,
2879 "unicodeescape", "illegal Unicode character",
2880 starts, size, &startinpos, &endinpos, &exc, &s,
2881 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002882 goto onError;
2883 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002884 break;
2885
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002886 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 case 'N':
2888 message = "malformed \\N character escape";
2889 if (ucnhash_CAPI == NULL) {
2890 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002891 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002892 if (ucnhash_CAPI == NULL)
2893 goto ucnhashError;
2894 }
2895 if (*s == '{') {
2896 const char *start = s+1;
2897 /* look for the closing brace */
2898 while (*s != '}' && s < end)
2899 s++;
2900 if (s > start && s < end && *s == '}') {
2901 /* found a name. look it up in the unicode database */
2902 message = "unknown Unicode character name";
2903 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002904 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002905 goto store;
2906 }
2907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002908 endinpos = s-starts;
2909 outpos = p-PyUnicode_AS_UNICODE(v);
2910 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002911 errors, &errorHandler,
2912 "unicodeescape", message,
2913 starts, size, &startinpos, &endinpos, &exc, &s,
2914 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002915 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002916 break;
2917
2918 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002919 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 message = "\\ at end of string";
2921 s--;
2922 endinpos = s-starts;
2923 outpos = p-PyUnicode_AS_UNICODE(v);
2924 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002925 errors, &errorHandler,
2926 "unicodeescape", message,
2927 starts, size, &startinpos, &endinpos, &exc, &s,
2928 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002929 goto onError;
2930 }
2931 else {
2932 *p++ = '\\';
2933 *p++ = (unsigned char)s[-1];
2934 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002937 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002938 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002940 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002942 Py_XDECREF(errorHandler);
2943 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002945
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002946 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002947 PyErr_SetString(
2948 PyExc_UnicodeError,
2949 "\\N escapes not supported (can't load unicodedata module)"
2950 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 Py_XDECREF(errorHandler);
2953 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002954 return NULL;
2955
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002956 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 Py_XDECREF(errorHandler);
2959 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 return NULL;
2961}
2962
/* unicodeescape_string(): return a Unicode-Escape string version of the
   Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2969
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002970Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002971 Py_ssize_t size,
2972 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002973{
2974 /* like wcschr, but doesn't stop at NULL characters */
2975
2976 while (size-- > 0) {
2977 if (*s == ch)
2978 return s;
2979 s++;
2980 }
2981
2982 return NULL;
2983}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002984
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985static
2986PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002987 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 int quotes)
2989{
2990 PyObject *repr;
2991 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002993 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002994#ifdef Py_UNICODE_WIDE
2995 const Py_ssize_t expandsize = 10;
2996#else
2997 const Py_ssize_t expandsize = 6;
2998#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999
Neal Norwitz17753ec2006-08-21 22:21:19 +00003000 /* XXX(nnorwitz): rather than over-allocating, it would be
3001 better to choose a different scheme. Perhaps scan the
3002 first N-chars of the string and allocate based on that size.
3003 */
3004 /* Initial allocation is based on the longest-possible unichr
3005 escape.
3006
3007 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3008 unichr, so in this case it's the longest unichr escape. In
3009 narrow (UTF-16) builds this is five chars per source unichr
3010 since there are two unichrs in the surrogate pair, so in narrow
3011 (UTF-16) builds it's not the longest unichr escape.
3012
3013 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3014 so in the narrow (UTF-16) build case it's the longest unichr
3015 escape.
3016 */
3017
Neal Norwitze7d8be82008-07-31 17:17:14 +00003018 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003019 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003020
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003021 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003022 2
3023 + expandsize*size
3024 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 if (repr == NULL)
3026 return NULL;
3027
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003028 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
3030 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003032 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 !findchar(s, size, '"')) ? '"' : '\'';
3034 }
3035 while (size-- > 0) {
3036 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003037
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003038 /* Escape quotes and backslashes */
3039 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003040 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 *p++ = '\\';
3042 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003043 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003044 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003045
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003046#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003047 /* Map 21-bit characters to '\U00xxxxxx' */
3048 else if (ch >= 0x10000) {
3049 *p++ = '\\';
3050 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003051 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3057 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003058 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003059 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003060 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003061#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003062 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3063 else if (ch >= 0xD800 && ch < 0xDC00) {
3064 Py_UNICODE ch2;
3065 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003066
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003067 ch2 = *s++;
3068 size--;
3069 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3070 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3071 *p++ = '\\';
3072 *p++ = 'U';
3073 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3079 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3080 *p++ = hexdigit[ucs & 0x0000000F];
3081 continue;
3082 }
3083 /* Fall through: isolated surrogates are copied as-is */
3084 s--;
3085 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003086 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003088
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003090 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 *p++ = '\\';
3092 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003093 *p++ = hexdigit[(ch >> 12) & 0x000F];
3094 *p++ = hexdigit[(ch >> 8) & 0x000F];
3095 *p++ = hexdigit[(ch >> 4) & 0x000F];
3096 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003099 /* Map special whitespace to '\t', \n', '\r' */
3100 else if (ch == '\t') {
3101 *p++ = '\\';
3102 *p++ = 't';
3103 }
3104 else if (ch == '\n') {
3105 *p++ = '\\';
3106 *p++ = 'n';
3107 }
3108 else if (ch == '\r') {
3109 *p++ = '\\';
3110 *p++ = 'r';
3111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003113 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003114 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003116 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003117 *p++ = hexdigit[(ch >> 4) & 0x000F];
3118 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 /* Copy everything else as-is */
3122 else
3123 *p++ = (char) ch;
3124 }
3125 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003126 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127
3128 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003129 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 return repr;
3132}
3133
3134PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003135 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136{
3137 return unicodeescape_string(s, size, 0);
3138}
3139
3140PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3141{
3142 if (!PyUnicode_Check(unicode)) {
3143 PyErr_BadArgument();
3144 return NULL;
3145 }
3146 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003147 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148}
3149
3150/* --- Raw Unicode Escape Codec ------------------------------------------- */
3151
3152PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003153 Py_ssize_t size,
3154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003157 Py_ssize_t startinpos;
3158 Py_ssize_t endinpos;
3159 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 const char *end;
3163 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 PyObject *errorHandler = NULL;
3165 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 /* Escaped strings will always be longer than the resulting
3168 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 length after conversion to the true value. (But decoding error
3170 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 v = _PyUnicode_New(size);
3172 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003173 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003175 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 end = s + size;
3178 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003179 unsigned char c;
3180 Py_UCS4 x;
3181 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003182 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 /* Non-escape characters are interpreted as Unicode ordinals */
3185 if (*s != '\\') {
3186 *p++ = (unsigned char)*s++;
3187 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003188 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003189 startinpos = s-starts;
3190
3191 /* \u-escapes are only interpreted iff the number of leading
3192 backslashes if odd */
3193 bs = s;
3194 for (;s < end;) {
3195 if (*s != '\\')
3196 break;
3197 *p++ = (unsigned char)*s++;
3198 }
3199 if (((s - bs) & 1) == 0 ||
3200 s >= end ||
3201 (*s != 'u' && *s != 'U')) {
3202 continue;
3203 }
3204 p--;
3205 count = *s=='u' ? 4 : 8;
3206 s++;
3207
3208 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3209 outpos = p-PyUnicode_AS_UNICODE(v);
3210 for (x = 0, i = 0; i < count; ++i, ++s) {
3211 c = (unsigned char)*s;
3212 if (!isxdigit(c)) {
3213 endinpos = s-starts;
3214 if (unicode_decode_call_errorhandler(
3215 errors, &errorHandler,
3216 "rawunicodeescape", "truncated \\uXXXX",
3217 starts, size, &startinpos, &endinpos, &exc, &s,
3218 &v, &outpos, &p))
3219 goto onError;
3220 goto nextByte;
3221 }
3222 x = (x<<4) & ~0xF;
3223 if (c >= '0' && c <= '9')
3224 x += c - '0';
3225 else if (c >= 'a' && c <= 'f')
3226 x += 10 + c - 'a';
3227 else
3228 x += 10 + c - 'A';
3229 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003230 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003231 /* UCS-2 character */
3232 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003233 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003234 /* UCS-4 character. Either store directly, or as
3235 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003236#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003237 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003238#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003239 x -= 0x10000L;
3240 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3241 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003242#endif
3243 } else {
3244 endinpos = s-starts;
3245 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003246 if (unicode_decode_call_errorhandler(
3247 errors, &errorHandler,
3248 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003249 starts, size, &startinpos, &endinpos, &exc, &s,
3250 &v, &outpos, &p))
3251 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003252 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003253 nextByte:
3254 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003256 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003257 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003258 Py_XDECREF(errorHandler);
3259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003261
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003262 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 Py_XDECREF(errorHandler);
3265 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 return NULL;
3267}
3268
3269PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003270 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271{
3272 PyObject *repr;
3273 char *p;
3274 char *q;
3275
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003276 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003277#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003278 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003279#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003280 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003281#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003282
Neal Norwitze7d8be82008-07-31 17:17:14 +00003283 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003284 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003285
Neal Norwitze7d8be82008-07-31 17:17:14 +00003286 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 if (repr == NULL)
3288 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003289 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003290 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003292 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 while (size-- > 0) {
3294 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003295#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003296 /* Map 32-bit characters to '\Uxxxxxxxx' */
3297 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003298 *p++ = '\\';
3299 *p++ = 'U';
3300 *p++ = hexdigit[(ch >> 28) & 0xf];
3301 *p++ = hexdigit[(ch >> 24) & 0xf];
3302 *p++ = hexdigit[(ch >> 20) & 0xf];
3303 *p++ = hexdigit[(ch >> 16) & 0xf];
3304 *p++ = hexdigit[(ch >> 12) & 0xf];
3305 *p++ = hexdigit[(ch >> 8) & 0xf];
3306 *p++ = hexdigit[(ch >> 4) & 0xf];
3307 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003308 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003309 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003310#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003311 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3312 if (ch >= 0xD800 && ch < 0xDC00) {
3313 Py_UNICODE ch2;
3314 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 ch2 = *s++;
3317 size--;
3318 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3319 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3320 *p++ = '\\';
3321 *p++ = 'U';
3322 *p++ = hexdigit[(ucs >> 28) & 0xf];
3323 *p++ = hexdigit[(ucs >> 24) & 0xf];
3324 *p++ = hexdigit[(ucs >> 20) & 0xf];
3325 *p++ = hexdigit[(ucs >> 16) & 0xf];
3326 *p++ = hexdigit[(ucs >> 12) & 0xf];
3327 *p++ = hexdigit[(ucs >> 8) & 0xf];
3328 *p++ = hexdigit[(ucs >> 4) & 0xf];
3329 *p++ = hexdigit[ucs & 0xf];
3330 continue;
3331 }
3332 /* Fall through: isolated surrogates are copied as-is */
3333 s--;
3334 size++;
3335 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003336#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003337 /* Map 16-bit characters to '\uxxxx' */
3338 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 *p++ = '\\';
3340 *p++ = 'u';
3341 *p++ = hexdigit[(ch >> 12) & 0xf];
3342 *p++ = hexdigit[(ch >> 8) & 0xf];
3343 *p++ = hexdigit[(ch >> 4) & 0xf];
3344 *p++ = hexdigit[ch & 15];
3345 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003346 /* Copy everything else as-is */
3347 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 *p++ = (char) ch;
3349 }
3350 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003351 if (_PyString_Resize(&repr, p - q))
3352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 return repr;
3354}
3355
3356PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3357{
3358 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003359 PyErr_BadArgument();
3360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 }
3362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003363 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364}
3365
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003366/* --- Unicode Internal Codec ------------------------------------------- */
3367
3368PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003369 Py_ssize_t size,
3370 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003371{
3372 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003373 Py_ssize_t startinpos;
3374 Py_ssize_t endinpos;
3375 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003376 PyUnicodeObject *v;
3377 Py_UNICODE *p;
3378 const char *end;
3379 const char *reason;
3380 PyObject *errorHandler = NULL;
3381 PyObject *exc = NULL;
3382
Neal Norwitzd43069c2006-01-08 01:12:10 +00003383#ifdef Py_UNICODE_WIDE
3384 Py_UNICODE unimax = PyUnicode_GetMax();
3385#endif
3386
Armin Rigo7ccbca92006-10-04 12:17:45 +00003387 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003388 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3389 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003390 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003391 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003392 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003393 p = PyUnicode_AS_UNICODE(v);
3394 end = s + size;
3395
3396 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003397 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003398 /* We have to sanity check the raw data, otherwise doom looms for
3399 some malformed UCS-4 data. */
3400 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003402 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003403#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003404 end-s < Py_UNICODE_SIZE
3405 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003407 startinpos = s - starts;
3408 if (end-s < Py_UNICODE_SIZE) {
3409 endinpos = end-starts;
3410 reason = "truncated input";
3411 }
3412 else {
3413 endinpos = s - starts + Py_UNICODE_SIZE;
3414 reason = "illegal code point (> 0x10FFFF)";
3415 }
3416 outpos = p - PyUnicode_AS_UNICODE(v);
3417 if (unicode_decode_call_errorhandler(
3418 errors, &errorHandler,
3419 "unicode_internal", reason,
3420 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003421 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003422 goto onError;
3423 }
3424 }
3425 else {
3426 p++;
3427 s += Py_UNICODE_SIZE;
3428 }
3429 }
3430
Martin v. Löwis412fb672006-04-13 06:34:32 +00003431 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003432 goto onError;
3433 Py_XDECREF(errorHandler);
3434 Py_XDECREF(exc);
3435 return (PyObject *)v;
3436
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003437 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003438 Py_XDECREF(v);
3439 Py_XDECREF(errorHandler);
3440 Py_XDECREF(exc);
3441 return NULL;
3442}
3443
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444/* --- Latin-1 Codec ------------------------------------------------------ */
3445
3446PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003447 Py_ssize_t size,
3448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449{
3450 PyUnicodeObject *v;
3451 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003452
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003454 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003455 Py_UNICODE r = *(unsigned char*)s;
3456 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003457 }
3458
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 v = _PyUnicode_New(size);
3460 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003463 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 p = PyUnicode_AS_UNICODE(v);
3465 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003468
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003469 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 Py_XDECREF(v);
3471 return NULL;
3472}
3473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474/* create or adjust a UnicodeEncodeError */
3475static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003476 const char *encoding,
3477 const Py_UNICODE *unicode, Py_ssize_t size,
3478 Py_ssize_t startpos, Py_ssize_t endpos,
3479 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003482 *exceptionObject = PyUnicodeEncodeError_Create(
3483 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 }
3485 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003486 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3487 goto onError;
3488 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3489 goto onError;
3490 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3491 goto onError;
3492 return;
3493 onError:
3494 Py_DECREF(*exceptionObject);
3495 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 }
3497}
3498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499/* raises a UnicodeEncodeError */
3500static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003501 const char *encoding,
3502 const Py_UNICODE *unicode, Py_ssize_t size,
3503 Py_ssize_t startpos, Py_ssize_t endpos,
3504 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505{
3506 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003509 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510}
3511
3512/* error handling callback helper:
3513 build arguments, call the callback and check the arguments,
3514 put the result into newpos and return the replacement string, which
3515 has to be freed by the caller */
3516static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003517 PyObject **errorHandler,
3518 const char *encoding, const char *reason,
3519 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3520 Py_ssize_t startpos, Py_ssize_t endpos,
3521 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003523 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524
3525 PyObject *restuple;
3526 PyObject *resunicode;
3527
3528 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003529 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 }
3533
3534 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538
3539 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003544 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 Py_DECREF(restuple);
3546 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 }
3548 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003549 &resunicode, newpos)) {
3550 Py_DECREF(restuple);
3551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 }
3553 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003555 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003556 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3557 Py_DECREF(restuple);
3558 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003559 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 Py_INCREF(resunicode);
3561 Py_DECREF(restuple);
3562 return resunicode;
3563}
3564
3565static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003566 Py_ssize_t size,
3567 const char *errors,
3568 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569{
3570 /* output object */
3571 PyObject *res;
3572 /* pointers to the beginning and end+1 of input */
3573 const Py_UNICODE *startp = p;
3574 const Py_UNICODE *endp = p + size;
3575 /* pointer to the beginning of the unencodable characters */
3576 /* const Py_UNICODE *badp = NULL; */
3577 /* pointer into the output */
3578 char *str;
3579 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003580 Py_ssize_t respos = 0;
3581 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003582 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3583 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 PyObject *errorHandler = NULL;
3585 PyObject *exc = NULL;
3586 /* the following variable is used for caching string comparisons
3587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3588 int known_errorHandler = -1;
3589
3590 /* allocate enough for a simple encoding without
3591 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003592 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 if (res == NULL)
3594 goto onError;
3595 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003596 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003597 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 ressize = size;
3599
3600 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003603 /* can we encode this? */
3604 if (c<limit) {
3605 /* no overflow check, because we know that the space is enough */
3606 *str++ = (char)c;
3607 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003608 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003609 else {
3610 Py_ssize_t unicodepos = p-startp;
3611 Py_ssize_t requiredsize;
3612 PyObject *repunicode;
3613 Py_ssize_t repsize;
3614 Py_ssize_t newpos;
3615 Py_ssize_t respos;
3616 Py_UNICODE *uni2;
3617 /* startpos for collecting unencodable chars */
3618 const Py_UNICODE *collstart = p;
3619 const Py_UNICODE *collend = p;
3620 /* find all unecodable characters */
3621 while ((collend < endp) && ((*collend)>=limit))
3622 ++collend;
3623 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3624 if (known_errorHandler==-1) {
3625 if ((errors==NULL) || (!strcmp(errors, "strict")))
3626 known_errorHandler = 1;
3627 else if (!strcmp(errors, "replace"))
3628 known_errorHandler = 2;
3629 else if (!strcmp(errors, "ignore"))
3630 known_errorHandler = 3;
3631 else if (!strcmp(errors, "xmlcharrefreplace"))
3632 known_errorHandler = 4;
3633 else
3634 known_errorHandler = 0;
3635 }
3636 switch (known_errorHandler) {
3637 case 1: /* strict */
3638 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3639 goto onError;
3640 case 2: /* replace */
3641 while (collstart++<collend)
3642 *str++ = '?'; /* fall through */
3643 case 3: /* ignore */
3644 p = collend;
3645 break;
3646 case 4: /* xmlcharrefreplace */
3647 respos = str-PyString_AS_STRING(res);
3648 /* determine replacement size (temporarily (mis)uses p) */
3649 for (p = collstart, repsize = 0; p < collend; ++p) {
3650 if (*p<10)
3651 repsize += 2+1+1;
3652 else if (*p<100)
3653 repsize += 2+2+1;
3654 else if (*p<1000)
3655 repsize += 2+3+1;
3656 else if (*p<10000)
3657 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003658#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003659 else
3660 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003661#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003662 else if (*p<100000)
3663 repsize += 2+5+1;
3664 else if (*p<1000000)
3665 repsize += 2+6+1;
3666 else
3667 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003668#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003669 }
3670 requiredsize = respos+repsize+(endp-collend);
3671 if (requiredsize > ressize) {
3672 if (requiredsize<2*ressize)
3673 requiredsize = 2*ressize;
3674 if (_PyString_Resize(&res, requiredsize))
3675 goto onError;
3676 str = PyString_AS_STRING(res) + respos;
3677 ressize = requiredsize;
3678 }
3679 /* generate replacement (temporarily (mis)uses p) */
3680 for (p = collstart; p < collend; ++p) {
3681 str += sprintf(str, "&#%d;", (int)*p);
3682 }
3683 p = collend;
3684 break;
3685 default:
3686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3687 encoding, reason, startp, size, &exc,
3688 collstart-startp, collend-startp, &newpos);
3689 if (repunicode == NULL)
3690 goto onError;
3691 /* need more space? (at least enough for what we have+the
3692 replacement+the rest of the string, so we won't have to
3693 check space for encodable characters) */
3694 respos = str-PyString_AS_STRING(res);
3695 repsize = PyUnicode_GET_SIZE(repunicode);
3696 requiredsize = respos+repsize+(endp-collend);
3697 if (requiredsize > ressize) {
3698 if (requiredsize<2*ressize)
3699 requiredsize = 2*ressize;
3700 if (_PyString_Resize(&res, requiredsize)) {
3701 Py_DECREF(repunicode);
3702 goto onError;
3703 }
3704 str = PyString_AS_STRING(res) + respos;
3705 ressize = requiredsize;
3706 }
3707 /* check if there is anything unencodable in the replacement
3708 and copy it to the output */
3709 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3710 c = *uni2;
3711 if (c >= limit) {
3712 raise_encode_exception(&exc, encoding, startp, size,
3713 unicodepos, unicodepos+1, reason);
3714 Py_DECREF(repunicode);
3715 goto onError;
3716 }
3717 *str = (char)c;
3718 }
3719 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003720 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003721 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003722 }
3723 }
    /* Resize if we allocated too much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003725 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 if (respos<ressize)
        /* If this fails res will be NULL */
3728 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 Py_XDECREF(errorHandler);
3730 Py_XDECREF(exc);
3731 return res;
3732
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003733 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 Py_XDECREF(res);
3735 Py_XDECREF(errorHandler);
3736 Py_XDECREF(exc);
3737 return NULL;
3738}
3739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003741 Py_ssize_t size,
3742 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745}
3746
3747PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3748{
3749 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003750 PyErr_BadArgument();
3751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 }
3753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003754 PyUnicode_GET_SIZE(unicode),
3755 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756}
3757
3758/* --- 7-bit ASCII Codec -------------------------------------------------- */
3759
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003761 Py_ssize_t size,
3762 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 PyUnicodeObject *v;
3766 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003767 Py_ssize_t startinpos;
3768 Py_ssize_t endinpos;
3769 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 const char *e;
3771 PyObject *errorHandler = NULL;
3772 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003773
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003775 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003776 Py_UNICODE r = *(unsigned char*)s;
3777 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003778 }
Tim Petersced69f82003-09-16 20:30:58 +00003779
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 v = _PyUnicode_New(size);
3781 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003782 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003784 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 e = s + size;
3787 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003788 register unsigned char c = (unsigned char)*s;
3789 if (c < 128) {
3790 *p++ = c;
3791 ++s;
3792 }
3793 else {
3794 startinpos = s-starts;
3795 endinpos = startinpos + 1;
3796 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3797 if (unicode_decode_call_errorhandler(
3798 errors, &errorHandler,
3799 "ascii", "ordinal not in range(128)",
3800 starts, size, &startinpos, &endinpos, &exc, &s,
3801 &v, &outpos, &p))
3802 goto onError;
3803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003805 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3807 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 Py_XDECREF(errorHandler);
3809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003811
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003812 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 Py_XDECREF(errorHandler);
3815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 return NULL;
3817}
3818
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003820 Py_ssize_t size,
3821 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824}
3825
3826PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3827{
3828 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003829 PyErr_BadArgument();
3830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 }
3832 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003833 PyUnicode_GET_SIZE(unicode),
3834 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835}
3836
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003837#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003838
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003839/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003840
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003841#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003842#define NEED_RETRY
3843#endif
3844
3845/* XXX This code is limited to "true" double-byte encodings, as
3846 a) it assumes an incomplete character consists of a single byte, and
3847 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003848 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003849
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  The byte must be reported as a lead byte by IsDBCSLeadByte()
   and, in addition, one of the following must hold for the character that
   CharPrev() finds before it: there is none (CharPrev() returned the same
   position), it does not start with a lead byte, or it is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
3860
3861/*
3862 * Decode MBCS string into unicode object. If 'final' is set, converts
3863 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3864 */
3865static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003866 const char *s, /* MBCS string */
3867 int size, /* sizeof MBCS string */
3868 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003869{
3870 Py_UNICODE *p;
3871 Py_ssize_t n = 0;
3872 int usize = 0;
3873
3874 assert(size >= 0);
3875
3876 /* Skip trailing lead-byte unless 'final' is set */
3877 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003878 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003879
3880 /* First get the size of the result */
3881 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003882 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3883 if (usize == 0) {
3884 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3885 return -1;
3886 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003887 }
3888
3889 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003890 /* Create unicode object */
3891 *v = _PyUnicode_New(usize);
3892 if (*v == NULL)
3893 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003894 }
3895 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003896 /* Extend unicode object */
3897 n = PyUnicode_GET_SIZE(*v);
3898 if (_PyUnicode_Resize(v, n + usize) < 0)
3899 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003900 }
3901
3902 /* Do the conversion */
3903 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003904 p = PyUnicode_AS_UNICODE(*v) + n;
3905 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3906 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3907 return -1;
3908 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003909 }
3910
3911 return size;
3912}
3913
3914PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003915 Py_ssize_t size,
3916 const char *errors,
3917 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003918{
3919 PyUnicodeObject *v = NULL;
3920 int done;
3921
3922 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003923 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924
3925#ifdef NEED_RETRY
3926 retry:
3927 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003928 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003929 else
3930#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003931 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003932
3933 if (done < 0) {
3934 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003935 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003936 }
3937
3938 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003939 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003940
3941#ifdef NEED_RETRY
3942 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003943 s += done;
3944 size -= done;
3945 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003946 }
3947#endif
3948
3949 return (PyObject *)v;
3950}
3951
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003952PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003953 Py_ssize_t size,
3954 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003955{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003956 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3957}
3958
3959/*
3960 * Convert unicode into string object (MBCS).
3961 * Returns 0 if succeed, -1 otherwise.
3962 */
3963static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003964 const Py_UNICODE *p, /* unicode */
3965 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003966{
3967 int mbcssize = 0;
3968 Py_ssize_t n = 0;
3969
3970 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003971
3972 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003973 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003974 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3975 if (mbcssize == 0) {
3976 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3977 return -1;
3978 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003979 }
3980
Martin v. Löwisd8251432006-06-14 05:21:04 +00003981 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003982 /* Create string object */
3983 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3984 if (*repr == NULL)
3985 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003986 }
3987 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003988 /* Extend string object */
3989 n = PyString_Size(*repr);
3990 if (_PyString_Resize(repr, n + mbcssize) < 0)
3991 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003992 }
3993
3994 /* Do the conversion */
3995 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003996 char *s = PyString_AS_STRING(*repr) + n;
3997 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3998 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3999 return -1;
4000 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004001 }
4002
4003 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004004}
4005
4006PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004007 Py_ssize_t size,
4008 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004009{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010 PyObject *repr = NULL;
4011 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004012
Martin v. Löwisd8251432006-06-14 05:21:04 +00004013#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004016 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004017 else
4018#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004019 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004020
Martin v. Löwisd8251432006-06-14 05:21:04 +00004021 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004022 Py_XDECREF(repr);
4023 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004024 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004025
4026#ifdef NEED_RETRY
4027 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004028 p += INT_MAX;
4029 size -= INT_MAX;
4030 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004031 }
4032#endif
4033
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004034 return repr;
4035}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004036
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004037PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4038{
4039 if (!PyUnicode_Check(unicode)) {
4040 PyErr_BadArgument();
4041 return NULL;
4042 }
4043 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004044 PyUnicode_GET_SIZE(unicode),
4045 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004046}
4047
Martin v. Löwisd8251432006-06-14 05:21:04 +00004048#undef NEED_RETRY
4049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004050#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052/* --- Character Mapping Codec -------------------------------------------- */
4053
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004055 Py_ssize_t size,
4056 PyObject *mapping,
4057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004060 Py_ssize_t startinpos;
4061 Py_ssize_t endinpos;
4062 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 PyUnicodeObject *v;
4065 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004066 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 PyObject *errorHandler = NULL;
4068 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004069 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004071
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 /* Default to Latin-1 */
4073 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004074 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075
4076 v = _PyUnicode_New(size);
4077 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004080 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004083 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004084 mapstring = PyUnicode_AS_UNICODE(mapping);
4085 maplen = PyUnicode_GET_SIZE(mapping);
4086 while (s < e) {
4087 unsigned char ch = *s;
4088 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004090 if (ch < maplen)
4091 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004093 if (x == 0xfffe) {
4094 /* undefined mapping */
4095 outpos = p-PyUnicode_AS_UNICODE(v);
4096 startinpos = s-starts;
4097 endinpos = startinpos+1;
4098 if (unicode_decode_call_errorhandler(
4099 errors, &errorHandler,
4100 "charmap", "character maps to <undefined>",
4101 starts, size, &startinpos, &endinpos, &exc, &s,
4102 &v, &outpos, &p)) {
4103 goto onError;
4104 }
4105 continue;
4106 }
4107 *p++ = x;
4108 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004109 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004110 }
4111 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004112 while (s < e) {
4113 unsigned char ch = *s;
4114 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004115
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004116 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4117 w = PyInt_FromLong((long)ch);
4118 if (w == NULL)
4119 goto onError;
4120 x = PyObject_GetItem(mapping, w);
4121 Py_DECREF(w);
4122 if (x == NULL) {
4123 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4124 /* No mapping found means: mapping is undefined. */
4125 PyErr_Clear();
4126 x = Py_None;
4127 Py_INCREF(x);
4128 } else
4129 goto onError;
4130 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004131
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004132 /* Apply mapping */
4133 if (PyInt_Check(x)) {
4134 long value = PyInt_AS_LONG(x);
4135 if (value < 0 || value > 65535) {
4136 PyErr_SetString(PyExc_TypeError,
4137 "character mapping must be in range(65536)");
4138 Py_DECREF(x);
4139 goto onError;
4140 }
4141 *p++ = (Py_UNICODE)value;
4142 }
4143 else if (x == Py_None) {
4144 /* undefined mapping */
4145 outpos = p-PyUnicode_AS_UNICODE(v);
4146 startinpos = s-starts;
4147 endinpos = startinpos+1;
4148 if (unicode_decode_call_errorhandler(
4149 errors, &errorHandler,
4150 "charmap", "character maps to <undefined>",
4151 starts, size, &startinpos, &endinpos, &exc, &s,
4152 &v, &outpos, &p)) {
4153 Py_DECREF(x);
4154 goto onError;
4155 }
4156 Py_DECREF(x);
4157 continue;
4158 }
4159 else if (PyUnicode_Check(x)) {
4160 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004161
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004162 if (targetsize == 1)
4163 /* 1-1 mapping */
4164 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004165
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004166 else if (targetsize > 1) {
4167 /* 1-n mapping */
4168 if (targetsize > extrachars) {
4169 /* resize first */
4170 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4171 Py_ssize_t needed = (targetsize - extrachars) + \
4172 (targetsize << 2);
4173 extrachars += needed;
4174 /* XXX overflow detection missing */
4175 if (_PyUnicode_Resize(&v,
4176 PyUnicode_GET_SIZE(v) + needed) < 0) {
4177 Py_DECREF(x);
4178 goto onError;
4179 }
4180 p = PyUnicode_AS_UNICODE(v) + oldpos;
4181 }
4182 Py_UNICODE_COPY(p,
4183 PyUnicode_AS_UNICODE(x),
4184 targetsize);
4185 p += targetsize;
4186 extrachars -= targetsize;
4187 }
4188 /* 1-0 mapping: skip the character */
4189 }
4190 else {
4191 /* wrong return value */
4192 PyErr_SetString(PyExc_TypeError,
4193 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004194 Py_DECREF(x);
4195 goto onError;
4196 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004197 Py_DECREF(x);
4198 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 }
4201 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004202 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4203 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_XDECREF(errorHandler);
4205 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004207
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 Py_XDECREF(errorHandler);
4210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 Py_XDECREF(v);
4212 return NULL;
4213}
4214
Martin v. Löwis3f767792006-06-04 19:36:28 +00004215/* Charmap encoding: the lookup table */
4216
4217struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004218 PyObject_HEAD
4219 unsigned char level1[32];
4220 int count2, count3;
4221 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004222};
4223
4224static PyObject*
4225encoding_map_size(PyObject *obj, PyObject* args)
4226{
4227 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004228 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004229 128*map->count3);
4230}
4231
4232static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004233 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004234 PyDoc_STR("Return the size (in bytes) of this object") },
4235 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004236};
4237
4238static void
4239encoding_map_dealloc(PyObject* o)
4240{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004241 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004242}
4243
4244static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004245 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004246 "EncodingMap", /*tp_name*/
4247 sizeof(struct encoding_map), /*tp_basicsize*/
4248 0, /*tp_itemsize*/
4249 /* methods */
4250 encoding_map_dealloc, /*tp_dealloc*/
4251 0, /*tp_print*/
4252 0, /*tp_getattr*/
4253 0, /*tp_setattr*/
4254 0, /*tp_compare*/
4255 0, /*tp_repr*/
4256 0, /*tp_as_number*/
4257 0, /*tp_as_sequence*/
4258 0, /*tp_as_mapping*/
4259 0, /*tp_hash*/
4260 0, /*tp_call*/
4261 0, /*tp_str*/
4262 0, /*tp_getattro*/
4263 0, /*tp_setattro*/
4264 0, /*tp_as_buffer*/
4265 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4266 0, /*tp_doc*/
4267 0, /*tp_traverse*/
4268 0, /*tp_clear*/
4269 0, /*tp_richcompare*/
4270 0, /*tp_weaklistoffset*/
4271 0, /*tp_iter*/
4272 0, /*tp_iternext*/
4273 encoding_map_methods, /*tp_methods*/
4274 0, /*tp_members*/
4275 0, /*tp_getset*/
4276 0, /*tp_base*/
4277 0, /*tp_dict*/
4278 0, /*tp_descr_get*/
4279 0, /*tp_descr_set*/
4280 0, /*tp_dictoffset*/
4281 0, /*tp_init*/
4282 0, /*tp_alloc*/
4283 0, /*tp_new*/
4284 0, /*tp_free*/
4285 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004286};
4287
4288PyObject*
4289PyUnicode_BuildEncodingMap(PyObject* string)
4290{
4291 Py_UNICODE *decode;
4292 PyObject *result;
4293 struct encoding_map *mresult;
4294 int i;
4295 int need_dict = 0;
4296 unsigned char level1[32];
4297 unsigned char level2[512];
4298 unsigned char *mlevel1, *mlevel2, *mlevel3;
4299 int count2 = 0, count3 = 0;
4300
4301 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4302 PyErr_BadArgument();
4303 return NULL;
4304 }
4305 decode = PyUnicode_AS_UNICODE(string);
4306 memset(level1, 0xFF, sizeof level1);
4307 memset(level2, 0xFF, sizeof level2);
4308
4309 /* If there isn't a one-to-one mapping of NULL to \0,
4310 or if there are non-BMP characters, we need to use
4311 a mapping dictionary. */
4312 if (decode[0] != 0)
4313 need_dict = 1;
4314 for (i = 1; i < 256; i++) {
4315 int l1, l2;
4316 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004317#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004318 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004319#endif
4320 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004321 need_dict = 1;
4322 break;
4323 }
4324 if (decode[i] == 0xFFFE)
4325 /* unmapped character */
4326 continue;
4327 l1 = decode[i] >> 11;
4328 l2 = decode[i] >> 7;
4329 if (level1[l1] == 0xFF)
4330 level1[l1] = count2++;
4331 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004332 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004333 }
4334
4335 if (count2 >= 0xFF || count3 >= 0xFF)
4336 need_dict = 1;
4337
4338 if (need_dict) {
4339 PyObject *result = PyDict_New();
4340 PyObject *key, *value;
4341 if (!result)
4342 return NULL;
4343 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004344 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004345 key = PyInt_FromLong(decode[i]);
4346 value = PyInt_FromLong(i);
4347 if (!key || !value)
4348 goto failed1;
4349 if (PyDict_SetItem(result, key, value) == -1)
4350 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004351 Py_DECREF(key);
4352 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004353 }
4354 return result;
4355 failed1:
4356 Py_XDECREF(key);
4357 Py_XDECREF(value);
4358 Py_DECREF(result);
4359 return NULL;
4360 }
4361
4362 /* Create a three-level trie */
4363 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4364 16*count2 + 128*count3 - 1);
4365 if (!result)
4366 return PyErr_NoMemory();
4367 PyObject_Init(result, &EncodingMapType);
4368 mresult = (struct encoding_map*)result;
4369 mresult->count2 = count2;
4370 mresult->count3 = count3;
4371 mlevel1 = mresult->level1;
4372 mlevel2 = mresult->level23;
4373 mlevel3 = mresult->level23 + 16*count2;
4374 memcpy(mlevel1, level1, 32);
4375 memset(mlevel2, 0xFF, 16*count2);
4376 memset(mlevel3, 0, 128*count3);
4377 count3 = 0;
4378 for (i = 1; i < 256; i++) {
4379 int o1, o2, o3, i2, i3;
4380 if (decode[i] == 0xFFFE)
4381 /* unmapped character */
4382 continue;
4383 o1 = decode[i]>>11;
4384 o2 = (decode[i]>>7) & 0xF;
4385 i2 = 16*mlevel1[o1] + o2;
4386 if (mlevel2[i2] == 0xFF)
4387 mlevel2[i2] = count3++;
4388 o3 = decode[i] & 0x7F;
4389 i3 = 128*mlevel2[i2] + o3;
4390 mlevel3[i3] = i;
4391 }
4392 return result;
4393}
4394
4395static int
4396encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4397{
4398 struct encoding_map *map = (struct encoding_map*)mapping;
4399 int l1 = c>>11;
4400 int l2 = (c>>7) & 0xF;
4401 int l3 = c & 0x7F;
4402 int i;
4403
4404#ifdef Py_UNICODE_WIDE
4405 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004406 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004407 }
4408#endif
4409 if (c == 0)
4410 return 0;
4411 /* level 1*/
4412 i = map->level1[l1];
4413 if (i == 0xFF) {
4414 return -1;
4415 }
4416 /* level 2*/
4417 i = map->level23[16*i+l2];
4418 if (i == 0xFF) {
4419 return -1;
4420 }
4421 /* level 3 */
4422 i = map->level23[16*map->count2 + 128*i + l3];
4423 if (i == 0) {
4424 return -1;
4425 }
4426 return i;
4427}
4428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429/* Lookup the character ch in the mapping. If the character
4430 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004431 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 PyObject *w = PyInt_FromLong((long)c);
4435 PyObject *x;
4436
4437 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004438 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 x = PyObject_GetItem(mapping, w);
4440 Py_DECREF(w);
4441 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004442 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4443 /* No mapping found means: mapping is undefined. */
4444 PyErr_Clear();
4445 x = Py_None;
4446 Py_INCREF(x);
4447 return x;
4448 } else
4449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004451 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004452 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004454 long value = PyInt_AS_LONG(x);
4455 if (value < 0 || value > 255) {
4456 PyErr_SetString(PyExc_TypeError,
4457 "character mapping must be in range(256)");
4458 Py_DECREF(x);
4459 return NULL;
4460 }
4461 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004463 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004464 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004466 /* wrong return value */
4467 PyErr_SetString(PyExc_TypeError,
4468 "character mapping must return integer, None or str");
4469 Py_DECREF(x);
4470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 }
4472}
4473
Martin v. Löwis3f767792006-06-04 19:36:28 +00004474static int
4475charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4476{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004477 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4478 /* exponentially overallocate to minimize reallocations */
4479 if (requiredsize < 2*outsize)
4480 requiredsize = 2*outsize;
4481 if (_PyString_Resize(outobj, requiredsize)) {
4482 return 0;
4483 }
4484 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004485}
4486
/* Result codes of the charmap encoding output step. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490/* lookup the character, put the result in the output string and adjust
4491 various state variables. Reallocate the output string if not enough
4492 space is available. Return a new reference to the object that
4493 was put in the output buffer, or Py_None, if the mapping was undefined
4494 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004495 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004497charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004498 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004500 PyObject *rep;
4501 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004502 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503
Christian Heimese93237d2007-12-19 02:37:44 +00004504 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004506 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004507 if (res == -1)
4508 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004509 if (outsize<requiredsize)
4510 if (!charmapencode_resize(outobj, outpos, requiredsize))
4511 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004512 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004513 outstart[(*outpos)++] = (char)res;
4514 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004515 }
4516
4517 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004519 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004520 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004521 Py_DECREF(rep);
4522 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004523 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004524 if (PyInt_Check(rep)) {
4525 Py_ssize_t requiredsize = *outpos+1;
4526 if (outsize<requiredsize)
4527 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4528 Py_DECREF(rep);
4529 return enc_EXCEPTION;
4530 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004531 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004533 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004534 else {
4535 const char *repchars = PyString_AS_STRING(rep);
4536 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4537 Py_ssize_t requiredsize = *outpos+repsize;
4538 if (outsize<requiredsize)
4539 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4540 Py_DECREF(rep);
4541 return enc_EXCEPTION;
4542 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004543 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004544 memcpy(outstart + *outpos, repchars, repsize);
4545 *outpos += repsize;
4546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 }
Georg Brandl9f167602006-06-04 21:46:16 +00004548 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004549 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550}
4551
4552/* handle an error in PyUnicode_EncodeCharmap
4553 Return 0 on success, -1 on error */
4554static
4555int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004558 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004559 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560{
4561 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004562 Py_ssize_t repsize;
4563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 Py_UNICODE *uni2;
4565 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004566 Py_ssize_t collstartpos = *inpos;
4567 Py_ssize_t collendpos = *inpos+1;
4568 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 char *encoding = "charmap";
4570 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004571 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 /* find all unencodable characters */
4574 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004575 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004576 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004577 int res = encoding_map_lookup(p[collendpos], mapping);
4578 if (res != -1)
4579 break;
4580 ++collendpos;
4581 continue;
4582 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004583
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004584 rep = charmapencode_lookup(p[collendpos], mapping);
4585 if (rep==NULL)
4586 return -1;
4587 else if (rep!=Py_None) {
4588 Py_DECREF(rep);
4589 break;
4590 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004591 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004592 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 }
4594 /* cache callback name lookup
4595 * (if not done yet, i.e. it's the first error) */
4596 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004597 if ((errors==NULL) || (!strcmp(errors, "strict")))
4598 *known_errorHandler = 1;
4599 else if (!strcmp(errors, "replace"))
4600 *known_errorHandler = 2;
4601 else if (!strcmp(errors, "ignore"))
4602 *known_errorHandler = 3;
4603 else if (!strcmp(errors, "xmlcharrefreplace"))
4604 *known_errorHandler = 4;
4605 else
4606 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 }
4608 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004609 case 1: /* strict */
4610 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4611 return -1;
4612 case 2: /* replace */
4613 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004614 x = charmapencode_output('?', mapping, res, respos);
4615 if (x==enc_EXCEPTION) {
4616 return -1;
4617 }
4618 else if (x==enc_FAILED) {
4619 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4620 return -1;
4621 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004622 }
4623 /* fall through */
4624 case 3: /* ignore */
4625 *inpos = collendpos;
4626 break;
4627 case 4: /* xmlcharrefreplace */
4628 /* generate replacement (temporarily (mis)uses p) */
4629 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004630 char buffer[2+29+1+1];
4631 char *cp;
4632 sprintf(buffer, "&#%d;", (int)p[collpos]);
4633 for (cp = buffer; *cp; ++cp) {
4634 x = charmapencode_output(*cp, mapping, res, respos);
4635 if (x==enc_EXCEPTION)
4636 return -1;
4637 else if (x==enc_FAILED) {
4638 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4639 return -1;
4640 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004641 }
4642 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004643 *inpos = collendpos;
4644 break;
4645 default:
4646 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004647 encoding, reason, p, size, exceptionObject,
4648 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004649 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004650 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004651 /* generate replacement */
4652 repsize = PyUnicode_GET_SIZE(repunicode);
4653 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004654 x = charmapencode_output(*uni2, mapping, res, respos);
4655 if (x==enc_EXCEPTION) {
4656 return -1;
4657 }
4658 else if (x==enc_FAILED) {
4659 Py_DECREF(repunicode);
4660 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4661 return -1;
4662 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004663 }
4664 *inpos = newpos;
4665 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 }
4667 return 0;
4668}
4669
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004671 Py_ssize_t size,
4672 PyObject *mapping,
4673 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 /* output object */
4676 PyObject *res = NULL;
4677 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004678 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004680 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 PyObject *errorHandler = NULL;
4682 PyObject *exc = NULL;
4683 /* the following variable is used for caching string comparisons
4684 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4685 * 3=ignore, 4=xmlcharrefreplace */
4686 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687
4688 /* Default to Latin-1 */
4689 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004690 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 /* allocate enough for a simple encoding without
4693 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004694 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 if (res == NULL)
4696 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004697 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004701 /* try to encode it */
4702 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4703 if (x==enc_EXCEPTION) /* error */
4704 goto onError;
4705 if (x==enc_FAILED) { /* unencodable character */
4706 if (charmap_encoding_error(p, size, &inpos, mapping,
4707 &exc,
4708 &known_errorHandler, &errorHandler, errors,
4709 &res, &respos)) {
4710 goto onError;
4711 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004712 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004713 else
4714 /* done with this character => adjust input position */
4715 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004719 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004720 if (_PyString_Resize(&res, respos))
4721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 }
4723 Py_XDECREF(exc);
4724 Py_XDECREF(errorHandler);
4725 return res;
4726
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004727 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(res);
4729 Py_XDECREF(exc);
4730 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 return NULL;
4732}
4733
4734PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004735 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736{
4737 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004738 PyErr_BadArgument();
4739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 }
4741 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004742 PyUnicode_GET_SIZE(unicode),
4743 mapping,
4744 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745}
4746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747/* create or adjust a UnicodeTranslateError */
4748static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004749 const Py_UNICODE *unicode, Py_ssize_t size,
4750 Py_ssize_t startpos, Py_ssize_t endpos,
4751 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004754 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004755 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 }
4757 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004758 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4759 goto onError;
4760 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4761 goto onError;
4762 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4763 goto onError;
4764 return;
4765 onError:
4766 Py_DECREF(*exceptionObject);
4767 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 }
4769}
4770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771/* raises a UnicodeTranslateError */
4772static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004773 const Py_UNICODE *unicode, Py_ssize_t size,
4774 Py_ssize_t startpos, Py_ssize_t endpos,
4775 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776{
4777 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004778 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004780 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781}
4782
4783/* error handling callback helper:
4784 build arguments, call the callback and check the arguments,
4785 put the result into newpos and return the replacement string, which
4786 has to be freed by the caller */
4787static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 PyObject **errorHandler,
4789 const char *reason,
4790 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4791 Py_ssize_t startpos, Py_ssize_t endpos,
4792 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004794 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795
Martin v. Löwis412fb672006-04-13 06:34:32 +00004796 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 PyObject *restuple;
4798 PyObject *resunicode;
4799
4800 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004801 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004803 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 }
4805
4806 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004807 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004809 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810
4811 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004812 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004816 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004817 Py_DECREF(restuple);
4818 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819 }
4820 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004821 &resunicode, &i_newpos)) {
4822 Py_DECREF(restuple);
4823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004825 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004826 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004827 else
4828 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004829 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004830 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4831 Py_DECREF(restuple);
4832 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004833 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 Py_INCREF(resunicode);
4835 Py_DECREF(restuple);
4836 return resunicode;
4837}
4838
4839/* Lookup the character ch in the mapping and put the result in result,
4840 which must be decrefed by the caller.
4841 Return 0 on success, -1 on error */
4842static
4843int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4844{
4845 PyObject *w = PyInt_FromLong((long)c);
4846 PyObject *x;
4847
4848 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004849 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 x = PyObject_GetItem(mapping, w);
4851 Py_DECREF(w);
4852 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004853 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4854 /* No mapping found means: use 1:1 mapping. */
4855 PyErr_Clear();
4856 *result = NULL;
4857 return 0;
4858 } else
4859 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 }
4861 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004862 *result = x;
4863 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 }
4865 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004866 long value = PyInt_AS_LONG(x);
4867 long max = PyUnicode_GetMax();
4868 if (value < 0 || value > max) {
4869 PyErr_Format(PyExc_TypeError,
4870 "character mapping must be in range(0x%lx)", max+1);
4871 Py_DECREF(x);
4872 return -1;
4873 }
4874 *result = x;
4875 return 0;
4876 }
4877 else if (PyUnicode_Check(x)) {
4878 *result = x;
4879 return 0;
4880 }
4881 else {
4882 /* wrong return value */
4883 PyErr_SetString(PyExc_TypeError,
4884 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004885 Py_DECREF(x);
4886 return -1;
4887 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888}
4889/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004890 if not reallocate and adjust various state variables.
4891 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892static
Walter Dörwald4894c302003-10-24 14:25:28 +00004893int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004894 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004897 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004898 /* remember old output position */
4899 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4900 /* exponentially overallocate to minimize reallocations */
4901 if (requiredsize < 2 * oldsize)
4902 requiredsize = 2 * oldsize;
4903 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4904 return -1;
4905 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 }
4907 return 0;
4908}
4909/* lookup the character, put the result in the output string and adjust
4910 various state variables. Return a new reference to the object that
4911 was put in the output buffer in *result, or Py_None, if the mapping was
4912 undefined (in which case no character was written).
4913 The called must decref result.
4914 Return 0 on success, -1 on error. */
4915static
Walter Dörwald4894c302003-10-24 14:25:28 +00004916int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004917 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4918 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919{
Walter Dörwald4894c302003-10-24 14:25:28 +00004920 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004921 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004923 /* not found => default to 1:1 mapping */
4924 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 }
4926 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004927 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004929 /* no overflow check, because we know that the space is enough */
4930 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931 }
4932 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004933 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4934 if (repsize==1) {
4935 /* no overflow check, because we know that the space is enough */
4936 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4937 }
4938 else if (repsize!=0) {
4939 /* more than one character */
4940 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4941 (insize - (curinp-startinp)) +
4942 repsize - 1;
4943 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4944 return -1;
4945 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4946 *outp += repsize;
4947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 }
4949 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004950 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 return 0;
4952}
4953
4954PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 Py_ssize_t size,
4956 PyObject *mapping,
4957 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 /* output object */
4960 PyObject *res = NULL;
4961 /* pointers to the beginning and end+1 of input */
4962 const Py_UNICODE *startp = p;
4963 const Py_UNICODE *endp = p + size;
4964 /* pointer into the output */
4965 Py_UNICODE *str;
4966 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004967 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004968 char *reason = "character maps to <undefined>";
4969 PyObject *errorHandler = NULL;
4970 PyObject *exc = NULL;
4971 /* the following variable is used for caching string comparisons
4972 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4973 * 3=ignore, 4=xmlcharrefreplace */
4974 int known_errorHandler = -1;
4975
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004977 PyErr_BadArgument();
4978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980
4981 /* allocate enough for a simple 1:1 translation without
4982 replacements, if we need more, we'll resize */
4983 res = PyUnicode_FromUnicode(NULL, size);
4984 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004987 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004990 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004991 /* try to encode it */
4992 PyObject *x = NULL;
4993 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4994 Py_XDECREF(x);
4995 goto onError;
4996 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004997 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004998 if (x!=Py_None) /* it worked => adjust input pointer */
4999 ++p;
5000 else { /* untranslatable character */
5001 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5002 Py_ssize_t repsize;
5003 Py_ssize_t newpos;
5004 Py_UNICODE *uni2;
5005 /* startpos for collecting untranslatable chars */
5006 const Py_UNICODE *collstart = p;
5007 const Py_UNICODE *collend = p+1;
5008 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005010 /* find all untranslatable characters */
5011 while (collend < endp) {
5012 if (charmaptranslate_lookup(*collend, mapping, &x))
5013 goto onError;
5014 Py_XDECREF(x);
5015 if (x!=Py_None)
5016 break;
5017 ++collend;
5018 }
5019 /* cache callback name lookup
5020 * (if not done yet, i.e. it's the first error) */
5021 if (known_errorHandler==-1) {
5022 if ((errors==NULL) || (!strcmp(errors, "strict")))
5023 known_errorHandler = 1;
5024 else if (!strcmp(errors, "replace"))
5025 known_errorHandler = 2;
5026 else if (!strcmp(errors, "ignore"))
5027 known_errorHandler = 3;
5028 else if (!strcmp(errors, "xmlcharrefreplace"))
5029 known_errorHandler = 4;
5030 else
5031 known_errorHandler = 0;
5032 }
5033 switch (known_errorHandler) {
5034 case 1: /* strict */
5035 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005036 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005037 case 2: /* replace */
5038 /* No need to check for space, this is a 1:1 replacement */
5039 for (coll = collstart; coll<collend; ++coll)
5040 *str++ = '?';
5041 /* fall through */
5042 case 3: /* ignore */
5043 p = collend;
5044 break;
5045 case 4: /* xmlcharrefreplace */
5046 /* generate replacement (temporarily (mis)uses p) */
5047 for (p = collstart; p < collend; ++p) {
5048 char buffer[2+29+1+1];
5049 char *cp;
5050 sprintf(buffer, "&#%d;", (int)*p);
5051 if (charmaptranslate_makespace(&res, &str,
5052 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5053 goto onError;
5054 for (cp = buffer; *cp; ++cp)
5055 *str++ = *cp;
5056 }
5057 p = collend;
5058 break;
5059 default:
5060 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5061 reason, startp, size, &exc,
5062 collstart-startp, collend-startp, &newpos);
5063 if (repunicode == NULL)
5064 goto onError;
5065 /* generate replacement */
5066 repsize = PyUnicode_GET_SIZE(repunicode);
5067 if (charmaptranslate_makespace(&res, &str,
5068 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5069 Py_DECREF(repunicode);
5070 goto onError;
5071 }
5072 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5073 *str++ = *uni2;
5074 p = startp + newpos;
5075 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005076 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005077 }
5078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 /* Resize if we allocated to much */
5080 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005081 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005082 if (PyUnicode_Resize(&res, respos) < 0)
5083 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 }
5085 Py_XDECREF(exc);
5086 Py_XDECREF(errorHandler);
5087 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005089 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 Py_XDECREF(res);
5091 Py_XDECREF(exc);
5092 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 return NULL;
5094}
5095
5096PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005097 PyObject *mapping,
5098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099{
5100 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005101
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 str = PyUnicode_FromObject(str);
5103 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005104 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005106 PyUnicode_GET_SIZE(str),
5107 mapping,
5108 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 Py_DECREF(str);
5110 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005111
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 Py_XDECREF(str);
5114 return NULL;
5115}
Tim Petersced69f82003-09-16 20:30:58 +00005116
Guido van Rossum9e896b32000-04-05 20:11:21 +00005117/* --- Decimal Encoder ---------------------------------------------------- */
5118
5119int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005120 Py_ssize_t length,
5121 char *output,
5122 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005123{
5124 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 PyObject *errorHandler = NULL;
5126 PyObject *exc = NULL;
5127 const char *encoding = "decimal";
5128 const char *reason = "invalid decimal Unicode string";
5129 /* the following variable is used for caching string comparisons
5130 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5131 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005132
5133 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005134 PyErr_BadArgument();
5135 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005136 }
5137
5138 p = s;
5139 end = s + length;
5140 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005141 register Py_UNICODE ch = *p;
5142 int decimal;
5143 PyObject *repunicode;
5144 Py_ssize_t repsize;
5145 Py_ssize_t newpos;
5146 Py_UNICODE *uni2;
5147 Py_UNICODE *collstart;
5148 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005149
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005150 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005151 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005152 ++p;
5153 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005154 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005155 decimal = Py_UNICODE_TODECIMAL(ch);
5156 if (decimal >= 0) {
5157 *output++ = '0' + decimal;
5158 ++p;
5159 continue;
5160 }
5161 if (0 < ch && ch < 256) {
5162 *output++ = (char)ch;
5163 ++p;
5164 continue;
5165 }
5166 /* All other characters are considered unencodable */
5167 collstart = p;
5168 collend = p+1;
5169 while (collend < end) {
5170 if ((0 < *collend && *collend < 256) ||
5171 !Py_UNICODE_ISSPACE(*collend) ||
5172 Py_UNICODE_TODECIMAL(*collend))
5173 break;
5174 }
5175 /* cache callback name lookup
5176 * (if not done yet, i.e. it's the first error) */
5177 if (known_errorHandler==-1) {
5178 if ((errors==NULL) || (!strcmp(errors, "strict")))
5179 known_errorHandler = 1;
5180 else if (!strcmp(errors, "replace"))
5181 known_errorHandler = 2;
5182 else if (!strcmp(errors, "ignore"))
5183 known_errorHandler = 3;
5184 else if (!strcmp(errors, "xmlcharrefreplace"))
5185 known_errorHandler = 4;
5186 else
5187 known_errorHandler = 0;
5188 }
5189 switch (known_errorHandler) {
5190 case 1: /* strict */
5191 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5192 goto onError;
5193 case 2: /* replace */
5194 for (p = collstart; p < collend; ++p)
5195 *output++ = '?';
5196 /* fall through */
5197 case 3: /* ignore */
5198 p = collend;
5199 break;
5200 case 4: /* xmlcharrefreplace */
5201 /* generate replacement (temporarily (mis)uses p) */
5202 for (p = collstart; p < collend; ++p)
5203 output += sprintf(output, "&#%d;", (int)*p);
5204 p = collend;
5205 break;
5206 default:
5207 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5208 encoding, reason, s, length, &exc,
5209 collstart-s, collend-s, &newpos);
5210 if (repunicode == NULL)
5211 goto onError;
5212 /* generate replacement */
5213 repsize = PyUnicode_GET_SIZE(repunicode);
5214 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5215 Py_UNICODE ch = *uni2;
5216 if (Py_UNICODE_ISSPACE(ch))
5217 *output++ = ' ';
5218 else {
5219 decimal = Py_UNICODE_TODECIMAL(ch);
5220 if (decimal >= 0)
5221 *output++ = '0' + decimal;
5222 else if (0 < ch && ch < 256)
5223 *output++ = (char)ch;
5224 else {
5225 Py_DECREF(repunicode);
5226 raise_encode_exception(&exc, encoding,
5227 s, length, collstart-s, collend-s, reason);
5228 goto onError;
5229 }
5230 }
5231 }
5232 p = s + newpos;
5233 Py_DECREF(repunicode);
5234 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005235 }
5236 /* 0-terminate the output string */
5237 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 Py_XDECREF(exc);
5239 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005240 return 0;
5241
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005242 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 Py_XDECREF(exc);
5244 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005245 return -1;
5246}
5247
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248/* --- Helpers ------------------------------------------------------------ */
5249
Eric Smitha9f7d622008-02-17 19:46:49 +00005250#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005251#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005252
5253#include "stringlib/count.h"
5254#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005255#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005256#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005257
/* helper macro to fixup start/end slice values

   start and end must be modifiable lvalues; len is the sequence length.
   - end greater than len is clipped to len;
   - a negative end or start is taken relative to len and clipped at 0.
   A start greater than len is left untouched.

   Wrapped in do { } while (0) so the expansion is a single statement and
   can be used safely as the body of an unbraced if/else.  Always follow
   the invocation with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005272
Martin v. Löwis18e16552006-02-15 17:27:45 +00005273Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005274 PyObject *substr,
5275 Py_ssize_t start,
5276 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005278 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005279 PyUnicodeObject* str_obj;
5280 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005281
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005282 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5283 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005284 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005285 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5286 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005287 Py_DECREF(str_obj);
5288 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 }
Tim Petersced69f82003-09-16 20:30:58 +00005290
Antoine Pitrou64672132010-01-13 07:55:48 +00005291 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005292 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005293 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5294 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005295 );
5296
5297 Py_DECREF(sub_obj);
5298 Py_DECREF(str_obj);
5299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 return result;
5301}
5302
Martin v. Löwis18e16552006-02-15 17:27:45 +00005303Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005304 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005305 Py_ssize_t start,
5306 Py_ssize_t end,
5307 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005310
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005311 str = PyUnicode_FromObject(str);
5312 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005313 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005314 sub = PyUnicode_FromObject(sub);
5315 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005316 Py_DECREF(str);
5317 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 }
Tim Petersced69f82003-09-16 20:30:58 +00005319
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005320 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005321 result = stringlib_find_slice(
5322 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5323 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5324 start, end
5325 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005326 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005327 result = stringlib_rfind_slice(
5328 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5329 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5330 start, end
5331 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005332
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005333 Py_DECREF(str);
5334 Py_DECREF(sub);
5335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 return result;
5337}
5338
Tim Petersced69f82003-09-16 20:30:58 +00005339static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005341 PyUnicodeObject *substring,
5342 Py_ssize_t start,
5343 Py_ssize_t end,
5344 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 if (substring->length == 0)
5347 return 1;
5348
Antoine Pitrou64672132010-01-13 07:55:48 +00005349 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 end -= substring->length;
5351 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005352 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353
5354 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005355 if (Py_UNICODE_MATCH(self, end, substring))
5356 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 } else {
5358 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005359 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 }
5361
5362 return 0;
5363}
5364
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005366 PyObject *substr,
5367 Py_ssize_t start,
5368 Py_ssize_t end,
5369 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005371 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 str = PyUnicode_FromObject(str);
5374 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005375 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 substr = PyUnicode_FromObject(substr);
5377 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005378 Py_DECREF(str);
5379 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 }
Tim Petersced69f82003-09-16 20:30:58 +00005381
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 (PyUnicodeObject *)substr,
5384 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 Py_DECREF(str);
5386 Py_DECREF(substr);
5387 return result;
5388}
5389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390/* Apply fixfct filter to the Unicode object self and return a
5391 reference to the modified object */
5392
Tim Petersced69f82003-09-16 20:30:58 +00005393static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005395 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
5397
5398 PyUnicodeObject *u;
5399
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005400 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005402 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005403
5404 Py_UNICODE_COPY(u->str, self->str, self->length);
5405
Tim Peters7a29bd52001-09-12 03:03:31 +00005406 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005407 /* fixfct should return TRUE if it modified the buffer. If
5408 FALSE, return a reference to the original buffer instead
5409 (to save space, not time) */
5410 Py_INCREF(self);
5411 Py_DECREF(u);
5412 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 }
5414 return (PyObject*) u;
5415}
5416
Tim Petersced69f82003-09-16 20:30:58 +00005417static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418int fixupper(PyUnicodeObject *self)
5419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005420 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 Py_UNICODE *s = self->str;
5422 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005426
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005427 ch = Py_UNICODE_TOUPPER(*s);
5428 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005430 *s = ch;
5431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 s++;
5433 }
5434
5435 return status;
5436}
5437
Tim Petersced69f82003-09-16 20:30:58 +00005438static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439int fixlower(PyUnicodeObject *self)
5440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005441 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 Py_UNICODE *s = self->str;
5443 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005447
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005448 ch = Py_UNICODE_TOLOWER(*s);
5449 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 *s = ch;
5452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 s++;
5454 }
5455
5456 return status;
5457}
5458
Tim Petersced69f82003-09-16 20:30:58 +00005459static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460int fixswapcase(PyUnicodeObject *self)
5461{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 Py_UNICODE *s = self->str;
5464 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005465
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 while (len-- > 0) {
5467 if (Py_UNICODE_ISUPPER(*s)) {
5468 *s = Py_UNICODE_TOLOWER(*s);
5469 status = 1;
5470 } else if (Py_UNICODE_ISLOWER(*s)) {
5471 *s = Py_UNICODE_TOUPPER(*s);
5472 status = 1;
5473 }
5474 s++;
5475 }
5476
5477 return status;
5478}
5479
Tim Petersced69f82003-09-16 20:30:58 +00005480static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481int fixcapitalize(PyUnicodeObject *self)
5482{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005483 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005484 Py_UNICODE *s = self->str;
5485 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005486
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005487 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005488 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005489 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005490 *s = Py_UNICODE_TOUPPER(*s);
5491 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005493 s++;
5494 while (--len > 0) {
5495 if (Py_UNICODE_ISUPPER(*s)) {
5496 *s = Py_UNICODE_TOLOWER(*s);
5497 status = 1;
5498 }
5499 s++;
5500 }
5501 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502}
5503
5504static
5505int fixtitle(PyUnicodeObject *self)
5506{
5507 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5508 register Py_UNICODE *e;
5509 int previous_is_cased;
5510
5511 /* Shortcut for single character strings */
5512 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005513 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5514 if (*p != ch) {
5515 *p = ch;
5516 return 1;
5517 }
5518 else
5519 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 }
Tim Petersced69f82003-09-16 20:30:58 +00005521
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 e = p + PyUnicode_GET_SIZE(self);
5523 previous_is_cased = 0;
5524 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005525 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005526
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005527 if (previous_is_cased)
5528 *p = Py_UNICODE_TOLOWER(ch);
5529 else
5530 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005531
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005532 if (Py_UNICODE_ISLOWER(ch) ||
5533 Py_UNICODE_ISUPPER(ch) ||
5534 Py_UNICODE_ISTITLE(ch))
5535 previous_is_cased = 1;
5536 else
5537 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 }
5539 return 1;
5540}
5541
Tim Peters8ce9f162004-08-27 01:49:32 +00005542PyObject *
5543PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544{
Tim Peters8ce9f162004-08-27 01:49:32 +00005545 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005546 const Py_UNICODE blank = ' ';
5547 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005548 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005549 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005550 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5551 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005552 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5553 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005556 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
Tim Peters05eba1f2004-08-27 21:32:02 +00005558 fseq = PySequence_Fast(seq, "");
5559 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005560 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005561 }
5562
Tim Peters91879ab2004-08-27 22:35:44 +00005563 /* Grrrr. A codec may be invoked to convert str objects to
5564 * Unicode, and so it's possible to call back into Python code
5565 * during PyUnicode_FromObject(), and so it's possible for a sick
5566 * codec to change the size of fseq (if seq is a list). Therefore
5567 * we have to keep refetching the size -- can't assume seqlen
5568 * is invariant.
5569 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005570 seqlen = PySequence_Fast_GET_SIZE(fseq);
5571 /* If empty sequence, return u"". */
5572 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005573 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5574 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 }
5576 /* If singleton sequence with an exact Unicode, return that. */
5577 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005578 item = PySequence_Fast_GET_ITEM(fseq, 0);
5579 if (PyUnicode_CheckExact(item)) {
5580 Py_INCREF(item);
5581 res = (PyUnicodeObject *)item;
5582 goto Done;
5583 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005584 }
5585
Tim Peters05eba1f2004-08-27 21:32:02 +00005586 /* At least two items to join, or one that isn't exact Unicode. */
5587 if (seqlen > 1) {
5588 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005589 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005590 sep = &blank;
5591 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005592 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005593 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005594 internal_separator = PyUnicode_FromObject(separator);
5595 if (internal_separator == NULL)
5596 goto onError;
5597 sep = PyUnicode_AS_UNICODE(internal_separator);
5598 seplen = PyUnicode_GET_SIZE(internal_separator);
5599 /* In case PyUnicode_FromObject() mutated seq. */
5600 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005601 }
5602 }
5603
5604 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005605 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005607 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005608 res_p = PyUnicode_AS_UNICODE(res);
5609 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005610
Tim Peters05eba1f2004-08-27 21:32:02 +00005611 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005612 Py_ssize_t itemlen;
5613 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005614
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005615 item = PySequence_Fast_GET_ITEM(fseq, i);
5616 /* Convert item to Unicode. */
5617 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5618 PyErr_Format(PyExc_TypeError,
5619 "sequence item %zd: expected string or Unicode,"
5620 " %.80s found",
5621 i, Py_TYPE(item)->tp_name);
5622 goto onError;
5623 }
5624 item = PyUnicode_FromObject(item);
5625 if (item == NULL)
5626 goto onError;
5627 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005628
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005629 /* In case PyUnicode_FromObject() mutated seq. */
5630 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005631
Tim Peters8ce9f162004-08-27 01:49:32 +00005632 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005633 itemlen = PyUnicode_GET_SIZE(item);
5634 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005635 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005636 goto Overflow;
5637 if (i < seqlen - 1) {
5638 new_res_used += seplen;
5639 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005640 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005641 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005642 if (new_res_used > res_alloc) {
5643 /* double allocated size until it's big enough */
5644 do {
5645 res_alloc += res_alloc;
5646 if (res_alloc <= 0)
5647 goto Overflow;
5648 } while (new_res_used > res_alloc);
5649 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5650 Py_DECREF(item);
5651 goto onError;
5652 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005653 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005654 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005655
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005656 /* Copy item, and maybe the separator. */
5657 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5658 res_p += itemlen;
5659 if (i < seqlen - 1) {
5660 Py_UNICODE_COPY(res_p, sep, seplen);
5661 res_p += seplen;
5662 }
5663 Py_DECREF(item);
5664 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005665 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005666
Tim Peters05eba1f2004-08-27 21:32:02 +00005667 /* Shrink res to match the used area; this probably can't fail,
5668 * but it's cheap to check.
5669 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005670 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005671 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005672
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005673 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005674 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005675 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 return (PyObject *)res;
5677
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005678 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005680 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005681 Py_DECREF(item);
5682 /* fall through */
5683
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005684 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005686 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005687 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 return NULL;
5689}
5690
Tim Petersced69f82003-09-16 20:30:58 +00005691static
5692PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005693 Py_ssize_t left,
5694 Py_ssize_t right,
5695 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696{
5697 PyUnicodeObject *u;
5698
5699 if (left < 0)
5700 left = 0;
5701 if (right < 0)
5702 right = 0;
5703
Tim Peters7a29bd52001-09-12 03:03:31 +00005704 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 Py_INCREF(self);
5706 return self;
5707 }
5708
Neal Norwitze7d8be82008-07-31 17:17:14 +00005709 if (left > PY_SSIZE_T_MAX - self->length ||
5710 right > PY_SSIZE_T_MAX - (left + self->length)) {
5711 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5712 return NULL;
5713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 u = _PyUnicode_New(left + self->length + right);
5715 if (u) {
5716 if (left)
5717 Py_UNICODE_FILL(u->str, fill, left);
5718 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5719 if (right)
5720 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5721 }
5722
5723 return u;
5724}
5725
Antoine Pitrou64672132010-01-13 07:55:48 +00005726PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
5730 string = PyUnicode_FromObject(string);
5731 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Antoine Pitrou64672132010-01-13 07:55:48 +00005734 list = stringlib_splitlines(
5735 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5736 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
5738 Py_DECREF(string);
5739 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740}
5741
Tim Petersced69f82003-09-16 20:30:58 +00005742static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005744 PyUnicodeObject *substring,
5745 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005748 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005751 return stringlib_split_whitespace(
5752 (PyObject*) self, self->str, self->length, maxcount
5753 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Antoine Pitrou64672132010-01-13 07:55:48 +00005755 return stringlib_split(
5756 (PyObject*) self, self->str, self->length,
5757 substring->str, substring->length,
5758 maxcount
5759 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760}
5761
Tim Petersced69f82003-09-16 20:30:58 +00005762static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005763PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005764 PyUnicodeObject *substring,
5765 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005766{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005768 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005769
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005770 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005771 return stringlib_rsplit_whitespace(
5772 (PyObject*) self, self->str, self->length, maxcount
5773 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774
Antoine Pitrou64672132010-01-13 07:55:48 +00005775 return stringlib_rsplit(
5776 (PyObject*) self, self->str, self->length,
5777 substring->str, substring->length,
5778 maxcount
5779 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005780}
5781
5782static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005784 PyUnicodeObject *str1,
5785 PyUnicodeObject *str2,
5786 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787{
5788 PyUnicodeObject *u;
5789
5790 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005791 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005792 else if (maxcount == 0 || self->length == 0)
5793 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794
Fredrik Lundh347ee272006-05-24 16:35:18 +00005795 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005796 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005797 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005798 if (str1->length == 0)
5799 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005800 if (str1->length == 1) {
5801 /* replace characters */
5802 Py_UNICODE u1, u2;
5803 if (!findchar(self->str, self->length, str1->str[0]))
5804 goto nothing;
5805 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5806 if (!u)
5807 return NULL;
5808 Py_UNICODE_COPY(u->str, self->str, self->length);
5809 u1 = str1->str[0];
5810 u2 = str2->str[0];
5811 for (i = 0; i < u->length; i++)
5812 if (u->str[i] == u1) {
5813 if (--maxcount < 0)
5814 break;
5815 u->str[i] = u2;
5816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005818 i = stringlib_find(
5819 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005821 if (i < 0)
5822 goto nothing;
5823 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5824 if (!u)
5825 return NULL;
5826 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005827
5828 /* change everything in-place, starting with this one */
5829 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5830 i += str1->length;
5831
5832 while ( --maxcount > 0) {
5833 i = stringlib_find(self->str+i, self->length-i,
5834 str1->str, str1->length,
5835 i);
5836 if (i == -1)
5837 break;
5838 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5839 i += str1->length;
5840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005843
Brett Cannona7f13ee2010-05-04 01:16:51 +00005844 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005845 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 Py_UNICODE *p;
5847
5848 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005849 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5850 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005851 if (n == 0)
5852 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005853 /* new_size = self->length + n * (str2->length - str1->length)); */
5854 delta = (str2->length - str1->length);
5855 if (delta == 0) {
5856 new_size = self->length;
5857 } else {
5858 product = n * (str2->length - str1->length);
5859 if ((product / (str2->length - str1->length)) != n) {
5860 PyErr_SetString(PyExc_OverflowError,
5861 "replace string is too long");
5862 return NULL;
5863 }
5864 new_size = self->length + product;
5865 if (new_size < 0) {
5866 PyErr_SetString(PyExc_OverflowError,
5867 "replace string is too long");
5868 return NULL;
5869 }
5870 }
5871 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005872 if (!u)
5873 return NULL;
5874 i = 0;
5875 p = u->str;
5876 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005877 while (n-- > 0) {
5878 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005879 j = stringlib_find(self->str+i, self->length-i,
5880 str1->str, str1->length,
5881 i);
5882 if (j == -1)
5883 break;
5884 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005885 /* copy unchanged part [i:j] */
5886 Py_UNICODE_COPY(p, self->str+i, j-i);
5887 p += j - i;
5888 }
5889 /* copy substitution string */
5890 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005891 Py_UNICODE_COPY(p, str2->str, str2->length);
5892 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005893 }
5894 i = j + str1->length;
5895 }
5896 if (i < self->length)
5897 /* copy tail [i:] */
5898 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005899 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005900 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005901 while (n > 0) {
5902 Py_UNICODE_COPY(p, str2->str, str2->length);
5903 p += str2->length;
5904 if (--n <= 0)
5905 break;
5906 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005908 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 }
5910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005912
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005913 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005914 /* nothing to replace; return original string (when possible) */
5915 if (PyUnicode_CheckExact(self)) {
5916 Py_INCREF(self);
5917 return (PyObject *) self;
5918 }
5919 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920}
5921
5922/* --- Unicode Object Methods --------------------------------------------- */
5923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005924PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005925 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926\n\
5927Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005928characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929
5930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005931unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 return fixup(self, fixtitle);
5934}
5935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005936PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005937 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938\n\
5939Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005940have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941
5942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005943unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 return fixup(self, fixcapitalize);
5946}
5947
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of unicode.capwords().

   Splits self with split(self, NULL, -1), replaces every list item by
   its fixup(..., fixcapitalize) result and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL as soon as one of the steps
   fails; the temporary list is released on every path. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *previous = PyList_GET_ITEM(words, pos);
        PyObject *capped = fixup((PyUnicodeObject *)previous, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* the list slot owned `previous`; drop it before overwriting */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, pos, capped);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5985
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005986/* Argument converter. Coerces to a single unicode character */
5987
5988static int
5989convert_uc(PyObject *obj, void *addr)
5990{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005991 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5992 PyObject *uniobj;
5993 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005994
Benjamin Peterson857ce152009-01-31 16:29:18 +00005995 uniobj = PyUnicode_FromObject(obj);
5996 if (uniobj == NULL) {
5997 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005998 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00005999 return 0;
6000 }
6001 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6002 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006003 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006004 Py_DECREF(uniobj);
6005 return 0;
6006 }
6007 unistr = PyUnicode_AS_UNICODE(uniobj);
6008 *fillcharloc = unistr[0];
6009 Py_DECREF(uniobj);
6010 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006011}
6012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006013PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006014 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006016Return S centered in a Unicode string of length width. Padding is\n\
6017done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018
6019static PyObject *
6020unicode_center(PyUnicodeObject *self, PyObject *args)
6021{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t marg, left;
6023 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006024 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Thomas Woutersde017742006-02-16 19:34:37 +00006026 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 return NULL;
6028
Tim Peters7a29bd52001-09-12 03:03:31 +00006029 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 Py_INCREF(self);
6031 return (PyObject*) self;
6032 }
6033
6034 marg = width - self->length;
6035 left = marg / 2 + (marg & width & 1);
6036
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038}
6039
Marc-André Lemburge5034372000-08-08 08:04:29 +00006040#if 0
6041
6042/* This code should go into some future Unicode collation support
6043 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006044 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006045
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006046/* speedy UTF-16 code point order comparison */
6047/* gleaned from: */
6048/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6049
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006050static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006051{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006052 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006053 0, 0, 0, 0, 0, 0, 0, 0,
6054 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006055 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006056};
6057
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058static int
6059unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6060{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006061 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 Py_UNICODE *s1 = str1->str;
6064 Py_UNICODE *s2 = str2->str;
6065
6066 len1 = str1->length;
6067 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006070 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006071
6072 c1 = *s1++;
6073 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006074
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006075 if (c1 > (1<<11) * 26)
6076 c1 += utf16Fixup[c1>>11];
6077 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006079 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006080
6081 if (c1 != c2)
6082 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006083
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 }
6086
6087 return (len1 < len2) ? -1 : (len1 != len2);
6088}
6089
Marc-André Lemburge5034372000-08-08 08:04:29 +00006090#else
6091
6092static int
6093unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6094{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006095 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006096
6097 Py_UNICODE *s1 = str1->str;
6098 Py_UNICODE *s2 = str2->str;
6099
6100 len1 = str1->length;
6101 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006102
Marc-André Lemburge5034372000-08-08 08:04:29 +00006103 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006104 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006105
Fredrik Lundh45714e92001-06-26 16:39:36 +00006106 c1 = *s1++;
6107 c2 = *s2++;
6108
6109 if (c1 != c2)
6110 return (c1 < c2) ? -1 : 1;
6111
Marc-André Lemburge5034372000-08-08 08:04:29 +00006112 len1--; len2--;
6113 }
6114
6115 return (len1 < len2) ? -1 : (len1 != len2);
6116}
6117
6118#endif
6119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006121 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122{
6123 PyUnicodeObject *u = NULL, *v = NULL;
6124 int result;
6125
6126 /* Coerce the two arguments */
6127 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6128 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006129 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6131 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006132 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Thomas Wouters7e474022000-07-16 12:04:32 +00006134 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006136 Py_DECREF(u);
6137 Py_DECREF(v);
6138 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 }
6140
6141 result = unicode_compare(u, v);
6142
6143 Py_DECREF(u);
6144 Py_DECREF(v);
6145 return result;
6146
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006147 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 Py_XDECREF(u);
6149 Py_XDECREF(v);
6150 return -1;
6151}
6152
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006153PyObject *PyUnicode_RichCompare(PyObject *left,
6154 PyObject *right,
6155 int op)
6156{
6157 int result;
6158
6159 result = PyUnicode_Compare(left, right);
6160 if (result == -1 && PyErr_Occurred())
6161 goto onError;
6162
6163 /* Convert the return value to a Boolean */
6164 switch (op) {
6165 case Py_EQ:
6166 result = (result == 0);
6167 break;
6168 case Py_NE:
6169 result = (result != 0);
6170 break;
6171 case Py_LE:
6172 result = (result <= 0);
6173 break;
6174 case Py_GE:
6175 result = (result >= 0);
6176 break;
6177 case Py_LT:
6178 result = (result == -1);
6179 break;
6180 case Py_GT:
6181 result = (result == 1);
6182 break;
6183 }
6184 return PyBool_FromLong(result);
6185
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006186 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006187
6188 /* Standard case
6189
6190 Type errors mean that PyUnicode_FromObject() could not convert
6191 one of the arguments (usually the right hand side) to Unicode,
6192 ie. we can't handle the comparison request. However, it is
6193 possible that the other object knows a comparison method, which
6194 is why we return Py_NotImplemented to give the other object a
6195 chance.
6196
6197 */
6198 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6199 PyErr_Clear();
6200 Py_INCREF(Py_NotImplemented);
6201 return Py_NotImplemented;
6202 }
6203 if (op != Py_EQ && op != Py_NE)
6204 return NULL;
6205
6206 /* Equality comparison.
6207
6208 This is a special case: we silence any PyExc_UnicodeDecodeError
6209 and instead turn it into a PyErr_UnicodeWarning.
6210
6211 */
6212 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6213 return NULL;
6214 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006215 if (PyErr_Warn(PyExc_UnicodeWarning,
6216 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006217 "Unicode equal comparison "
6218 "failed to convert both arguments to Unicode - "
6219 "interpreting them as being unequal" :
6220 "Unicode unequal comparison "
6221 "failed to convert both arguments to Unicode - "
6222 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006223 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006224 return NULL;
6225 result = (op == Py_NE);
6226 return PyBool_FromLong(result);
6227}
6228
Guido van Rossum403d68b2000-03-13 15:55:09 +00006229int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006230 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006231{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006232 PyObject *str, *sub;
6233 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006234
6235 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006236 sub = PyUnicode_FromObject(element);
6237 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006238 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006239 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006240
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006241 str = PyUnicode_FromObject(container);
6242 if (!str) {
6243 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006244 return -1;
6245 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006246
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006247 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006248
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006249 Py_DECREF(str);
6250 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006251
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006252 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006253}
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255/* Concat to string or Unicode object giving a new Unicode object. */
6256
6257PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006258 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259{
6260 PyUnicodeObject *u = NULL, *v = NULL, *w;
6261
6262 /* Coerce the two arguments */
6263 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6264 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6267 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006268 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
6270 /* Shortcuts */
6271 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006272 Py_DECREF(v);
6273 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 }
6275 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006276 Py_DECREF(u);
6277 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 }
6279
6280 /* Concat the two Unicode strings */
6281 w = _PyUnicode_New(u->length + v->length);
6282 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006283 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 Py_UNICODE_COPY(w->str, u->str, u->length);
6285 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6286
6287 Py_DECREF(u);
6288 Py_DECREF(v);
6289 return (PyObject *)w;
6290
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006291 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 Py_XDECREF(u);
6293 Py_XDECREF(v);
6294 return NULL;
6295}
6296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006297PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006298 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006300Return the number of non-overlapping occurrences of substring sub in\n\
6301Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006302interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303
6304static PyObject *
6305unicode_count(PyUnicodeObject *self, PyObject *args)
6306{
6307 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006308 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006309 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 PyObject *result;
6311
Guido van Rossumb8872e62000-05-09 14:14:27 +00006312 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006313 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 return NULL;
6315
6316 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006317 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006319 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Antoine Pitrou64672132010-01-13 07:55:48 +00006321 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006322 result = PyInt_FromSsize_t(
6323 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006324 substring->str, substring->length,
6325 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006326 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
6328 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006329
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 return result;
6331}
6332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006333PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006334 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006336Encodes S using the codec registered for encoding. encoding defaults\n\
6337to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006338handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6340'xmlcharrefreplace' as well as any other name registered with\n\
6341codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006344unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006346 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 char *encoding = NULL;
6348 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006349 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006350
Benjamin Peterson332d7212009-09-18 21:14:55 +00006351 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6352 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006354 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006355 if (v == NULL)
6356 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006357 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006358 PyErr_Format(PyExc_TypeError,
6359 "encoder did not return a string/unicode object "
6360 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006361 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006362 Py_DECREF(v);
6363 return NULL;
6364 }
6365 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006366
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006367 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006368 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006369}
6370
6371PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006372 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006373\n\
6374Decodes S using the codec registered for encoding. encoding defaults\n\
6375to the default encoding. errors may be given to set a different error\n\
6376handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6377a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6378as well as any other name registerd with codecs.register_error that is\n\
6379able to handle UnicodeDecodeErrors.");
6380
6381static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006382unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006383{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006384 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006385 char *encoding = NULL;
6386 char *errors = NULL;
6387 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006388
Benjamin Peterson332d7212009-09-18 21:14:55 +00006389 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6390 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006391 return NULL;
6392 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006393 if (v == NULL)
6394 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006395 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396 PyErr_Format(PyExc_TypeError,
6397 "decoder did not return a string/unicode object "
6398 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006399 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006400 Py_DECREF(v);
6401 return NULL;
6402 }
6403 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006404
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006405 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407}
6408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006409PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006410 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411\n\
6412Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006413If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
6415static PyObject*
6416unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6417{
6418 Py_UNICODE *e;
6419 Py_UNICODE *p;
6420 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006421 Py_UNICODE *qe;
6422 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 PyUnicodeObject *u;
6424 int tabsize = 8;
6425
6426 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
Thomas Wouters7e474022000-07-16 12:04:32 +00006429 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006430 i = 0; /* chars up to and including most recent \n or \r */
6431 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6432 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 for (p = self->str; p < e; p++)
6434 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006435 if (tabsize > 0) {
6436 incr = tabsize - (j % tabsize); /* cannot overflow */
6437 if (j > PY_SSIZE_T_MAX - incr)
6438 goto overflow1;
6439 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006440 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006443 if (j > PY_SSIZE_T_MAX - 1)
6444 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 j++;
6446 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006447 if (i > PY_SSIZE_T_MAX - j)
6448 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006450 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
6452 }
6453
Guido van Rossum5bdff602008-03-11 21:18:06 +00006454 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006455 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 /* Second pass: create output string and fill it */
6458 u = _PyUnicode_New(i + j);
6459 if (!u)
6460 return NULL;
6461
Guido van Rossum5bdff602008-03-11 21:18:06 +00006462 j = 0; /* same as in first pass */
6463 q = u->str; /* next output char */
6464 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
6466 for (p = self->str; p < e; p++)
6467 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006468 if (tabsize > 0) {
6469 i = tabsize - (j % tabsize);
6470 j += i;
6471 while (i--) {
6472 if (q >= qe)
6473 goto overflow2;
6474 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006475 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006476 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006477 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006478 else {
6479 if (q >= qe)
6480 goto overflow2;
6481 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006482 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 if (*p == '\n' || *p == '\r')
6484 j = 0;
6485 }
6486
6487 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006488
6489 overflow2:
6490 Py_DECREF(u);
6491 overflow1:
6492 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006496PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006497 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498\n\
6499Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006500such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501arguments start and end are interpreted as in slice notation.\n\
6502\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006503Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504
6505static PyObject *
6506unicode_find(PyUnicodeObject *self, PyObject *args)
6507{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006508 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006509 Py_ssize_t start;
6510 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006511 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
Facundo Batista57d56692007-11-16 18:04:14 +00006513 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006516 result = stringlib_find_slice(
6517 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6518 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6519 start, end
6520 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006523
6524 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525}
6526
6527static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529{
6530 if (index < 0 || index >= self->length) {
6531 PyErr_SetString(PyExc_IndexError, "string index out of range");
6532 return NULL;
6533 }
6534
6535 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6536}
6537
6538static long
6539unicode_hash(PyUnicodeObject *self)
6540{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006541 /* Since Unicode objects compare equal to their ASCII string
6542 counterparts, they should use the individual character values
6543 as basis for their hash value. This is needed to assure that
6544 strings and Unicode objects behave in the same way as
6545 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
Martin v. Löwis18e16552006-02-15 17:27:45 +00006547 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006548 register Py_UNICODE *p;
6549 register long x;
6550
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006552 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006553 len = PyUnicode_GET_SIZE(self);
6554 p = PyUnicode_AS_UNICODE(self);
6555 x = *p << 7;
6556 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006557 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006558 x ^= PyUnicode_GET_SIZE(self);
6559 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006560 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006561 self->hash = x;
6562 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006565PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006566 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006568Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
6570static PyObject *
6571unicode_index(PyUnicodeObject *self, PyObject *args)
6572{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006574 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006575 Py_ssize_t start;
6576 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
Facundo Batista57d56692007-11-16 18:04:14 +00006578 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006581 result = stringlib_find_slice(
6582 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6583 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6584 start, end
6585 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 if (result < 0) {
6590 PyErr_SetString(PyExc_ValueError, "substring not found");
6591 return NULL;
6592 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006593
Martin v. Löwis18e16552006-02-15 17:27:45 +00006594 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595}
6596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006597PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006598 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006600Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006601at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
6603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006604unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
6606 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6607 register const Py_UNICODE *e;
6608 int cased;
6609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Shortcut for single character strings */
6611 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006612 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006614 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006615 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006616 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006617
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 e = p + PyUnicode_GET_SIZE(self);
6619 cased = 0;
6620 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006621 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006622
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006623 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6624 return PyBool_FromLong(0);
6625 else if (!cased && Py_UNICODE_ISLOWER(ch))
6626 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006628 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629}
6630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006631PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006632 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006634Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006635at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
6637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006638unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
6640 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6641 register const Py_UNICODE *e;
6642 int cased;
6643
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 /* Shortcut for single character strings */
6645 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006646 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006648 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006649 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006650 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006651
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 e = p + PyUnicode_GET_SIZE(self);
6653 cased = 0;
6654 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006655 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006656
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006657 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6658 return PyBool_FromLong(0);
6659 else if (!cased && Py_UNICODE_ISUPPER(ch))
6660 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006662 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663}
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006666 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006668Return True if S is a titlecased string and there is at least one\n\
6669character in S, i.e. upper- and titlecase characters may only\n\
6670follow uncased characters and lowercase characters only cased ones.\n\
6671Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006674unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675{
6676 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6677 register const Py_UNICODE *e;
6678 int cased, previous_is_cased;
6679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 /* Shortcut for single character strings */
6681 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006682 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6683 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006685 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006686 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006687 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 e = p + PyUnicode_GET_SIZE(self);
6690 cased = 0;
6691 previous_is_cased = 0;
6692 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006693 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006694
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006695 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6696 if (previous_is_cased)
6697 return PyBool_FromLong(0);
6698 previous_is_cased = 1;
6699 cased = 1;
6700 }
6701 else if (Py_UNICODE_ISLOWER(ch)) {
6702 if (!previous_is_cased)
6703 return PyBool_FromLong(0);
6704 previous_is_cased = 1;
6705 cased = 1;
6706 }
6707 else
6708 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006710 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711}
6712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006714 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006716Return True if all characters in S are whitespace\n\
6717and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718
6719static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006720unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
6722 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6723 register const Py_UNICODE *e;
6724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 /* Shortcut for single character strings */
6726 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006727 Py_UNICODE_ISSPACE(*p))
6728 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006730 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006731 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006732 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 e = p + PyUnicode_GET_SIZE(self);
6735 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006736 if (!Py_UNICODE_ISSPACE(*p))
6737 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006742PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006743 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006744\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006745Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006747
6748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006749unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006750{
6751 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6752 register const Py_UNICODE *e;
6753
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006754 /* Shortcut for single character strings */
6755 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006756 Py_UNICODE_ISALPHA(*p))
6757 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006758
6759 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006760 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006761 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006762
6763 e = p + PyUnicode_GET_SIZE(self);
6764 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006765 if (!Py_UNICODE_ISALPHA(*p))
6766 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006769}
6770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006771PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006772 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006773\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006774Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006776
6777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006778unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006779{
6780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6781 register const Py_UNICODE *e;
6782
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783 /* Shortcut for single character strings */
6784 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006785 Py_UNICODE_ISALNUM(*p))
6786 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006787
6788 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006789 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006790 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006791
6792 e = p + PyUnicode_GET_SIZE(self);
6793 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006794 if (!Py_UNICODE_ISALNUM(*p))
6795 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798}
6799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006800PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006801 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006803Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
6806static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006807unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808{
6809 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6810 register const Py_UNICODE *e;
6811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 /* Shortcut for single character strings */
6813 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006814 Py_UNICODE_ISDECIMAL(*p))
6815 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006817 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006818 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006819 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 e = p + PyUnicode_GET_SIZE(self);
6822 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006823 if (!Py_UNICODE_ISDECIMAL(*p))
6824 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827}
6828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006829PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006830 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006832Return True if all characters in S are digits\n\
6833and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834
6835static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006836unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837{
6838 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6839 register const Py_UNICODE *e;
6840
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 /* Shortcut for single character strings */
6842 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006843 Py_UNICODE_ISDIGIT(*p))
6844 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006846 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006847 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006848 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006849
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 e = p + PyUnicode_GET_SIZE(self);
6851 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006852 if (!Py_UNICODE_ISDIGIT(*p))
6853 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856}
6857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006859 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
6864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006865unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
6867 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6868 register const Py_UNICODE *e;
6869
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 /* Shortcut for single character strings */
6871 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006872 Py_UNICODE_ISNUMERIC(*p))
6873 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006875 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006876 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006877 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 e = p + PyUnicode_GET_SIZE(self);
6880 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006881 if (!Py_UNICODE_ISNUMERIC(*p))
6882 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885}
6886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006888 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889\n\
6890Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006891iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
6893static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006894unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006896 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900unicode_length(PyUnicodeObject *self)
6901{
6902 return self->length;
6903}
6904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006905PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006906 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006908Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006909done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910
6911static PyObject *
6912unicode_ljust(PyUnicodeObject *self, PyObject *args)
6913{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006914 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006915 Py_UNICODE fillchar = ' ';
6916
Martin v. Löwis412fb672006-04-13 06:34:32 +00006917 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 return NULL;
6919
Tim Peters7a29bd52001-09-12 03:03:31 +00006920 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 Py_INCREF(self);
6922 return (PyObject*) self;
6923 }
6924
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006925 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926}
6927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006928PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006929 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932
6933static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006934unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 return fixup(self, fixlower);
6937}
6938
/* Strip modes; the values double as indices into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6947
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006948/* externally visible for str.strip(unicode) */
6949PyObject *
6950_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6951{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006952 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6953 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6954 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6955 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6956 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006957
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006958 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006959
Benjamin Peterson857ce152009-01-31 16:29:18 +00006960 i = 0;
6961 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006962 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6963 i++;
6964 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006965 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006966
Benjamin Peterson857ce152009-01-31 16:29:18 +00006967 j = len;
6968 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006969 do {
6970 j--;
6971 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6972 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006973 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006974
Benjamin Peterson857ce152009-01-31 16:29:18 +00006975 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006976 Py_INCREF(self);
6977 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006978 }
6979 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006980 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006981}
6982
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
6984static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006985do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006987 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6988 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006989
Benjamin Peterson857ce152009-01-31 16:29:18 +00006990 i = 0;
6991 if (striptype != RIGHTSTRIP) {
6992 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6993 i++;
6994 }
6995 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006996
Benjamin Peterson857ce152009-01-31 16:29:18 +00006997 j = len;
6998 if (striptype != LEFTSTRIP) {
6999 do {
7000 j--;
7001 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7002 j++;
7003 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007004
Benjamin Peterson857ce152009-01-31 16:29:18 +00007005 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7006 Py_INCREF(self);
7007 return (PyObject*)self;
7008 }
7009 else
7010 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011}
7012
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007013
7014static PyObject *
7015do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7016{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007017 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007018
Benjamin Peterson857ce152009-01-31 16:29:18 +00007019 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7020 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007021
Benjamin Peterson857ce152009-01-31 16:29:18 +00007022 if (sep != NULL && sep != Py_None) {
7023 if (PyUnicode_Check(sep))
7024 return _PyUnicode_XStrip(self, striptype, sep);
7025 else if (PyString_Check(sep)) {
7026 PyObject *res;
7027 sep = PyUnicode_FromObject(sep);
7028 if (sep==NULL)
7029 return NULL;
7030 res = _PyUnicode_XStrip(self, striptype, sep);
7031 Py_DECREF(sep);
7032 return res;
7033 }
7034 else {
7035 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007036 "%s arg must be None, unicode or str",
7037 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007038 return NULL;
7039 }
7040 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041
Benjamin Peterson857ce152009-01-31 16:29:18 +00007042 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007043}
7044
7045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007046PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007047 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007048\n\
7049Return a copy of the string S with leading and trailing\n\
7050whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007051If chars is given and not None, remove characters in chars instead.\n\
7052If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053
7054static PyObject *
7055unicode_strip(PyUnicodeObject *self, PyObject *args)
7056{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007057 if (PyTuple_GET_SIZE(args) == 0)
7058 return do_strip(self, BOTHSTRIP); /* Common case */
7059 else
7060 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007061}
7062
7063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007064PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007065 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066\n\
7067Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007068If chars is given and not None, remove characters in chars instead.\n\
7069If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007070
7071static PyObject *
7072unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7073{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007074 if (PyTuple_GET_SIZE(args) == 0)
7075 return do_strip(self, LEFTSTRIP); /* Common case */
7076 else
7077 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078}
7079
7080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007082 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083\n\
7084Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007085If chars is given and not None, remove characters in chars instead.\n\
7086If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087
7088static PyObject *
7089unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7090{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007091 if (PyTuple_GET_SIZE(args) == 0)
7092 return do_strip(self, RIGHTSTRIP); /* Common case */
7093 else
7094 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007095}
7096
7097
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007099unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100{
7101 PyUnicodeObject *u;
7102 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007103 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007104 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106 if (len < 0)
7107 len = 0;
7108
Tim Peters7a29bd52001-09-12 03:03:31 +00007109 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 /* no repeat, return original string */
7111 Py_INCREF(str);
7112 return (PyObject*) str;
7113 }
Tim Peters8f422462000-09-09 06:13:41 +00007114
7115 /* ensure # of chars needed doesn't overflow int and # of bytes
7116 * needed doesn't overflow size_t
7117 */
7118 nchars = len * str->length;
7119 if (len && nchars / len != str->length) {
7120 PyErr_SetString(PyExc_OverflowError,
7121 "repeated string is too long");
7122 return NULL;
7123 }
7124 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7125 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7126 PyErr_SetString(PyExc_OverflowError,
7127 "repeated string is too long");
7128 return NULL;
7129 }
7130 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 if (!u)
7132 return NULL;
7133
7134 p = u->str;
7135
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007136 if (str->length == 1 && len > 0) {
7137 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007138 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007139 Py_ssize_t done = 0; /* number of characters copied this far */
7140 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007141 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007142 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007143 }
7144 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007145 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007146 Py_UNICODE_COPY(p+done, p, n);
7147 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007148 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151 return (PyObject*) u;
7152}
7153
7154PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007155 PyObject *subobj,
7156 PyObject *replobj,
7157 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158{
7159 PyObject *self;
7160 PyObject *str1;
7161 PyObject *str2;
7162 PyObject *result;
7163
7164 self = PyUnicode_FromObject(obj);
7165 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 str1 = PyUnicode_FromObject(subobj);
7168 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007169 Py_DECREF(self);
7170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 }
7172 str2 = PyUnicode_FromObject(replobj);
7173 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007174 Py_DECREF(self);
7175 Py_DECREF(str1);
7176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 }
Tim Petersced69f82003-09-16 20:30:58 +00007178 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007179 (PyUnicodeObject *)str1,
7180 (PyUnicodeObject *)str2,
7181 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 Py_DECREF(self);
7183 Py_DECREF(str1);
7184 Py_DECREF(str2);
7185 return result;
7186}
7187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007188PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007189 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190\n\
7191Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007192old replaced by new. If the optional argument count is\n\
7193given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194
7195static PyObject*
7196unicode_replace(PyUnicodeObject *self, PyObject *args)
7197{
7198 PyUnicodeObject *str1;
7199 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 PyObject *result;
7202
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 return NULL;
7205 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7206 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007209 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007210 Py_DECREF(str1);
7211 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214 result = replace(self, str1, str2, maxcount);
7215
7216 Py_DECREF(str1);
7217 Py_DECREF(str2);
7218 return result;
7219}
7220
7221static
7222PyObject *unicode_repr(PyObject *unicode)
7223{
7224 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007225 PyUnicode_GET_SIZE(unicode),
7226 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227}
7228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007229PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007230 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231\n\
7232Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007233such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234arguments start and end are interpreted as in slice notation.\n\
7235\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007236Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237
7238static PyObject *
7239unicode_rfind(PyUnicodeObject *self, PyObject *args)
7240{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007241 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007242 Py_ssize_t start;
7243 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007244 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245
Facundo Batista57d56692007-11-16 18:04:14 +00007246 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007249 result = stringlib_rfind_slice(
7250 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7251 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7252 start, end
7253 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
7255 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007256
7257 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258}
7259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007260PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007261 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007263Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
7265static PyObject *
7266unicode_rindex(PyUnicodeObject *self, PyObject *args)
7267{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007268 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007269 Py_ssize_t start;
7270 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007271 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
Facundo Batista57d56692007-11-16 18:04:14 +00007273 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007276 result = stringlib_rfind_slice(
7277 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7278 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7279 start, end
7280 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
7282 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007283
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 if (result < 0) {
7285 PyErr_SetString(PyExc_ValueError, "substring not found");
7286 return NULL;
7287 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007288 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289}
7290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007291PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007292 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007294Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007295done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
7297static PyObject *
7298unicode_rjust(PyUnicodeObject *self, PyObject *args)
7299{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007300 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007301 Py_UNICODE fillchar = ' ';
7302
Martin v. Löwis412fb672006-04-13 06:34:32 +00007303 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 return NULL;
7305
Tim Peters7a29bd52001-09-12 03:03:31 +00007306 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 Py_INCREF(self);
7308 return (PyObject*) self;
7309 }
7310
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007311 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312}
7313
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007315unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316{
7317 /* standard clamping */
7318 if (start < 0)
7319 start = 0;
7320 if (end < 0)
7321 end = 0;
7322 if (end > self->length)
7323 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007324 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 /* full slice, return original string */
7326 Py_INCREF(self);
7327 return (PyObject*) self;
7328 }
7329 if (start > end)
7330 start = end;
7331 /* copy slice */
7332 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007333 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334}
7335
7336PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007337 PyObject *sep,
7338 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339{
7340 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007341
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 s = PyUnicode_FromObject(s);
7343 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007344 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007345 if (sep != NULL) {
7346 sep = PyUnicode_FromObject(sep);
7347 if (sep == NULL) {
7348 Py_DECREF(s);
7349 return NULL;
7350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 }
7352
7353 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7354
7355 Py_DECREF(s);
7356 Py_XDECREF(sep);
7357 return result;
7358}
7359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007360PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007361 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362\n\
7363Return a list of the words in S, using sep as the\n\
7364delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007365splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007366whitespace string is a separator and empty strings are\n\
7367removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368
7369static PyObject*
7370unicode_split(PyUnicodeObject *self, PyObject *args)
7371{
7372 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007373 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374
Martin v. Löwis18e16552006-02-15 17:27:45 +00007375 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 return NULL;
7377
7378 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007379 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007381 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007383 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384}
7385
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007386PyObject *
7387PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7388{
7389 PyObject* str_obj;
7390 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007391 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007392
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007393 str_obj = PyUnicode_FromObject(str_in);
7394 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007395 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007396 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007397 if (!sep_obj) {
7398 Py_DECREF(str_obj);
7399 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007400 }
7401
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007402 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007403 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7404 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7405 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007406
Fredrik Lundhb9479482006-05-26 17:22:38 +00007407 Py_DECREF(sep_obj);
7408 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007409
7410 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007411}
7412
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007413
7414PyObject *
7415PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7416{
7417 PyObject* str_obj;
7418 PyObject* sep_obj;
7419 PyObject* out;
7420
7421 str_obj = PyUnicode_FromObject(str_in);
7422 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007423 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007424 sep_obj = PyUnicode_FromObject(sep_in);
7425 if (!sep_obj) {
7426 Py_DECREF(str_obj);
7427 return NULL;
7428 }
7429
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007430 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007431 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7432 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7433 );
7434
7435 Py_DECREF(sep_obj);
7436 Py_DECREF(str_obj);
7437
7438 return out;
7439}
7440
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007441PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007442 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007443\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007444Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007445the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007446found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007447
7448static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007449unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007450{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007451 return PyUnicode_Partition((PyObject *)self, separator);
7452}
7453
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007454PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007455 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007456\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007457Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007458the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007459separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007460
7461static PyObject*
7462unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7463{
7464 return PyUnicode_RPartition((PyObject *)self, separator);
7465}
7466
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007467PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007468 PyObject *sep,
7469 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007470{
7471 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007472
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007473 s = PyUnicode_FromObject(s);
7474 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007475 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007476 if (sep != NULL) {
7477 sep = PyUnicode_FromObject(sep);
7478 if (sep == NULL) {
7479 Py_DECREF(s);
7480 return NULL;
7481 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007482 }
7483
7484 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7485
7486 Py_DECREF(s);
7487 Py_XDECREF(sep);
7488 return result;
7489}
7490
7491PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007492 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007493\n\
7494Return a list of the words in S, using sep as the\n\
7495delimiter string, starting at the end of the string and\n\
7496working to the front. If maxsplit is given, at most maxsplit\n\
7497splits are done. If sep is not specified, any whitespace string\n\
7498is a separator.");
7499
7500static PyObject*
7501unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7502{
7503 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007504 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007505
Martin v. Löwis18e16552006-02-15 17:27:45 +00007506 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007507 return NULL;
7508
7509 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007510 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007511 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007512 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007513 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007514 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007515}
7516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007517PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007518 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519\n\
7520Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007521Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007522is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523
7524static PyObject*
7525unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7526{
Guido van Rossum86662912000-04-11 15:38:46 +00007527 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
Guido van Rossum86662912000-04-11 15:38:46 +00007529 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 return NULL;
7531
Guido van Rossum86662912000-04-11 15:38:46 +00007532 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533}
7534
7535static
7536PyObject *unicode_str(PyUnicodeObject *self)
7537{
Fred Drakee4315f52000-05-09 19:53:39 +00007538 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539}
7540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007542 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543\n\
7544Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007545and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007548unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 return fixup(self, fixswapcase);
7551}
7552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007553PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007554 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555\n\
7556Return a copy of the string S, where all characters have been mapped\n\
7557through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007558Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7559Unmapped characters are left untouched. Characters mapped to None\n\
7560are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007563unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
Tim Petersced69f82003-09-16 20:30:58 +00007565 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007566 self->length,
7567 table,
7568 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007572 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
7576static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007577unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 return fixup(self, fixupper);
7580}
7581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007582PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007583 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584\n\
Georg Brandl98064072008-09-09 19:26:00 +00007585Pad a numeric string S with zeros on the left, to fill a field\n\
7586of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
7588static PyObject *
7589unicode_zfill(PyUnicodeObject *self, PyObject *args)
7590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 PyUnicodeObject *u;
7593
Martin v. Löwis18e16552006-02-15 17:27:45 +00007594 Py_ssize_t width;
7595 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 return NULL;
7597
7598 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007599 if (PyUnicode_CheckExact(self)) {
7600 Py_INCREF(self);
7601 return (PyObject*) self;
7602 }
7603 else
7604 return PyUnicode_FromUnicode(
7605 PyUnicode_AS_UNICODE(self),
7606 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 }
7609
7610 fill = width - self->length;
7611
7612 u = pad(self, fill, 0, '0');
7613
Walter Dörwald068325e2002-04-15 13:36:47 +00007614 if (u == NULL)
7615 return NULL;
7616
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 if (u->str[fill] == '+' || u->str[fill] == '-') {
7618 /* move sign to beginning of string */
7619 u->str[0] = u->str[fill];
7620 u->str[fill] = '0';
7621 }
7622
7623 return (PyObject*) u;
7624}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625
#if 0
/* Compiled out by the surrounding #if 0.  When enabled, reports the
   current value of the file-level numfree counter as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007634PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007635 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007637Return True if S starts with the specified prefix, False otherwise.\n\
7638With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007639With optional end, stop comparing S at that position.\n\
7640prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
7642static PyObject *
7643unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007644 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645{
Georg Brandl24250812006-06-09 18:45:48 +00007646 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007648 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007649 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007650 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
Georg Brandl24250812006-06-09 18:45:48 +00007652 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007653 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7654 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007655 if (PyTuple_Check(subobj)) {
7656 Py_ssize_t i;
7657 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7658 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007659 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007660 if (substring == NULL)
7661 return NULL;
7662 result = tailmatch(self, substring, start, end, -1);
7663 Py_DECREF(substring);
7664 if (result) {
7665 Py_RETURN_TRUE;
7666 }
7667 }
7668 /* nothing matched */
7669 Py_RETURN_FALSE;
7670 }
7671 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007673 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007674 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007676 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677}
7678
7679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007681 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007683Return True if S ends with the specified suffix, False otherwise.\n\
7684With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007685With optional end, stop comparing S at that position.\n\
7686suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687
7688static PyObject *
7689unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007690 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691{
Georg Brandl24250812006-06-09 18:45:48 +00007692 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007694 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007695 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007696 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697
Georg Brandl24250812006-06-09 18:45:48 +00007698 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007699 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7700 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007701 if (PyTuple_Check(subobj)) {
7702 Py_ssize_t i;
7703 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7704 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007705 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007706 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007707 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007708 result = tailmatch(self, substring, start, end, +1);
7709 Py_DECREF(substring);
7710 if (result) {
7711 Py_RETURN_TRUE;
7712 }
7713 }
7714 Py_RETURN_FALSE;
7715 }
7716 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
Georg Brandl24250812006-06-09 18:45:48 +00007720 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007722 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723}
7724
7725
Eric Smitha9f7d622008-02-17 19:46:49 +00007726/* Implements do_string_format, which is unicode because of stringlib */
7727#include "stringlib/string_format.h"
7728
7729PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007730 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007731\n\
7732");
7733
Eric Smithdc13b792008-05-30 18:10:04 +00007734static PyObject *
7735unicode__format__(PyObject *self, PyObject *args)
7736{
7737 PyObject *format_spec;
7738 PyObject *result = NULL;
7739 PyObject *tmp = NULL;
7740
7741 /* If 2.x, convert format_spec to the same type as value */
7742 /* This is to allow things like u''.format('') */
7743 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7744 goto done;
7745 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7746 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007747 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007748 goto done;
7749 }
7750 tmp = PyObject_Unicode(format_spec);
7751 if (tmp == NULL)
7752 goto done;
7753 format_spec = tmp;
7754
7755 result = _PyUnicode_FormatAdvanced(self,
7756 PyUnicode_AS_UNICODE(format_spec),
7757 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007758 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007759 Py_XDECREF(tmp);
7760 return result;
7761}
7762
Eric Smitha9f7d622008-02-17 19:46:49 +00007763PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007764 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007765\n\
7766");
7767
Robert Schuppenies901c9972008-06-10 10:10:31 +00007768static PyObject *
7769unicode__sizeof__(PyUnicodeObject *v)
7770{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007771 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7772 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007773}
7774
7775PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007776 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007777\n\
7778");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007779
7780static PyObject *
7781unicode_getnewargs(PyUnicodeObject *v)
7782{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007783 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007784}
7785
7786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787static PyMethodDef unicode_methods[] = {
7788
7789 /* Order is according to common usage: often used methods should
7790 appear first, since lookup is done sequentially. */
7791
Benjamin Peterson332d7212009-09-18 21:14:55 +00007792 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007793 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7794 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007795 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7797 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7798 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7799 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7800 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7801 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7802 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007803 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007804 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7805 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7806 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007807 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007808 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007809/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7810 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7811 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7812 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007813 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007814 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007815 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007817 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7818 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7819 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7820 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7821 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7822 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7823 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7824 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7825 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7826 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7827 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7828 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7829 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7830 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007831 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007832 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7833 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7834 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7835 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007836 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007837#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007838 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839#endif
7840
7841#if 0
7842 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007843 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844#endif
7845
Benjamin Peterson857ce152009-01-31 16:29:18 +00007846 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 {NULL, NULL}
7848};
7849
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007850static PyObject *
7851unicode_mod(PyObject *v, PyObject *w)
7852{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007853 if (!PyUnicode_Check(v)) {
7854 Py_INCREF(Py_NotImplemented);
7855 return Py_NotImplemented;
7856 }
7857 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007858}
7859
7860static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007861 0, /*nb_add*/
7862 0, /*nb_subtract*/
7863 0, /*nb_multiply*/
7864 0, /*nb_divide*/
7865 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007866};
7867
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007869 (lenfunc) unicode_length, /* sq_length */
7870 PyUnicode_Concat, /* sq_concat */
7871 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7872 (ssizeargfunc) unicode_getitem, /* sq_item */
7873 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7874 0, /* sq_ass_item */
7875 0, /* sq_ass_slice */
7876 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877};
7878
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007879static PyObject*
7880unicode_subscript(PyUnicodeObject* self, PyObject* item)
7881{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007882 if (PyIndex_Check(item)) {
7883 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007884 if (i == -1 && PyErr_Occurred())
7885 return NULL;
7886 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007887 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007888 return unicode_getitem(self, i);
7889 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007890 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007891 Py_UNICODE* source_buf;
7892 Py_UNICODE* result_buf;
7893 PyObject* result;
7894
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007895 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007896 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007897 return NULL;
7898 }
7899
7900 if (slicelength <= 0) {
7901 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007902 } else if (start == 0 && step == 1 && slicelength == self->length &&
7903 PyUnicode_CheckExact(self)) {
7904 Py_INCREF(self);
7905 return (PyObject *)self;
7906 } else if (step == 1) {
7907 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007908 } else {
7909 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007910 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7911 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007912
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007913 if (result_buf == NULL)
7914 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007915
7916 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7917 result_buf[i] = source_buf[cur];
7918 }
Tim Petersced69f82003-09-16 20:30:58 +00007919
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007920 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007921 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007922 return result;
7923 }
7924 } else {
7925 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7926 return NULL;
7927 }
7928}
7929
7930static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007931 (lenfunc)unicode_length, /* mp_length */
7932 (binaryfunc)unicode_subscript, /* mp_subscript */
7933 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007934};
7935
Martin v. Löwis18e16552006-02-15 17:27:45 +00007936static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007938 Py_ssize_t index,
7939 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940{
7941 if (index != 0) {
7942 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007943 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944 return -1;
7945 }
7946 *ptr = (void *) self->str;
7947 return PyUnicode_GET_DATA_SIZE(self);
7948}
7949
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950static Py_ssize_t
7951unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007952 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953{
7954 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007955 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 return -1;
7957}
7958
7959static int
7960unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007961 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
7963 if (lenp)
7964 *lenp = PyUnicode_GET_DATA_SIZE(self);
7965 return 1;
7966}
7967
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007968static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007970 Py_ssize_t index,
7971 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
7973 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007974
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 if (index != 0) {
7976 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007977 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 return -1;
7979 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007980 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007982 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007983 *ptr = (void *) PyString_AS_STRING(str);
7984 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985}
7986
7987/* Helpers for PyUnicode_Format() */
7988
7989static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007990getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007992 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007994 (*p_argidx)++;
7995 if (arglen < 0)
7996 return args;
7997 else
7998 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
8000 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008001 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 return NULL;
8003}
8004
/* Conversion flag bits, set by PyUnicode_Format() while it scans the flag
   characters of a format specifier. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008012strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008014 register Py_ssize_t i;
8015 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008017 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 return len;
8020}
8021
Neal Norwitzfc76d632006-01-10 06:03:13 +00008022static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008023longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8024{
Tim Peters15231542006-02-16 01:08:01 +00008025 Py_ssize_t result;
8026
Neal Norwitzfc76d632006-01-10 06:03:13 +00008027 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008028 result = strtounicode(buffer, (char *)buffer);
8029 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008030}
8031
Guido van Rossum078151d2002-08-11 04:24:12 +00008032/* XXX To save some code duplication, formatfloat/long/int could have been
8033 shared with stringobject.c, converting from 8-bit to Unicode after the
8034 formatting is done. */
8035
Mark Dickinson18cfada2009-11-23 18:46:41 +00008036/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8037
8038static PyObject *
8039formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008041 char *p;
8042 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008044
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 x = PyFloat_AsDouble(v);
8046 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008047 return NULL;
8048
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008050 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008051
Mark Dickinson18cfada2009-11-23 18:46:41 +00008052 p = PyOS_double_to_string(x, type, prec,
8053 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8054 if (p == NULL)
8055 return NULL;
8056 result = PyUnicode_FromStringAndSize(p, strlen(p));
8057 PyMem_Free(p);
8058 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059}
8060
Tim Peters38fd5b62000-09-21 05:43:11 +00008061static PyObject*
8062formatlong(PyObject *val, int flags, int prec, int type)
8063{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008064 char *buf;
8065 int i, len;
8066 PyObject *str; /* temporary string object. */
8067 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008068
Benjamin Peterson857ce152009-01-31 16:29:18 +00008069 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8070 if (!str)
8071 return NULL;
8072 result = _PyUnicode_New(len);
8073 if (!result) {
8074 Py_DECREF(str);
8075 return NULL;
8076 }
8077 for (i = 0; i < len; i++)
8078 result->str[i] = buf[i];
8079 result->str[len] = 0;
8080 Py_DECREF(str);
8081 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008082}
8083
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084static int
8085formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008086 size_t buflen,
8087 int flags,
8088 int prec,
8089 int type,
8090 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008092 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008093 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8094 * + 1 + 1
8095 * = 24
8096 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008097 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008098 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 long x;
8100
8101 x = PyInt_AsLong(v);
8102 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008103 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008104 if (x < 0 && type == 'u') {
8105 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008106 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008107 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8108 sign = "-";
8109 else
8110 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008112 prec = 1;
8113
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008114 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8115 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008116 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008117 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008118 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008119 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008120 return -1;
8121 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008122
8123 if ((flags & F_ALT) &&
8124 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008125 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008126 * of issues that cause pain:
8127 * - when 0 is being converted, the C standard leaves off
8128 * the '0x' or '0X', which is inconsistent with other
8129 * %#x/%#X conversions and inconsistent with Python's
8130 * hex() function
8131 * - there are platforms that violate the standard and
8132 * convert 0 with the '0x' or '0X'
8133 * (Metrowerks, Compaq Tru64)
8134 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008135 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008136 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008137 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008138 * We can achieve the desired consistency by inserting our
8139 * own '0x' or '0X' prefix, and substituting %x/%X in place
8140 * of %#x/%#X.
8141 *
8142 * Note that this is the same approach as used in
8143 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008144 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008145 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8146 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008147 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008148 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008149 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8150 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008151 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008152 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008153 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008154 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008155 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008156 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157}
8158
8159static int
8160formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008161 size_t buflen,
8162 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163{
Ezio Melotti32125152010-02-25 17:36:04 +00008164 PyObject *unistr;
8165 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008166 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008167 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008168 if (PyUnicode_GET_SIZE(v) != 1)
8169 goto onError;
8170 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008173 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008174 if (PyString_GET_SIZE(v) != 1)
8175 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008176 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8177 with a UnicodeDecodeError if 'char' is not decodable with the
8178 default encoding (usually ASCII, but it might be something else) */
8179 str = PyString_AS_STRING(v);
8180 if ((unsigned char)str[0] > 0x7F) {
8181 /* the char is not ASCII; try to decode the string using the
8182 default encoding and return -1 to let the UnicodeDecodeError
8183 be raised if the string can't be decoded */
8184 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8185 if (unistr == NULL)
8186 return -1;
8187 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8188 Py_DECREF(unistr);
8189 }
8190 else
8191 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193
8194 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008195 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008197 x = PyInt_AsLong(v);
8198 if (x == -1 && PyErr_Occurred())
8199 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008200#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008201 if (x < 0 || x > 0x10ffff) {
8202 PyErr_SetString(PyExc_OverflowError,
8203 "%c arg not in range(0x110000) "
8204 "(wide Python build)");
8205 return -1;
8206 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008207#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008208 if (x < 0 || x > 0xffff) {
8209 PyErr_SetString(PyExc_OverflowError,
8210 "%c arg not in range(0x10000) "
8211 "(narrow Python build)");
8212 return -1;
8213 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008214#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008215 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 }
8217 buf[1] = '\0';
8218 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008219
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008221 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008222 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008223 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224}
8225
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The replacement list is fully parenthesized so that the macro behaves
   as a single operand wherever it is expanded (for example after
   sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8235
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008237 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238{
8239 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008240 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 int args_owned = 0;
8242 PyUnicodeObject *result = NULL;
8243 PyObject *dict = NULL;
8244 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008245
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 PyErr_BadInternalCall();
8248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 }
8250 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008251 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 fmt = PyUnicode_AS_UNICODE(uformat);
8254 fmtcnt = PyUnicode_GET_SIZE(uformat);
8255
8256 reslen = rescnt = fmtcnt + 100;
8257 result = _PyUnicode_New(reslen);
8258 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 res = PyUnicode_AS_UNICODE(result);
8261
8262 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008263 arglen = PyTuple_Size(args);
8264 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 }
8266 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008267 arglen = -1;
8268 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
Christian Heimese93237d2007-12-19 02:37:44 +00008270 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008271 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008272 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
8274 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008275 if (*fmt != '%') {
8276 if (--rescnt < 0) {
8277 rescnt = fmtcnt + 100;
8278 reslen += rescnt;
8279 if (_PyUnicode_Resize(&result, reslen) < 0)
8280 goto onError;
8281 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8282 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008283 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008284 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008285 }
8286 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 /* Got a format specifier */
8288 int flags = 0;
8289 Py_ssize_t width = -1;
8290 int prec = -1;
8291 Py_UNICODE c = '\0';
8292 Py_UNICODE fill;
8293 int isnumok;
8294 PyObject *v = NULL;
8295 PyObject *temp = NULL;
8296 Py_UNICODE *pbuf;
8297 Py_UNICODE sign;
8298 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008299 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008300
8301 fmt++;
8302 if (*fmt == '(') {
8303 Py_UNICODE *keystart;
8304 Py_ssize_t keylen;
8305 PyObject *key;
8306 int pcount = 1;
8307
8308 if (dict == NULL) {
8309 PyErr_SetString(PyExc_TypeError,
8310 "format requires a mapping");
8311 goto onError;
8312 }
8313 ++fmt;
8314 --fmtcnt;
8315 keystart = fmt;
8316 /* Skip over balanced parentheses */
8317 while (pcount > 0 && --fmtcnt >= 0) {
8318 if (*fmt == ')')
8319 --pcount;
8320 else if (*fmt == '(')
8321 ++pcount;
8322 fmt++;
8323 }
8324 keylen = fmt - keystart - 1;
8325 if (fmtcnt < 0 || pcount > 0) {
8326 PyErr_SetString(PyExc_ValueError,
8327 "incomplete format key");
8328 goto onError;
8329 }
8330#if 0
8331 /* keys are converted to strings using UTF-8 and
8332 then looked up since Python uses strings to hold
8333 variables names etc. in its namespaces and we
8334 wouldn't want to break common idioms. */
8335 key = PyUnicode_EncodeUTF8(keystart,
8336 keylen,
8337 NULL);
8338#else
8339 key = PyUnicode_FromUnicode(keystart, keylen);
8340#endif
8341 if (key == NULL)
8342 goto onError;
8343 if (args_owned) {
8344 Py_DECREF(args);
8345 args_owned = 0;
8346 }
8347 args = PyObject_GetItem(dict, key);
8348 Py_DECREF(key);
8349 if (args == NULL) {
8350 goto onError;
8351 }
8352 args_owned = 1;
8353 arglen = -1;
8354 argidx = -2;
8355 }
8356 while (--fmtcnt >= 0) {
8357 switch (c = *fmt++) {
8358 case '-': flags |= F_LJUST; continue;
8359 case '+': flags |= F_SIGN; continue;
8360 case ' ': flags |= F_BLANK; continue;
8361 case '#': flags |= F_ALT; continue;
8362 case '0': flags |= F_ZERO; continue;
8363 }
8364 break;
8365 }
8366 if (c == '*') {
8367 v = getnextarg(args, arglen, &argidx);
8368 if (v == NULL)
8369 goto onError;
8370 if (!PyInt_Check(v)) {
8371 PyErr_SetString(PyExc_TypeError,
8372 "* wants int");
8373 goto onError;
8374 }
8375 width = PyInt_AsLong(v);
8376 if (width < 0) {
8377 flags |= F_LJUST;
8378 width = -width;
8379 }
8380 if (--fmtcnt >= 0)
8381 c = *fmt++;
8382 }
8383 else if (c >= '0' && c <= '9') {
8384 width = c - '0';
8385 while (--fmtcnt >= 0) {
8386 c = *fmt++;
8387 if (c < '0' || c > '9')
8388 break;
8389 if ((width*10) / 10 != width) {
8390 PyErr_SetString(PyExc_ValueError,
8391 "width too big");
8392 goto onError;
8393 }
8394 width = width*10 + (c - '0');
8395 }
8396 }
8397 if (c == '.') {
8398 prec = 0;
8399 if (--fmtcnt >= 0)
8400 c = *fmt++;
8401 if (c == '*') {
8402 v = getnextarg(args, arglen, &argidx);
8403 if (v == NULL)
8404 goto onError;
8405 if (!PyInt_Check(v)) {
8406 PyErr_SetString(PyExc_TypeError,
8407 "* wants int");
8408 goto onError;
8409 }
8410 prec = PyInt_AsLong(v);
8411 if (prec < 0)
8412 prec = 0;
8413 if (--fmtcnt >= 0)
8414 c = *fmt++;
8415 }
8416 else if (c >= '0' && c <= '9') {
8417 prec = c - '0';
8418 while (--fmtcnt >= 0) {
8419 c = Py_CHARMASK(*fmt++);
8420 if (c < '0' || c > '9')
8421 break;
8422 if ((prec*10) / 10 != prec) {
8423 PyErr_SetString(PyExc_ValueError,
8424 "prec too big");
8425 goto onError;
8426 }
8427 prec = prec*10 + (c - '0');
8428 }
8429 }
8430 } /* prec */
8431 if (fmtcnt >= 0) {
8432 if (c == 'h' || c == 'l' || c == 'L') {
8433 if (--fmtcnt >= 0)
8434 c = *fmt++;
8435 }
8436 }
8437 if (fmtcnt < 0) {
8438 PyErr_SetString(PyExc_ValueError,
8439 "incomplete format");
8440 goto onError;
8441 }
8442 if (c != '%') {
8443 v = getnextarg(args, arglen, &argidx);
8444 if (v == NULL)
8445 goto onError;
8446 }
8447 sign = 0;
8448 fill = ' ';
8449 switch (c) {
8450
8451 case '%':
8452 pbuf = formatbuf;
8453 /* presume that buffer length is at least 1 */
8454 pbuf[0] = '%';
8455 len = 1;
8456 break;
8457
8458 case 's':
8459 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008460 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008461 temp = v;
8462 Py_INCREF(temp);
8463 }
8464 else {
8465 PyObject *unicode;
8466 if (c == 's')
8467 temp = PyObject_Unicode(v);
8468 else
8469 temp = PyObject_Repr(v);
8470 if (temp == NULL)
8471 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008472 if (PyUnicode_Check(temp))
8473 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008474 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008475 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008476 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8477 PyString_GET_SIZE(temp),
8478 NULL,
8479 "strict");
8480 Py_DECREF(temp);
8481 temp = unicode;
8482 if (temp == NULL)
8483 goto onError;
8484 }
8485 else {
8486 Py_DECREF(temp);
8487 PyErr_SetString(PyExc_TypeError,
8488 "%s argument has non-string str()");
8489 goto onError;
8490 }
8491 }
8492 pbuf = PyUnicode_AS_UNICODE(temp);
8493 len = PyUnicode_GET_SIZE(temp);
8494 if (prec >= 0 && len > prec)
8495 len = prec;
8496 break;
8497
8498 case 'i':
8499 case 'd':
8500 case 'u':
8501 case 'o':
8502 case 'x':
8503 case 'X':
8504 if (c == 'i')
8505 c = 'd';
8506 isnumok = 0;
8507 if (PyNumber_Check(v)) {
8508 PyObject *iobj=NULL;
8509
8510 if (PyInt_Check(v) || (PyLong_Check(v))) {
8511 iobj = v;
8512 Py_INCREF(iobj);
8513 }
8514 else {
8515 iobj = PyNumber_Int(v);
8516 if (iobj==NULL) iobj = PyNumber_Long(v);
8517 }
8518 if (iobj!=NULL) {
8519 if (PyInt_Check(iobj)) {
8520 isnumok = 1;
8521 pbuf = formatbuf;
8522 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8523 flags, prec, c, iobj);
8524 Py_DECREF(iobj);
8525 if (len < 0)
8526 goto onError;
8527 sign = 1;
8528 }
8529 else if (PyLong_Check(iobj)) {
8530 isnumok = 1;
8531 temp = formatlong(iobj, flags, prec, c);
8532 Py_DECREF(iobj);
8533 if (!temp)
8534 goto onError;
8535 pbuf = PyUnicode_AS_UNICODE(temp);
8536 len = PyUnicode_GET_SIZE(temp);
8537 sign = 1;
8538 }
8539 else {
8540 Py_DECREF(iobj);
8541 }
8542 }
8543 }
8544 if (!isnumok) {
8545 PyErr_Format(PyExc_TypeError,
8546 "%%%c format: a number is required, "
8547 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8548 goto onError;
8549 }
8550 if (flags & F_ZERO)
8551 fill = '0';
8552 break;
8553
8554 case 'e':
8555 case 'E':
8556 case 'f':
8557 case 'F':
8558 case 'g':
8559 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008560 temp = formatfloat(v, flags, prec, c);
8561 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008562 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008563 pbuf = PyUnicode_AS_UNICODE(temp);
8564 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008565 sign = 1;
8566 if (flags & F_ZERO)
8567 fill = '0';
8568 break;
8569
8570 case 'c':
8571 pbuf = formatbuf;
8572 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8573 if (len < 0)
8574 goto onError;
8575 break;
8576
8577 default:
8578 PyErr_Format(PyExc_ValueError,
8579 "unsupported format character '%c' (0x%x) "
8580 "at index %zd",
8581 (31<=c && c<=126) ? (char)c : '?',
8582 (int)c,
8583 (Py_ssize_t)(fmt - 1 -
8584 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008585 goto onError;
8586 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008587 if (sign) {
8588 if (*pbuf == '-' || *pbuf == '+') {
8589 sign = *pbuf++;
8590 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008591 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008592 else if (flags & F_SIGN)
8593 sign = '+';
8594 else if (flags & F_BLANK)
8595 sign = ' ';
8596 else
8597 sign = 0;
8598 }
8599 if (width < len)
8600 width = len;
8601 if (rescnt - (sign != 0) < width) {
8602 reslen -= rescnt;
8603 rescnt = width + fmtcnt + 100;
8604 reslen += rescnt;
8605 if (reslen < 0) {
8606 Py_XDECREF(temp);
8607 PyErr_NoMemory();
8608 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008609 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008610 if (_PyUnicode_Resize(&result, reslen) < 0) {
8611 Py_XDECREF(temp);
8612 goto onError;
8613 }
8614 res = PyUnicode_AS_UNICODE(result)
8615 + reslen - rescnt;
8616 }
8617 if (sign) {
8618 if (fill != ' ')
8619 *res++ = sign;
8620 rescnt--;
8621 if (width > len)
8622 width--;
8623 }
8624 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8625 assert(pbuf[0] == '0');
8626 assert(pbuf[1] == c);
8627 if (fill != ' ') {
8628 *res++ = *pbuf++;
8629 *res++ = *pbuf++;
8630 }
8631 rescnt -= 2;
8632 width -= 2;
8633 if (width < 0)
8634 width = 0;
8635 len -= 2;
8636 }
8637 if (width > len && !(flags & F_LJUST)) {
8638 do {
8639 --rescnt;
8640 *res++ = fill;
8641 } while (--width > len);
8642 }
8643 if (fill == ' ') {
8644 if (sign)
8645 *res++ = sign;
8646 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8647 assert(pbuf[0] == '0');
8648 assert(pbuf[1] == c);
8649 *res++ = *pbuf++;
8650 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008651 }
8652 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008653 Py_UNICODE_COPY(res, pbuf, len);
8654 res += len;
8655 rescnt -= len;
8656 while (--width >= len) {
8657 --rescnt;
8658 *res++ = ' ';
8659 }
8660 if (dict && (argidx < arglen) && c != '%') {
8661 PyErr_SetString(PyExc_TypeError,
8662 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008663 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008664 goto onError;
8665 }
8666 Py_XDECREF(temp);
8667 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 } /* until end */
8669 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008670 PyErr_SetString(PyExc_TypeError,
8671 "not all arguments converted during string formatting");
8672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 }
8674
Thomas Woutersa96affe2006-03-12 00:29:36 +00008675 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008678 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 }
8680 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 return (PyObject *)result;
8682
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 Py_XDECREF(result);
8685 Py_DECREF(uformat);
8686 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008687 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 }
8689 return NULL;
8690}
8691
8692static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 (readbufferproc) unicode_buffer_getreadbuf,
8694 (writebufferproc) unicode_buffer_getwritebuf,
8695 (segcountproc) unicode_buffer_getsegcount,
8696 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697};
8698
Jeremy Hylton938ace62002-07-17 16:30:39 +00008699static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008700unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8701
Tim Peters6d6c1a32001-08-02 04:15:00 +00008702static PyObject *
8703unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8704{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008705 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008706 static char *kwlist[] = {"string", "encoding", "errors", 0};
8707 char *encoding = NULL;
8708 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008709
Benjamin Peterson857ce152009-01-31 16:29:18 +00008710 if (type != &PyUnicode_Type)
8711 return unicode_subtype_new(type, args, kwds);
8712 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008713 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008714 return NULL;
8715 if (x == NULL)
8716 return (PyObject *)_PyUnicode_New(0);
8717 if (encoding == NULL && errors == NULL)
8718 return PyObject_Unicode(x);
8719 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008720 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008721}
8722
Guido van Rossume023fe02001-08-30 03:12:59 +00008723static PyObject *
8724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8725{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008726 PyUnicodeObject *tmp, *pnew;
8727 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008728
Benjamin Peterson857ce152009-01-31 16:29:18 +00008729 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8730 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8731 if (tmp == NULL)
8732 return NULL;
8733 assert(PyUnicode_Check(tmp));
8734 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8735 if (pnew == NULL) {
8736 Py_DECREF(tmp);
8737 return NULL;
8738 }
8739 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8740 if (pnew->str == NULL) {
8741 _Py_ForgetReference((PyObject *)pnew);
8742 PyObject_Del(pnew);
8743 Py_DECREF(tmp);
8744 return PyErr_NoMemory();
8745 }
8746 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8747 pnew->length = n;
8748 pnew->hash = tmp->hash;
8749 Py_DECREF(tmp);
8750 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008751}
8752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008753PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008754 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008755\n\
8756Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008757encoding defaults to the current default string encoding.\n\
8758errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008761 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008762 "unicode", /* tp_name */
8763 sizeof(PyUnicodeObject), /* tp_size */
8764 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008766 (destructor)unicode_dealloc, /* tp_dealloc */
8767 0, /* tp_print */
8768 0, /* tp_getattr */
8769 0, /* tp_setattr */
8770 0, /* tp_compare */
8771 unicode_repr, /* tp_repr */
8772 &unicode_as_number, /* tp_as_number */
8773 &unicode_as_sequence, /* tp_as_sequence */
8774 &unicode_as_mapping, /* tp_as_mapping */
8775 (hashfunc) unicode_hash, /* tp_hash*/
8776 0, /* tp_call*/
8777 (reprfunc) unicode_str, /* tp_str */
8778 PyObject_GenericGetAttr, /* tp_getattro */
8779 0, /* tp_setattro */
8780 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008781 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008782 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008783 unicode_doc, /* tp_doc */
8784 0, /* tp_traverse */
8785 0, /* tp_clear */
8786 PyUnicode_RichCompare, /* tp_richcompare */
8787 0, /* tp_weaklistoffset */
8788 0, /* tp_iter */
8789 0, /* tp_iternext */
8790 unicode_methods, /* tp_methods */
8791 0, /* tp_members */
8792 0, /* tp_getset */
8793 &PyBaseString_Type, /* tp_base */
8794 0, /* tp_dict */
8795 0, /* tp_descr_get */
8796 0, /* tp_descr_set */
8797 0, /* tp_dictoffset */
8798 0, /* tp_init */
8799 0, /* tp_alloc */
8800 unicode_new, /* tp_new */
8801 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802};
8803
8804/* Initialize the Unicode implementation */
8805
Thomas Wouters78890102000-07-22 19:25:51 +00008806void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008808 int i;
8809
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008810 /* XXX - move this array to unicodectype.c ? */
8811 Py_UNICODE linebreak[] = {
8812 0x000A, /* LINE FEED */
8813 0x000D, /* CARRIAGE RETURN */
8814 0x001C, /* FILE SEPARATOR */
8815 0x001D, /* GROUP SEPARATOR */
8816 0x001E, /* RECORD SEPARATOR */
8817 0x0085, /* NEXT LINE */
8818 0x2028, /* LINE SEPARATOR */
8819 0x2029, /* PARAGRAPH SEPARATOR */
8820 };
8821
Fred Drakee4315f52000-05-09 19:53:39 +00008822 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008823 free_list = NULL;
8824 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008826 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008827 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008828
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008829 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008830 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008831 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008832 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008833 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008834
8835 /* initialize the linebreak bloom filter */
8836 bloom_linebreak = make_bloom_mask(
8837 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8838 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008839
8840 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841}
8842
8843/* Finalize the Unicode implementation */
8844
Christian Heimes3b718a72008-02-14 12:47:33 +00008845int
8846PyUnicode_ClearFreeList(void)
8847{
8848 int freelist_size = numfree;
8849 PyUnicodeObject *u;
8850
8851 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008852 PyUnicodeObject *v = u;
8853 u = *(PyUnicodeObject **)u;
8854 if (v->str)
8855 PyObject_DEL(v->str);
8856 Py_XDECREF(v->defenc);
8857 PyObject_Del(v);
8858 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008859 }
8860 free_list = NULL;
8861 assert(numfree == 0);
8862 return freelist_size;
8863}
8864
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865void
Thomas Wouters78890102000-07-22 19:25:51 +00008866_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008868 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008870 Py_XDECREF(unicode_empty);
8871 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008872
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008873 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008874 if (unicode_latin1[i]) {
8875 Py_DECREF(unicode_latin1[i]);
8876 unicode_latin1[i] = NULL;
8877 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008878 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008879 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008881
Anthony Baxterac6bd462006-04-13 02:06:09 +00008882#ifdef __cplusplus
8883}
8884#endif