blob: 1cd3688fa5de652cd491df08c181af8214598684 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for the Unicode object free list.

   When an object is put on the free list, its character buffer is
   kept allocated as long as the object's length (counted in
   characters) is below this limit; buffers of longer objects are
   released.  Reusing those small buffers saves malloc() calls for
   short Unicode strings.

   The worst-case cost is PyUnicode_MAXFREELIST small buffers, plus
   the objects themselves and the malloc() overhead, sitting unused
   on the free list.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Byte order selection: little endian unless the build configuration
   defines WORDS_BIGENDIAN. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: entry i
   is 1 when ASCII code point i counts as whitespace, 0 otherwise.
   The table has 128 entries, 16 per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Fast detection of line break characters in the ASCII range: entry i
   is 1 when code point i is a line break, 0 otherwise.  The table is
   only ever read, so it is declared const.  128 entries, 16 per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of
   each Unicode character (as many as are needed to index BLOOM_WIDTH
   bits) select the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Membership test: check the bloom mask first and only scan the set
   when the mask has the character's bit set.  The whole expansion is
   parenthesized so the macro behaves as a single operand, e.g. under
   '!' or next to other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000288 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 }
290 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return 0;
293}
294
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Andrew Dalkee0df7622006-05-27 11:04:36 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitze7d8be82008-07-31 17:17:14 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000363 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000367 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000369 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000370 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372}
373
374static
Guido van Rossum9475a232001-10-05 20:51:39 +0000375void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000377 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000378 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000379 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381 PyObject_DEL(unicode->str);
382 unicode->str = NULL;
383 unicode->length = 0;
384 }
385 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000386 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000387 }
388 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 }
393 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 }
398}
399
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000400static
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000402{
403 register PyUnicodeObject *v;
404
405 /* Argument checks */
406 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyErr_BadInternalCall();
408 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000410 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000412 PyErr_BadInternalCall();
413 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000414 }
415
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000419 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 }
430
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
434}
435
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437{
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000442 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443{
444 PyUnicodeObject *unicode;
445
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
449
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000454 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
466 }
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
469 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470 }
Tim Petersced69f82003-09-16 20:30:58 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
475
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479
480 return (PyObject *)unicode;
481}
482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484{
485 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000486
Benjamin Peterson857ce152009-01-31 16:29:18 +0000487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000490 return NULL;
491 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000492
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
498
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000503 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000504
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515 }
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
518 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000519
520 return PyUnicode_DecodeUTF8(u, size, NULL);
521 }
522
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
526
527 return (PyObject *)unicode;
528}
529
530PyObject *PyUnicode_FromString(const char *u)
531{
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
536 }
537
538 return PyUnicode_FromStringAndSize(u, size);
539}
540
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541#ifdef HAVE_WCHAR_H
542
Mark Dickinson6b265f12009-03-18 16:07:26 +0000543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544# define CONVERT_WCHAR_TO_SURROGATES
545#endif
546
547#ifdef CONVERT_WCHAR_TO_SURROGATES
548
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
551
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
554{
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
559
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
563 }
564
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
571 }
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578 {
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
587 }
588 else
589 *u++ = *w++;
590 }
591 }
592 return (PyObject *)unicode;
593}
594
595#else
596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000598 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599{
600 PyUnicodeObject *unicode;
601
602 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000603 PyErr_BadInternalCall();
604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
606
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
610
611 /* Copy the wchar_t data into the new object */
612#ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000614#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 }
622#endif
623
624 return (PyObject *)unicode;
625}
626
Mark Dickinson6b265f12009-03-18 16:07:26 +0000627#endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629#undef CONVERT_WCHAR_TO_SURROGATES
630
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000631static void
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
639 }
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
648 }
649 *fmt++ = c;
650 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000651}
652
/* Append the NUL-terminated string `string` at the output position
   `s`, advancing `s` past the copied characters.  The terminator is
   not copied.  Relies on the variables `copy` and `s` being in scope
   where the macro is used. */
#define appendstring(string)            \
    {                                   \
        copy = (string);                \
        while (*copy)                   \
            *s++ = *copy++;             \
    }
654
655PyObject *
656PyUnicode_FromFormatV(const char *format, va_list vargs)
657{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000678
679#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000681#else
682#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000683 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000684#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#endif
687#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000691 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000695 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
700 ;
701 if (*f == 's')
702 ++callcount;
703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000704 }
705 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
712 }
713 callresult = callresults;
714 }
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
723 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
727 */
728 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731
Benjamin Peterson857ce152009-01-31 16:29:18 +0000732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
753 {
754 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000755 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000762 break;
763 }
764 case 'U':
765 {
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
770 }
771 case 'V':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
782 }
783 case 'S':
784 {
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
795 }
796 case 'R':
797 {
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
808 }
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
815 */
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
827 }
828 } else
829 n++;
830 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000831 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
837 }
838 realbuffer = abuffer;
839 }
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000849
Benjamin Peterson857ce152009-01-31 16:29:18 +0000850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000852
Benjamin Peterson857ce152009-01-31 16:29:18 +0000853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
868 }
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
874 }
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
879 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000880
Benjamin Peterson857ce152009-01-31 16:29:18 +0000881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
916 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000926 break;
927 }
928 case 'U':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
935 }
936 case 'V':
937 {
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
946 }
947 break;
948 }
949 case 'S':
950 case 'R':
951 {
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
966 }
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
976 }
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
985 }
986 } else
987 *s++ = *f;
988 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000990 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000997 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1003 }
1004 PyObject_Free(callresults);
1005 }
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001009}
1010
1011#undef appendstring
1012
1013PyObject *
1014PyUnicode_FromFormat(const char *format, ...)
1015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001016 PyObject* ret;
1017 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018
1019#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001020 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001021#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027}
1028
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001030 wchar_t *w,
1031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032{
1033 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001034 PyErr_BadInternalCall();
1035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037
1038 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001040 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042#ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044#else
1045 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 }
1052#endif
1053
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058}
1059
1060#endif
1061
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062PyObject *PyUnicode_FromOrdinal(int ordinal)
1063{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001064 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065
1066#ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072 }
1073#else
1074 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001079 }
1080#endif
1081
Hye-Shik Chang40574832004-04-06 07:24:51 +00001082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001084}
1085
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086PyObject *PyUnicode_FromObject(register PyObject *obj)
1087{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001089 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 Py_INCREF(obj);
1092 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 }
1094 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101}
1102
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 const char *encoding,
1105 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001107 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001108 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 PyErr_BadInternalCall();
1113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001116#if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001121
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001123 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124
1125 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001131 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 return PyObject_Unicode(obj);
1133 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001134#else
1135 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001139 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001140#endif
1141
1142 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001143 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001146 }
Christian Heimes3497f942008-05-26 12:29:14 +00001147 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1152 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Tim Petersced69f82003-09-16 20:30:58 +00001169 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001170 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001171
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172 return v;
1173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001174 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176}
1177
1178PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182{
1183 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001184
1185 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001186 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001187
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001209 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001210 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001216
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001217 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_XDECREF(buffer);
1219 return NULL;
1220}
1221
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225{
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001242 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001243 return NULL;
1244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250{
1251 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1259}
1260
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1264{
1265 PyObject *v;
1266
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1270 }
1271
1272 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001274
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1280
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001281 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001282 return NULL;
1283}
1284
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
Fred Drakee4315f52000-05-09 19:53:39 +00001295
Tim Petersced69f82003-09-16 20:30:58 +00001296 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001297 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001298
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001308#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001317 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001319 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001320 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 Py_DECREF(v);
1322 goto onError;
1323 }
1324 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001325
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 return NULL;
1328}
1329
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001331 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332{
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344{
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
1349 return PyUnicode_AS_UNICODE(unicode);
1350
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001351 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 return NULL;
1353}
1354
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356{
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361 return PyUnicode_GET_SIZE(unicode);
1362
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 return -1;
1365}
1366
Thomas Wouters78890102000-07-22 19:25:51 +00001367const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001368{
1369 return unicode_default_encoding;
1370}
1371
1372int PyUnicode_SetDefaultEncoding(const char *encoding)
1373{
1374 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001375
Fred Drakee4315f52000-05-09 19:53:39 +00001376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001383 encoding,
1384 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001385 return 0;
1386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001388 return -1;
1389}
1390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391/* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001393 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 and adjust various state variables.
1395 return 0 on success, -1 on error
1396*/
1397
1398static
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 int res = -1;
1415
1416 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 }
1421
1422 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001423 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 }
1428 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 }
1436
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001442 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001448 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 }
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1474
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 Py_XDECREF(restuple);
1477 return res;
1478}
1479
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480/* --- UTF-7 Codec -------------------------------------------------------- */
1481
Antoine Pitrou653dece2009-05-04 18:32:32 +00001482/* See RFC2152 for details. We encode conservatively and decode liberally. */
1483
1484/* Three simple macros defining base-64. */
1485
/* Is c a base-64 character?
 *
 * The test is spelled out with explicit ASCII ranges instead of
 * isalnum(): isalnum() depends on the current locale and is undefined
 * for arguments that are neither EOF nor representable as unsigned char,
 * whereas FROM_BASE64 only knows how to map the 64 ASCII characters
 * accepted here. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
1490
/* Map the base-64 character c to its 6-bit value:
 * 'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62,
 * anything else (i.e. '/') -> 63.  c is evaluated more than once. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
1498
/* Return the base-64 character for the low 6 bits of n; higher bits of n
 * are masked off. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
1503
/* DECODE_DIRECT: true when a byte found in a UTF-7 string stands for
 * itself.  Decoding is permissive: every value up to 127 is taken
 * literally except '+', which opens a base64 sequence. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
1511
1512/* The UTF-7 encoder treats ASCII characters differently according to
1513 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1514 * the above). See RFC2152. This array identifies these different
1515 * sets:
1516 * 0 : "Set D"
1517 * alphanumeric and '(),-./:?
1518 * 1 : "Set O"
1519 * !"#$%&*;<=>@[]^_`{|}
1520 * 2 : "whitespace"
1521 * ht nl cr sp
1522 * 3 : special (must be base64 encoded)
1523 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1524 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001525
/* Category of each ASCII code for the UTF-7 encoder, indexed by character
 * code 0..127: 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be
 * base-64 encoded).  The table is only ever read (through ENCODE_DIRECT),
 * so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1545
/* ENCODE_DIRECT: should character c be written to the UTF-7 output as
 * itself?  Only codes 1..127 can qualify.  Set D characters (category 0)
 * always do; Set O characters (category 1) do when directO is true and
 * whitespace (category 2) does when directWS is true.  RFC2152 leaves
 * both choices to the application, hence the two flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      ((directO) && utf7_category[(c)] == 1) ||         \
      ((directWS) && utf7_category[(c)] == 2)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001559 Py_ssize_t size,
1560 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563}
1564
Antoine Pitrou653dece2009-05-04 18:32:32 +00001565/* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
1571
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001573 Py_ssize_t size,
1574 const char *errors,
1575 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001577 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
1584 const char *errmsg = "";
1585 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001586 Py_UNICODE *shiftOutStart;
1587 unsigned int base64bits = 0;
1588 unsigned long base64buffer = 0;
1589 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 PyObject *errorHandler = NULL;
1591 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592
1593 unicode = _PyUnicode_New(size);
1594 if (!unicode)
1595 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001596 if (size == 0) {
1597 if (consumed)
1598 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601
1602 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001603 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 e = s + size;
1605
1606 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001607 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6;
1613 s++;
1614 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16));
1618 base64bits -= 16;
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620 if (surrogate) {
1621 /* expecting a second surrogate */
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623#ifdef Py_UNICODE_WIDE
1624 *p++ = (((surrogate & 0x3FF)<<10)
1625 | (outCh & 0x3FF)) + 0x10000;
1626#else
1627 *p++ = surrogate;
1628 *p++ = outCh;
1629#endif
1630 surrogate = 0;
1631 }
1632 else {
1633 surrogate = 0;
1634 errmsg = "second surrogate missing";
1635 goto utf7Error;
1636 }
1637 }
1638 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639 /* first surrogate */
1640 surrogate = outCh;
1641 }
1642 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643 errmsg = "unexpected second surrogate";
1644 goto utf7Error;
1645 }
1646 else {
1647 *p++ = outCh;
1648 }
1649 }
1650 }
1651 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 inShift = 0;
1653 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001654 if (surrogate) {
1655 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001656 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001658 if (base64bits > 0) { /* left-over bits */
1659 if (base64bits >= 6) {
1660 /* We've seen at least one base-64 character */
1661 errmsg = "partial character in shift sequence";
1662 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001664 else {
1665 /* Some bits remain; they should be zero */
1666 if (base64buffer != 0) {
1667 errmsg = "non-zero padding bits in shift sequence";
1668 goto utf7Error;
1669 }
1670 }
1671 }
1672 if (ch != '-') {
1673 /* '-' is absorbed; other terminating
1674 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 *p++ = ch;
1676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 }
1678 }
1679 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 s++; /* consume '+' */
1682 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 s++;
1684 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001685 }
1686 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001688 shiftOutStart = p;
1689 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690 }
1691 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001692 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001693 *p++ = ch;
1694 s++;
1695 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001696 else {
1697 startinpos = s-starts;
1698 s++;
1699 errmsg = "unexpected special character";
1700 goto utf7Error;
1701 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001703utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 outpos = p-PyUnicode_AS_UNICODE(unicode);
1705 endinpos = s-starts;
1706 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001707 errors, &errorHandler,
1708 "utf7", errmsg,
1709 starts, size, &startinpos, &endinpos, &exc, &s,
1710 &unicode, &outpos, &p))
1711 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 }
1713
Antoine Pitrou653dece2009-05-04 18:32:32 +00001714 /* end of string */
1715
1716 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717 /* if we're in an inconsistent state, that's an error */
1718 if (surrogate ||
1719 (base64bits >= 6) ||
1720 (base64bits > 0 && base64buffer != 0)) {
1721 outpos = p-PyUnicode_AS_UNICODE(unicode);
1722 endinpos = size;
1723 if (unicode_decode_call_errorhandler(
1724 errors, &errorHandler,
1725 "utf7", "unterminated shift sequence",
1726 starts, size, &startinpos, &endinpos, &exc, &s,
1727 &unicode, &outpos, &p))
1728 goto onError;
1729 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001731
1732 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001733 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001734 if (inShift) {
1735 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001736 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001737 }
1738 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001739 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001740 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001743 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 goto onError;
1745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 Py_XDECREF(errorHandler);
1747 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001748 return (PyObject *)unicode;
1749
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001750 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 Py_XDECREF(errorHandler);
1752 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 Py_DECREF(unicode);
1754 return NULL;
1755}
1756
1757
1758PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001759 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001760 int base64SetO,
1761 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001762 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763{
1764 PyObject *v;
1765 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001766 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001768 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001769 unsigned int base64bits = 0;
1770 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 char * out;
1772 char * start;
1773
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001774 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001775 return PyErr_NoMemory();
1776
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001778 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779
Antoine Pitrou653dece2009-05-04 18:32:32 +00001780 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781 if (v == NULL)
1782 return NULL;
1783
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001784 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001785 for (;i < size; ++i) {
1786 Py_UNICODE ch = s[i];
1787
Antoine Pitrou653dece2009-05-04 18:32:32 +00001788 if (inShift) {
1789 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790 /* shifting out */
1791 if (base64bits) { /* output remaining bits */
1792 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793 base64buffer = 0;
1794 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001795 }
1796 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001797 /* Characters not in the BASE64 set implicitly unshift the sequence
1798 so no '-' is required, except if the character is itself a '-' */
1799 if (IS_BASE64(ch) || ch == '-') {
1800 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001801 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001802 *out++ = (char) ch;
1803 }
1804 else {
1805 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001806 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001808 else { /* not in a shift sequence */
1809 if (ch == '+') {
1810 *out++ = '+';
1811 *out++ = '-';
1812 }
1813 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814 *out++ = (char) ch;
1815 }
1816 else {
1817 *out++ = '+';
1818 inShift = 1;
1819 goto encode_char;
1820 }
1821 }
1822 continue;
1823encode_char:
1824#ifdef Py_UNICODE_WIDE
1825 if (ch >= 0x10000) {
1826 /* code first surrogate */
1827 base64bits += 16;
1828 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829 while (base64bits >= 6) {
1830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831 base64bits -= 6;
1832 }
1833 /* prepare second surrogate */
1834 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1835 }
1836#endif
1837 base64bits += 16;
1838 base64buffer = (base64buffer << 16) | ch;
1839 while (base64bits >= 6) {
1840 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841 base64bits -= 6;
1842 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001843 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001844 if (base64bits)
1845 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001849 if (_PyString_Resize(&v, out - start))
1850 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 return v;
1852}
1853
Antoine Pitrou653dece2009-05-04 18:32:32 +00001854#undef IS_BASE64
1855#undef FROM_BASE64
1856#undef TO_BASE64
1857#undef DECODE_DIRECT
1858#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001859
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860/* --- UTF-8 Codec -------------------------------------------------------- */
1861
/* Lookup table used by the UTF-8 decoder; it is only ever read, so it is
   declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1883
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001885 Py_ssize_t size,
1886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887{
Walter Dörwald69652032004-09-07 20:24:22 +00001888 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1889}
1890
1891PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001892 Py_ssize_t size,
1893 const char *errors,
1894 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001896 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001898 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001899 Py_ssize_t startinpos;
1900 Py_ssize_t endinpos;
1901 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 const char *e;
1903 PyUnicodeObject *unicode;
1904 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001905 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001906 PyObject *errorHandler = NULL;
1907 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908
1909 /* Note: size will always be longer than the resulting Unicode
1910 character count */
1911 unicode = _PyUnicode_New(size);
1912 if (!unicode)
1913 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001914 if (size == 0) {
1915 if (consumed)
1916 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
1920 /* Unpack UTF-8 encoded data */
1921 p = unicode->str;
1922 e = s + size;
1923
1924 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001925 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926
1927 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001928 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 s++;
1930 continue;
1931 }
1932
1933 n = utf8_code_length[ch];
1934
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001935 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001936 if (consumed)
1937 break;
1938 else {
1939 errmsg = "unexpected end of data";
1940 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001941 endinpos = startinpos+1;
1942 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1943 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001944 goto utf8Error;
1945 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947
1948 switch (n) {
1949
1950 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001951 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001952 startinpos = s-starts;
1953 endinpos = startinpos+1;
1954 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955
1956 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001957 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001958 startinpos = s-starts;
1959 endinpos = startinpos+1;
1960 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
1962 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001963 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001964 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001965 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 goto utf8Error;
1968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001970 assert ((ch > 0x007F) && (ch <= 0x07FF));
1971 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 break;
1973
1974 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001975 /* XXX: surrogates shouldn't be valid UTF-8!
1976 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1977 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1978 Uncomment the 2 lines below to make them invalid,
1979 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001980 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001981 (s[2] & 0xc0) != 0x80 ||
1982 ((unsigned char)s[0] == 0xE0 &&
1983 (unsigned char)s[1] < 0xA0)/* ||
1984 ((unsigned char)s[0] == 0xED &&
1985 (unsigned char)s[1] > 0x9F)*/) {
1986 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001987 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001988 endinpos = startinpos + 1;
1989
1990 /* if s[1] first two bits are 1 and 0, then the invalid
1991 continuation byte is s[2], so increment endinpos by 1,
1992 if not, s[1] is invalid and endinpos doesn't need to
1993 be incremented. */
1994 if ((s[1] & 0xC0) == 0x80)
1995 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001996 goto utf8Error;
1997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001999 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2000 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002001 break;
2002
2003 case 4:
2004 if ((s[1] & 0xc0) != 0x80 ||
2005 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002006 (s[3] & 0xc0) != 0x80 ||
2007 ((unsigned char)s[0] == 0xF0 &&
2008 (unsigned char)s[1] < 0x90) ||
2009 ((unsigned char)s[0] == 0xF4 &&
2010 (unsigned char)s[1] > 0x8F)) {
2011 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002012 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002013 endinpos = startinpos + 1;
2014 if ((s[1] & 0xC0) == 0x80) {
2015 endinpos++;
2016 if ((s[2] & 0xC0) == 0x80)
2017 endinpos++;
2018 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002019 goto utf8Error;
2020 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002021 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002022 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2023 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2024
Fredrik Lundh8f455852001-06-27 18:59:43 +00002025#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002026 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002027#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002028 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002029
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 /* translate from 10000..10FFFF to 0..FFFF */
2031 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002032
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002033 /* high surrogate = top 10 bits added to D800 */
2034 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002035
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002036 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002037 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002038#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 }
2041 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002042 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002043
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002044 utf8Error:
2045 outpos = p-PyUnicode_AS_UNICODE(unicode);
2046 if (unicode_decode_call_errorhandler(
2047 errors, &errorHandler,
2048 "utf8", errmsg,
2049 starts, size, &startinpos, &endinpos, &exc, &s,
2050 &unicode, &outpos, &p))
2051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 }
Walter Dörwald69652032004-09-07 20:24:22 +00002053 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002054 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055
2056 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002057 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 goto onError;
2059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 return (PyObject *)unicode;
2063
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002064 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002065 Py_XDECREF(errorHandler);
2066 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 Py_DECREF(unicode);
2068 return NULL;
2069}
2070
Tim Peters602f7402002-04-27 18:03:26 +00002071/* Allocation strategy: if the string is short, convert into a stack buffer
2072 and allocate exactly as much space needed at the end. Else allocate the
2073 maximum possible needed (4 result bytes per Unicode character), and return
2074 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002075*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002076PyObject *
2077PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002078 Py_ssize_t size,
2079 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080{
Tim Peters602f7402002-04-27 18:03:26 +00002081#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002082
Martin v. Löwis18e16552006-02-15 17:27:45 +00002083 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002084 PyObject *v; /* result string object */
2085 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002086 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002087 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002088 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002089
Tim Peters602f7402002-04-27 18:03:26 +00002090 assert(s != NULL);
2091 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092
Tim Peters602f7402002-04-27 18:03:26 +00002093 if (size <= MAX_SHORT_UNICHARS) {
2094 /* Write into the stack buffer; nallocated can't overflow.
2095 * At the end, we'll allocate exactly as much heap space as it
2096 * turns out we need.
2097 */
2098 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2099 v = NULL; /* will allocate after we're done */
2100 p = stackbuf;
2101 }
2102 else {
2103 /* Overallocate on the heap, and give the excess back at the end. */
2104 nallocated = size * 4;
2105 if (nallocated / 4 != size) /* overflow! */
2106 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002107 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002108 if (v == NULL)
2109 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002110 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002111 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112
Tim Peters602f7402002-04-27 18:03:26 +00002113 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002114 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002116 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002119
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002121 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002122 *p++ = (char)(0xc0 | (ch >> 6));
2123 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002124 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002125 else {
Tim Peters602f7402002-04-27 18:03:26 +00002126 /* Encode UCS2 Unicode ordinals */
2127 if (ch < 0x10000) {
2128 /* Special case: check for high surrogate */
2129 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2130 Py_UCS4 ch2 = s[i];
2131 /* Check for low surrogate and combine the two to
2132 form a UCS4 value */
2133 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002134 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002135 i++;
2136 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002137 }
Tim Peters602f7402002-04-27 18:03:26 +00002138 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002139 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002140 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002141 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2142 *p++ = (char)(0x80 | (ch & 0x3f));
2143 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002144 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002145 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002146 /* Encode UCS4 Unicode ordinals */
2147 *p++ = (char)(0xf0 | (ch >> 18));
2148 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2149 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2150 *p++ = (char)(0x80 | (ch & 0x3f));
2151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002153
Tim Peters602f7402002-04-27 18:03:26 +00002154 if (v == NULL) {
2155 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002156 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002157 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002158 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002159 }
2160 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002161 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002162 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002163 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002164 if (_PyString_Resize(&v, nneeded))
2165 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002168
Tim Peters602f7402002-04-27 18:03:26 +00002169#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170}
2171
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2173{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 if (!PyUnicode_Check(unicode)) {
2175 PyErr_BadArgument();
2176 return NULL;
2177 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002178 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002179 PyUnicode_GET_SIZE(unicode),
2180 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181}
2182
Walter Dörwald6e390802007-08-17 16:41:28 +00002183/* --- UTF-32 Codec ------------------------------------------------------- */
2184
2185PyObject *
2186PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002187 Py_ssize_t size,
2188 const char *errors,
2189 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002190{
2191 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2192}
2193
2194PyObject *
2195PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002196 Py_ssize_t size,
2197 const char *errors,
2198 int *byteorder,
2199 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002200{
2201 const char *starts = s;
2202 Py_ssize_t startinpos;
2203 Py_ssize_t endinpos;
2204 Py_ssize_t outpos;
2205 PyUnicodeObject *unicode;
2206 Py_UNICODE *p;
2207#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002208 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002209 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002210#else
2211 const int pairs = 0;
2212#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002213 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002214 int bo = 0; /* assume native ordering by default */
2215 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002216 /* Offsets from q for retrieving bytes in the right order. */
2217#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2218 int iorder[] = {0, 1, 2, 3};
2219#else
2220 int iorder[] = {3, 2, 1, 0};
2221#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002222 PyObject *errorHandler = NULL;
2223 PyObject *exc = NULL;
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002224
Walter Dörwald6e390802007-08-17 16:41:28 +00002225 q = (unsigned char *)s;
2226 e = q + size;
2227
2228 if (byteorder)
2229 bo = *byteorder;
2230
2231 /* Check for BOM marks (U+FEFF) in the input and adjust current
2232 byte order setting accordingly. In native mode, the leading BOM
2233 mark is skipped, in all other modes, it is copied to the output
2234 stream as-is (giving a ZWNBSP character). */
2235 if (bo == 0) {
2236 if (size >= 4) {
2237 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002238 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002240 if (bom == 0x0000FEFF) {
2241 q += 4;
2242 bo = -1;
2243 }
2244 else if (bom == 0xFFFE0000) {
2245 q += 4;
2246 bo = 1;
2247 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002248#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002249 if (bom == 0x0000FEFF) {
2250 q += 4;
2251 bo = 1;
2252 }
2253 else if (bom == 0xFFFE0000) {
2254 q += 4;
2255 bo = -1;
2256 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002257#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002258 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002259 }
2260
2261 if (bo == -1) {
2262 /* force LE */
2263 iorder[0] = 0;
2264 iorder[1] = 1;
2265 iorder[2] = 2;
2266 iorder[3] = 3;
2267 }
2268 else if (bo == 1) {
2269 /* force BE */
2270 iorder[0] = 3;
2271 iorder[1] = 2;
2272 iorder[2] = 1;
2273 iorder[3] = 0;
2274 }
2275
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002276 /* On narrow builds we split characters outside the BMP into two
2277 codepoints => count how much extra space we need. */
2278#ifndef Py_UNICODE_WIDE
2279 for (qq = q; qq < e; qq += 4)
2280 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2281 pairs++;
2282#endif
2283
2284 /* This might be one to much, because of a BOM */
2285 unicode = _PyUnicode_New((size+3)/4+pairs);
2286 if (!unicode)
2287 return NULL;
2288 if (size == 0)
2289 return (PyObject *)unicode;
2290
2291 /* Unpack UTF-32 encoded data */
2292 p = unicode->str;
2293
Walter Dörwald6e390802007-08-17 16:41:28 +00002294 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002295 Py_UCS4 ch;
2296 /* remaining bytes at the end? (size should be divisible by 4) */
2297 if (e-q<4) {
2298 if (consumed)
2299 break;
2300 errmsg = "truncated data";
2301 startinpos = ((const char *)q)-starts;
2302 endinpos = ((const char *)e)-starts;
2303 goto utf32Error;
2304 /* The remaining input chars are ignored if the callback
2305 chooses to skip the input */
2306 }
2307 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2308 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002309
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002310 if (ch >= 0x110000)
2311 {
2312 errmsg = "codepoint not in range(0x110000)";
2313 startinpos = ((const char *)q)-starts;
2314 endinpos = startinpos+4;
2315 goto utf32Error;
2316 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002317#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002318 if (ch >= 0x10000)
2319 {
2320 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2321 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2322 }
2323 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002324#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002325 *p++ = ch;
2326 q += 4;
2327 continue;
2328 utf32Error:
2329 outpos = p-PyUnicode_AS_UNICODE(unicode);
2330 if (unicode_decode_call_errorhandler(
2331 errors, &errorHandler,
2332 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002333 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002334 &unicode, &outpos, &p))
2335 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002336 }
2337
2338 if (byteorder)
2339 *byteorder = bo;
2340
2341 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002342 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002343
2344 /* Adjust length */
2345 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2346 goto onError;
2347
2348 Py_XDECREF(errorHandler);
2349 Py_XDECREF(exc);
2350 return (PyObject *)unicode;
2351
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002352 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002353 Py_DECREF(unicode);
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return NULL;
2357}
2358
2359PyObject *
2360PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002361 Py_ssize_t size,
2362 const char *errors,
2363 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002364{
2365 PyObject *v;
2366 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002367 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002368#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002369 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002370#else
2371 const int pairs = 0;
2372#endif
2373 /* Offsets from p for storing byte pairs in the right order. */
2374#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375 int iorder[] = {0, 1, 2, 3};
2376#else
2377 int iorder[] = {3, 2, 1, 0};
2378#endif
2379
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002380#define STORECHAR(CH) \
2381 do { \
2382 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2383 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2384 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2385 p[iorder[0]] = (CH) & 0xff; \
2386 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002387 } while(0)
2388
2389 /* In narrow builds we can output surrogate pairs as one codepoint,
2390 so we need less space. */
2391#ifndef Py_UNICODE_WIDE
2392 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002393 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2394 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2395 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002396#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002397 nsize = (size - pairs + (byteorder == 0));
2398 bytesize = nsize * 4;
2399 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002400 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002401 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002402 if (v == NULL)
2403 return NULL;
2404
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002405 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002406 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002407 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (size == 0)
2409 return v;
2410
2411 if (byteorder == -1) {
2412 /* force LE */
2413 iorder[0] = 0;
2414 iorder[1] = 1;
2415 iorder[2] = 2;
2416 iorder[3] = 3;
2417 }
2418 else if (byteorder == 1) {
2419 /* force BE */
2420 iorder[0] = 3;
2421 iorder[1] = 2;
2422 iorder[2] = 1;
2423 iorder[3] = 0;
2424 }
2425
2426 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002427 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002428#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002429 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2430 Py_UCS4 ch2 = *s;
2431 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2432 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2433 s++;
2434 size--;
2435 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002436 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002437#endif
2438 STORECHAR(ch);
2439 }
2440 return v;
2441#undef STORECHAR
2442}
2443
2444PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2445{
2446 if (!PyUnicode_Check(unicode)) {
2447 PyErr_BadArgument();
2448 return NULL;
2449 }
2450 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002451 PyUnicode_GET_SIZE(unicode),
2452 NULL,
2453 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002454}
2455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456/* --- UTF-16 Codec ------------------------------------------------------- */
2457
Tim Peters772747b2001-08-09 22:21:55 +00002458PyObject *
2459PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002460 Py_ssize_t size,
2461 const char *errors,
2462 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463{
Walter Dörwald69652032004-09-07 20:24:22 +00002464 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2465}
2466
2467PyObject *
2468PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002469 Py_ssize_t size,
2470 const char *errors,
2471 int *byteorder,
2472 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002475 Py_ssize_t startinpos;
2476 Py_ssize_t endinpos;
2477 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 PyUnicodeObject *unicode;
2479 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002480 const unsigned char *q, *e;
2481 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002482 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002483 /* Offsets from q for retrieving byte pairs in the right order. */
2484#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2485 int ihi = 1, ilo = 0;
2486#else
2487 int ihi = 0, ilo = 1;
2488#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 PyObject *errorHandler = NULL;
2490 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491
2492 /* Note: size will always be longer than the resulting Unicode
2493 character count */
2494 unicode = _PyUnicode_New(size);
2495 if (!unicode)
2496 return NULL;
2497 if (size == 0)
2498 return (PyObject *)unicode;
2499
2500 /* Unpack UTF-16 encoded data */
2501 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002502 q = (unsigned char *)s;
2503 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504
2505 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002506 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002508 /* Check for BOM marks (U+FEFF) in the input and adjust current
2509 byte order setting accordingly. In native mode, the leading BOM
2510 mark is skipped, in all other modes, it is copied to the output
2511 stream as-is (giving a ZWNBSP character). */
2512 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002513 if (size >= 2) {
2514 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002515#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002516 if (bom == 0xFEFF) {
2517 q += 2;
2518 bo = -1;
2519 }
2520 else if (bom == 0xFFFE) {
2521 q += 2;
2522 bo = 1;
2523 }
Tim Petersced69f82003-09-16 20:30:58 +00002524#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002525 if (bom == 0xFEFF) {
2526 q += 2;
2527 bo = 1;
2528 }
2529 else if (bom == 0xFFFE) {
2530 q += 2;
2531 bo = -1;
2532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002533#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536
Tim Peters772747b2001-08-09 22:21:55 +00002537 if (bo == -1) {
2538 /* force LE */
2539 ihi = 1;
2540 ilo = 0;
2541 }
2542 else if (bo == 1) {
2543 /* force BE */
2544 ihi = 0;
2545 ilo = 1;
2546 }
2547
2548 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002549 Py_UNICODE ch;
2550 /* remaining bytes at the end? (size should be even) */
2551 if (e-q<2) {
2552 if (consumed)
2553 break;
2554 errmsg = "truncated data";
2555 startinpos = ((const char *)q)-starts;
2556 endinpos = ((const char *)e)-starts;
2557 goto utf16Error;
2558 /* The remaining input chars are ignored if the callback
2559 chooses to skip the input */
2560 }
2561 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562
Benjamin Peterson857ce152009-01-31 16:29:18 +00002563 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002564
2565 if (ch < 0xD800 || ch > 0xDFFF) {
2566 *p++ = ch;
2567 continue;
2568 }
2569
2570 /* UTF-16 code pair: */
2571 if (q >= e) {
2572 errmsg = "unexpected end of data";
2573 startinpos = (((const char *)q)-2)-starts;
2574 endinpos = ((const char *)e)-starts;
2575 goto utf16Error;
2576 }
2577 if (0xD800 <= ch && ch <= 0xDBFF) {
2578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2579 q += 2;
2580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002581#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002582 *p++ = ch;
2583 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002584#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 continue;
2588 }
2589 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002591 startinpos = (((const char *)q)-4)-starts;
2592 endinpos = startinpos+2;
2593 goto utf16Error;
2594 }
2595
Benjamin Peterson857ce152009-01-31 16:29:18 +00002596 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002597 errmsg = "illegal encoding";
2598 startinpos = (((const char *)q)-2)-starts;
2599 endinpos = startinpos+2;
2600 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002601
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002602 utf16Error:
2603 outpos = p-PyUnicode_AS_UNICODE(unicode);
2604 if (unicode_decode_call_errorhandler(
2605 errors, &errorHandler,
2606 "utf16", errmsg,
2607 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2608 &unicode, &outpos, &p))
2609 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 }
2611
2612 if (byteorder)
2613 *byteorder = bo;
2614
Walter Dörwald69652032004-09-07 20:24:22 +00002615 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002616 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002617
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 goto onError;
2621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002622 Py_XDECREF(errorHandler);
2623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 return (PyObject *)unicode;
2625
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 Py_XDECREF(errorHandler);
2629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 return NULL;
2631}
2632
Tim Peters772747b2001-08-09 22:21:55 +00002633PyObject *
2634PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002635 Py_ssize_t size,
2636 const char *errors,
2637 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638{
2639 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002640 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002641 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002642#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002643 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002644#else
2645 const int pairs = 0;
2646#endif
Tim Peters772747b2001-08-09 22:21:55 +00002647 /* Offsets from p for storing byte pairs in the right order. */
2648#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2649 int ihi = 1, ilo = 0;
2650#else
2651 int ihi = 0, ilo = 1;
2652#endif
2653
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002654#define STORECHAR(CH) \
2655 do { \
2656 p[ihi] = ((CH) >> 8) & 0xff; \
2657 p[ilo] = (CH) & 0xff; \
2658 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002659 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002661#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002662 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002663 if (s[i] >= 0x10000)
2664 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002665#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002666 /* 2 * (size + pairs + (byteorder == 0)) */
2667 if (size > PY_SSIZE_T_MAX ||
2668 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002670 nsize = size + pairs + (byteorder == 0);
2671 bytesize = nsize * 2;
2672 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002673 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002674 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 if (v == NULL)
2676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002678 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002680 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002681 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002682 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002683
2684 if (byteorder == -1) {
2685 /* force LE */
2686 ihi = 1;
2687 ilo = 0;
2688 }
2689 else if (byteorder == 1) {
2690 /* force BE */
2691 ihi = 0;
2692 ilo = 1;
2693 }
2694
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002695 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002696 Py_UNICODE ch = *s++;
2697 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002698#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002699 if (ch >= 0x10000) {
2700 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2701 ch = 0xD800 | ((ch-0x10000) >> 10);
2702 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002703#endif
Tim Peters772747b2001-08-09 22:21:55 +00002704 STORECHAR(ch);
2705 if (ch2)
2706 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002709#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710}
2711
2712PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2713{
2714 if (!PyUnicode_Check(unicode)) {
2715 PyErr_BadArgument();
2716 return NULL;
2717 }
2718 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002719 PyUnicode_GET_SIZE(unicode),
2720 NULL,
2721 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722}
2723
2724/* --- Unicode Escape Codec ----------------------------------------------- */
2725
Fredrik Lundh06d12682001-01-24 07:59:11 +00002726static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002727
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002729 Py_ssize_t size,
2730 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002733 Py_ssize_t startinpos;
2734 Py_ssize_t endinpos;
2735 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002740 char* message;
2741 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 PyObject *errorHandler = NULL;
2743 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 /* Escaped strings will always be longer than the resulting
2746 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 length after conversion to the true value.
2748 (but if the error callback returns a long replacement string
2749 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 v = _PyUnicode_New(size);
2751 if (v == NULL)
2752 goto onError;
2753 if (size == 0)
2754 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 while (s < end) {
2760 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002761 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763
2764 /* Non-escape characters are interpreted as Unicode ordinals */
2765 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002766 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 continue;
2768 }
2769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 /* \ - Escapes */
2772 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002773 c = *s++;
2774 if (s > end)
2775 c = '\0'; /* Invalid after \ */
2776 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002778 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 case '\n': break;
2780 case '\\': *p++ = '\\'; break;
2781 case '\'': *p++ = '\''; break;
2782 case '\"': *p++ = '\"'; break;
2783 case 'b': *p++ = '\b'; break;
2784 case 'f': *p++ = '\014'; break; /* FF */
2785 case 't': *p++ = '\t'; break;
2786 case 'n': *p++ = '\n'; break;
2787 case 'r': *p++ = '\r'; break;
2788 case 'v': *p++ = '\013'; break; /* VT */
2789 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2790
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002791 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 case '0': case '1': case '2': case '3':
2793 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002794 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002795 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002796 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002797 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002798 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 break;
2802
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002803 /* hex escapes */
2804 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806 digits = 2;
2807 message = "truncated \\xXX escape";
2808 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002810 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812 digits = 4;
2813 message = "truncated \\uXXXX escape";
2814 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002816 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002817 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 digits = 8;
2819 message = "truncated \\UXXXXXXXX escape";
2820 hexescape:
2821 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 outpos = p-PyUnicode_AS_UNICODE(v);
2823 if (s+digits>end) {
2824 endinpos = size;
2825 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002826 errors, &errorHandler,
2827 "unicodeescape", "end of string in escape sequence",
2828 starts, size, &startinpos, &endinpos, &exc, &s,
2829 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 goto onError;
2831 goto nextByte;
2832 }
2833 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002834 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002835 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 endinpos = (s+i+1)-starts;
2837 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002838 errors, &errorHandler,
2839 "unicodeescape", message,
2840 starts, size, &startinpos, &endinpos, &exc, &s,
2841 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002842 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 }
2845 chr = (chr<<4) & ~0xF;
2846 if (c >= '0' && c <= '9')
2847 chr += c - '0';
2848 else if (c >= 'a' && c <= 'f')
2849 chr += 10 + c - 'a';
2850 else
2851 chr += 10 + c - 'A';
2852 }
2853 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002854 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 /* _decoding_error will have already written into the
2856 target buffer. */
2857 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002858 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002859 /* when we get here, chr is a 32-bit unicode character */
2860 if (chr <= 0xffff)
2861 /* UCS-2 character */
2862 *p++ = (Py_UNICODE) chr;
2863 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002864 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002865 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002866#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002867 *p++ = chr;
2868#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 chr -= 0x10000L;
2870 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002871 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002872#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002873 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874 endinpos = s-starts;
2875 outpos = p-PyUnicode_AS_UNICODE(v);
2876 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002877 errors, &errorHandler,
2878 "unicodeescape", "illegal Unicode character",
2879 starts, size, &startinpos, &endinpos, &exc, &s,
2880 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002881 goto onError;
2882 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 break;
2884
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002885 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002886 case 'N':
2887 message = "malformed \\N character escape";
2888 if (ucnhash_CAPI == NULL) {
2889 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002890 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002891 if (ucnhash_CAPI == NULL)
2892 goto ucnhashError;
2893 }
2894 if (*s == '{') {
2895 const char *start = s+1;
2896 /* look for the closing brace */
2897 while (*s != '}' && s < end)
2898 s++;
2899 if (s > start && s < end && *s == '}') {
2900 /* found a name. look it up in the unicode database */
2901 message = "unknown Unicode character name";
2902 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002904 goto store;
2905 }
2906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002910 errors, &errorHandler,
2911 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002915 break;
2916
2917 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002918 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002919 message = "\\ at end of string";
2920 s--;
2921 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002924 errors, &errorHandler,
2925 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002928 goto onError;
2929 }
2930 else {
2931 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1];
2933 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002934 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002936 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002945 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2949 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002950 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002953 return NULL;
2954
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002955 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 return NULL;
2960}
2961
2962/* Return a Unicode-Escape string version of the Unicode object.
2963
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2965 appropriate.
2966
2967*/
2968
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002969Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002970 Py_ssize_t size,
2971 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002972{
2973 /* like wcschr, but doesn't stop at NULL characters */
2974
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2979 }
2980
2981 return NULL;
2982}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002983
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984static
2985PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002986 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 int quotes)
2988{
2989 PyObject *repr;
2990 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002992 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002993#ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995#else
2996 const Py_ssize_t expandsize = 6;
2997#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Neal Norwitz17753ec2006-08-21 22:21:19 +00002999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3002 */
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3005
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3011
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3015 */
3016
Neal Norwitze7d8be82008-07-31 17:17:14 +00003017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003018 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003019
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003020 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003021 2
3022 + expandsize*size
3023 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (repr == NULL)
3025 return NULL;
3026
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003027 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003031 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 !findchar(s, size, '"')) ? '"' : '\'';
3033 }
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003036
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003037 /* Escape quotes and backslashes */
3038 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 *p++ = '\\';
3041 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003042 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003043 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003044
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003045#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003057 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003058 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003059 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003060#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003066 ch2 = *s++;
3067 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3081 }
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003085 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003086#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003087
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003089 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 *p++ = '\\';
3091 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3102 }
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3106 }
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3110 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003111
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003112 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003113 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003115 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003119
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3123 }
3124 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003125 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126
3127 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003128 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 return repr;
3131}
3132
3133PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003134 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135{
3136 return unicodeescape_string(s, size, 0);
3137}
3138
3139PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140{
3141 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument();
3143 return NULL;
3144 }
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003146 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147}
3148
3149/* --- Raw Unicode Escape Codec ------------------------------------------- */
3150
3151PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 Py_ssize_t size,
3153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003156 Py_ssize_t startinpos;
3157 Py_ssize_t endinpos;
3158 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 const char *end;
3162 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 PyObject *errorHandler = NULL;
3164 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 /* Escaped strings will always be longer than the resulting
3167 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 length after conversion to the true value. (But decoding error
3169 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 v = _PyUnicode_New(size);
3171 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003174 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 end = s + size;
3177 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 unsigned char c;
3179 Py_UCS4 x;
3180 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003183 /* Non-escape characters are interpreted as Unicode ordinals */
3184 if (*s != '\\') {
3185 *p++ = (unsigned char)*s++;
3186 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003187 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003188 startinpos = s-starts;
3189
3190 /* \u-escapes are only interpreted iff the number of leading
3191 backslashes if odd */
3192 bs = s;
3193 for (;s < end;) {
3194 if (*s != '\\')
3195 break;
3196 *p++ = (unsigned char)*s++;
3197 }
3198 if (((s - bs) & 1) == 0 ||
3199 s >= end ||
3200 (*s != 'u' && *s != 'U')) {
3201 continue;
3202 }
3203 p--;
3204 count = *s=='u' ? 4 : 8;
3205 s++;
3206
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208 outpos = p-PyUnicode_AS_UNICODE(v);
3209 for (x = 0, i = 0; i < count; ++i, ++s) {
3210 c = (unsigned char)*s;
3211 if (!isxdigit(c)) {
3212 endinpos = s-starts;
3213 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler,
3215 "rawunicodeescape", "truncated \\uXXXX",
3216 starts, size, &startinpos, &endinpos, &exc, &s,
3217 &v, &outpos, &p))
3218 goto onError;
3219 goto nextByte;
3220 }
3221 x = (x<<4) & ~0xF;
3222 if (c >= '0' && c <= '9')
3223 x += c - '0';
3224 else if (c >= 'a' && c <= 'f')
3225 x += 10 + c - 'a';
3226 else
3227 x += 10 + c - 'A';
3228 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003229 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003230 /* UCS-2 character */
3231 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003232 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 /* UCS-4 character. Either store directly, or as
3234 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003237#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 x -= 0x10000L;
3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#endif
3242 } else {
3243 endinpos = s-starts;
3244 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003245 if (unicode_decode_call_errorhandler(
3246 errors, &errorHandler,
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 starts, size, &startinpos, &endinpos, &exc, &s,
3249 &v, &outpos, &p))
3250 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 nextByte:
3253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003260
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return NULL;
3266}
3267
3268PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270{
3271 PyObject *repr;
3272 char *p;
3273 char *q;
3274
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003275 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003276#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003277 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003279 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003281
Neal Norwitze7d8be82008-07-31 17:17:14 +00003282 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003284
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 if (repr == NULL)
3287 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003288 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003291 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 while (size-- > 0) {
3293 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003294#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 /* Map 32-bit characters to '\Uxxxxxxxx' */
3296 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003297 *p++ = '\\';
3298 *p++ = 'U';
3299 *p++ = hexdigit[(ch >> 28) & 0xf];
3300 *p++ = hexdigit[(ch >> 24) & 0xf];
3301 *p++ = hexdigit[(ch >> 20) & 0xf];
3302 *p++ = hexdigit[(ch >> 16) & 0xf];
3303 *p++ = hexdigit[(ch >> 12) & 0xf];
3304 *p++ = hexdigit[(ch >> 8) & 0xf];
3305 *p++ = hexdigit[(ch >> 4) & 0xf];
3306 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003307 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003308 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003309#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311 if (ch >= 0xD800 && ch < 0xDC00) {
3312 Py_UNICODE ch2;
3313 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003315 ch2 = *s++;
3316 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319 *p++ = '\\';
3320 *p++ = 'U';
3321 *p++ = hexdigit[(ucs >> 28) & 0xf];
3322 *p++ = hexdigit[(ucs >> 24) & 0xf];
3323 *p++ = hexdigit[(ucs >> 20) & 0xf];
3324 *p++ = hexdigit[(ucs >> 16) & 0xf];
3325 *p++ = hexdigit[(ucs >> 12) & 0xf];
3326 *p++ = hexdigit[(ucs >> 8) & 0xf];
3327 *p++ = hexdigit[(ucs >> 4) & 0xf];
3328 *p++ = hexdigit[ucs & 0xf];
3329 continue;
3330 }
3331 /* Fall through: isolated surrogates are copied as-is */
3332 s--;
3333 size++;
3334 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003335#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003336 /* Map 16-bit characters to '\uxxxx' */
3337 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 *p++ = '\\';
3339 *p++ = 'u';
3340 *p++ = hexdigit[(ch >> 12) & 0xf];
3341 *p++ = hexdigit[(ch >> 8) & 0xf];
3342 *p++ = hexdigit[(ch >> 4) & 0xf];
3343 *p++ = hexdigit[ch & 15];
3344 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 /* Copy everything else as-is */
3346 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 *p++ = (char) ch;
3348 }
3349 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003350 if (_PyString_Resize(&repr, p - q))
3351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return repr;
3353}
3354
3355PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3356{
3357 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003358 PyErr_BadArgument();
3359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003362 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363}
3364
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003365/* --- Unicode Internal Codec ------------------------------------------- */
3366
3367PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 Py_ssize_t size,
3369 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003370{
3371 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 Py_ssize_t startinpos;
3373 Py_ssize_t endinpos;
3374 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 PyUnicodeObject *v;
3376 Py_UNICODE *p;
3377 const char *end;
3378 const char *reason;
3379 PyObject *errorHandler = NULL;
3380 PyObject *exc = NULL;
3381
Neal Norwitzd43069c2006-01-08 01:12:10 +00003382#ifdef Py_UNICODE_WIDE
3383 Py_UNICODE unimax = PyUnicode_GetMax();
3384#endif
3385
Armin Rigo7ccbca92006-10-04 12:17:45 +00003386 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003387 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3388 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003390 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003392 p = PyUnicode_AS_UNICODE(v);
3393 end = s + size;
3394
3395 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003396 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003397 /* We have to sanity check the raw data, otherwise doom looms for
3398 some malformed UCS-4 data. */
3399 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003400#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003401 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003402#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 end-s < Py_UNICODE_SIZE
3404 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003405 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003406 startinpos = s - starts;
3407 if (end-s < Py_UNICODE_SIZE) {
3408 endinpos = end-starts;
3409 reason = "truncated input";
3410 }
3411 else {
3412 endinpos = s - starts + Py_UNICODE_SIZE;
3413 reason = "illegal code point (> 0x10FFFF)";
3414 }
3415 outpos = p - PyUnicode_AS_UNICODE(v);
3416 if (unicode_decode_call_errorhandler(
3417 errors, &errorHandler,
3418 "unicode_internal", reason,
3419 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003420 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003421 goto onError;
3422 }
3423 }
3424 else {
3425 p++;
3426 s += Py_UNICODE_SIZE;
3427 }
3428 }
3429
Martin v. Löwis412fb672006-04-13 06:34:32 +00003430 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003431 goto onError;
3432 Py_XDECREF(errorHandler);
3433 Py_XDECREF(exc);
3434 return (PyObject *)v;
3435
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003436 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 Py_XDECREF(v);
3438 Py_XDECREF(errorHandler);
3439 Py_XDECREF(exc);
3440 return NULL;
3441}
3442
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443/* --- Latin-1 Codec ------------------------------------------------------ */
3444
3445PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003446 Py_ssize_t size,
3447 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448{
3449 PyUnicodeObject *v;
3450 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003451
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003453 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003454 Py_UNICODE r = *(unsigned char*)s;
3455 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003456 }
3457
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 v = _PyUnicode_New(size);
3459 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003462 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 p = PyUnicode_AS_UNICODE(v);
3464 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003465 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 Py_XDECREF(v);
3470 return NULL;
3471}
3472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473/* create or adjust a UnicodeEncodeError */
3474static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003475 const char *encoding,
3476 const Py_UNICODE *unicode, Py_ssize_t size,
3477 Py_ssize_t startpos, Py_ssize_t endpos,
3478 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 *exceptionObject = PyUnicodeEncodeError_Create(
3482 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 }
3484 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003485 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3486 goto onError;
3487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3488 goto onError;
3489 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3490 goto onError;
3491 return;
3492 onError:
3493 Py_DECREF(*exceptionObject);
3494 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 }
3496}
3497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498/* raises a UnicodeEncodeError */
3499static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003500 const char *encoding,
3501 const Py_UNICODE *unicode, Py_ssize_t size,
3502 Py_ssize_t startpos, Py_ssize_t endpos,
3503 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504{
3505 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509}
3510
3511/* error handling callback helper:
3512 build arguments, call the callback and check the arguments,
3513 put the result into newpos and return the replacement string, which
3514 has to be freed by the caller */
3515static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003516 PyObject **errorHandler,
3517 const char *encoding, const char *reason,
3518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3519 Py_ssize_t startpos, Py_ssize_t endpos,
3520 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003522 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523
3524 PyObject *restuple;
3525 PyObject *resunicode;
3526
3527 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 }
3532
3533 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537
3538 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003539 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003543 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003544 Py_DECREF(restuple);
3545 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 }
3547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003548 &resunicode, newpos)) {
3549 Py_DECREF(restuple);
3550 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 }
3552 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003553 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003554 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3556 Py_DECREF(restuple);
3557 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003558 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_INCREF(resunicode);
3560 Py_DECREF(restuple);
3561 return resunicode;
3562}
3563
3564static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003565 Py_ssize_t size,
3566 const char *errors,
3567 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568{
3569 /* output object */
3570 PyObject *res;
3571 /* pointers to the beginning and end+1 of input */
3572 const Py_UNICODE *startp = p;
3573 const Py_UNICODE *endp = p + size;
3574 /* pointer to the beginning of the unencodable characters */
3575 /* const Py_UNICODE *badp = NULL; */
3576 /* pointer into the output */
3577 char *str;
3578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t respos = 0;
3580 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003581 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 PyObject *errorHandler = NULL;
3584 PyObject *exc = NULL;
3585 /* the following variable is used for caching string comparisons
3586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587 int known_errorHandler = -1;
3588
3589 /* allocate enough for a simple encoding without
3590 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003591 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 if (res == NULL)
3593 goto onError;
3594 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003595 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003596 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 ressize = size;
3598
3599 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003600 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003602 /* can we encode this? */
3603 if (c<limit) {
3604 /* no overflow check, because we know that the space is enough */
3605 *str++ = (char)c;
3606 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003607 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003608 else {
3609 Py_ssize_t unicodepos = p-startp;
3610 Py_ssize_t requiredsize;
3611 PyObject *repunicode;
3612 Py_ssize_t repsize;
3613 Py_ssize_t newpos;
3614 Py_ssize_t respos;
3615 Py_UNICODE *uni2;
3616 /* startpos for collecting unencodable chars */
3617 const Py_UNICODE *collstart = p;
3618 const Py_UNICODE *collend = p;
3619 /* find all unecodable characters */
3620 while ((collend < endp) && ((*collend)>=limit))
3621 ++collend;
3622 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3623 if (known_errorHandler==-1) {
3624 if ((errors==NULL) || (!strcmp(errors, "strict")))
3625 known_errorHandler = 1;
3626 else if (!strcmp(errors, "replace"))
3627 known_errorHandler = 2;
3628 else if (!strcmp(errors, "ignore"))
3629 known_errorHandler = 3;
3630 else if (!strcmp(errors, "xmlcharrefreplace"))
3631 known_errorHandler = 4;
3632 else
3633 known_errorHandler = 0;
3634 }
3635 switch (known_errorHandler) {
3636 case 1: /* strict */
3637 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3638 goto onError;
3639 case 2: /* replace */
3640 while (collstart++<collend)
3641 *str++ = '?'; /* fall through */
3642 case 3: /* ignore */
3643 p = collend;
3644 break;
3645 case 4: /* xmlcharrefreplace */
3646 respos = str-PyString_AS_STRING(res);
3647 /* determine replacement size (temporarily (mis)uses p) */
3648 for (p = collstart, repsize = 0; p < collend; ++p) {
3649 if (*p<10)
3650 repsize += 2+1+1;
3651 else if (*p<100)
3652 repsize += 2+2+1;
3653 else if (*p<1000)
3654 repsize += 2+3+1;
3655 else if (*p<10000)
3656 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003657#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003658 else
3659 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003660#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003661 else if (*p<100000)
3662 repsize += 2+5+1;
3663 else if (*p<1000000)
3664 repsize += 2+6+1;
3665 else
3666 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003667#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003668 }
3669 requiredsize = respos+repsize+(endp-collend);
3670 if (requiredsize > ressize) {
3671 if (requiredsize<2*ressize)
3672 requiredsize = 2*ressize;
3673 if (_PyString_Resize(&res, requiredsize))
3674 goto onError;
3675 str = PyString_AS_STRING(res) + respos;
3676 ressize = requiredsize;
3677 }
3678 /* generate replacement (temporarily (mis)uses p) */
3679 for (p = collstart; p < collend; ++p) {
3680 str += sprintf(str, "&#%d;", (int)*p);
3681 }
3682 p = collend;
3683 break;
3684 default:
3685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3686 encoding, reason, startp, size, &exc,
3687 collstart-startp, collend-startp, &newpos);
3688 if (repunicode == NULL)
3689 goto onError;
3690 /* need more space? (at least enough for what we have+the
3691 replacement+the rest of the string, so we won't have to
3692 check space for encodable characters) */
3693 respos = str-PyString_AS_STRING(res);
3694 repsize = PyUnicode_GET_SIZE(repunicode);
3695 requiredsize = respos+repsize+(endp-collend);
3696 if (requiredsize > ressize) {
3697 if (requiredsize<2*ressize)
3698 requiredsize = 2*ressize;
3699 if (_PyString_Resize(&res, requiredsize)) {
3700 Py_DECREF(repunicode);
3701 goto onError;
3702 }
3703 str = PyString_AS_STRING(res) + respos;
3704 ressize = requiredsize;
3705 }
3706 /* check if there is anything unencodable in the replacement
3707 and copy it to the output */
3708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3709 c = *uni2;
3710 if (c >= limit) {
3711 raise_encode_exception(&exc, encoding, startp, size,
3712 unicodepos, unicodepos+1, reason);
3713 Py_DECREF(repunicode);
3714 goto onError;
3715 }
3716 *str = (char)c;
3717 }
3718 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003719 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003720 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003721 }
3722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003724 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003726 /* If this falls res will be NULL */
3727 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 Py_XDECREF(errorHandler);
3729 Py_XDECREF(exc);
3730 return res;
3731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 Py_XDECREF(res);
3734 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc);
3736 return NULL;
3737}
3738
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003740 Py_ssize_t size,
3741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744}
3745
3746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3747{
3748 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 PyErr_BadArgument();
3750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 }
3752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003753 PyUnicode_GET_SIZE(unicode),
3754 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755}
3756
3757/* --- 7-bit ASCII Codec -------------------------------------------------- */
3758
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003760 Py_ssize_t size,
3761 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 PyUnicodeObject *v;
3765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003766 Py_ssize_t startinpos;
3767 Py_ssize_t endinpos;
3768 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 const char *e;
3770 PyObject *errorHandler = NULL;
3771 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003772
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003774 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003775 Py_UNICODE r = *(unsigned char*)s;
3776 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003777 }
Tim Petersced69f82003-09-16 20:30:58 +00003778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 v = _PyUnicode_New(size);
3780 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003783 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 e = s + size;
3786 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 register unsigned char c = (unsigned char)*s;
3788 if (c < 128) {
3789 *p++ = c;
3790 ++s;
3791 }
3792 else {
3793 startinpos = s-starts;
3794 endinpos = startinpos + 1;
3795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3796 if (unicode_decode_call_errorhandler(
3797 errors, &errorHandler,
3798 "ascii", "ordinal not in range(128)",
3799 starts, size, &startinpos, &endinpos, &exc, &s,
3800 &v, &outpos, &p))
3801 goto onError;
3802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3806 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 Py_XDECREF(errorHandler);
3808 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003810
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 Py_XDECREF(errorHandler);
3814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return NULL;
3816}
3817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003819 Py_ssize_t size,
3820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823}
3824
3825PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3826{
3827 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003828 PyErr_BadArgument();
3829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 }
3831 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003832 PyUnicode_GET_SIZE(unicode),
3833 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834}
3835
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003836#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003839
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003840#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003841#define NEED_RETRY
3842#endif
3843
3844/* XXX This code is limited to "true" double-byte encodings, as
3845 a) it assumes an incomplete character consists of a single byte, and
3846 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003847 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003848
/* Decide whether the byte at s[offset] is a DBCS lead byte.

   IsDBCSLeadByte() alone only classifies the byte value, so the preceding
   character (as located by CharPrev()) is consulted as well.  Returns 1
   when the byte is accepted as a lead byte, 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        /* CharPrev() did not step back, so nothing precedes curr. */
        return 1;
    if (!IsDBCSLeadByte(*prev))
        /* The previous character does not start with a lead-byte value. */
        return 1;
    /* The previous character starts with a lead-byte value: accept curr
       only if that character is two bytes wide. */
    return curr - prev == 2;
}
3859
3860/*
3861 * Decode MBCS string into unicode object. If 'final' is set, converts
3862 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3863 */
3864static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003865 const char *s, /* MBCS string */
3866 int size, /* sizeof MBCS string */
3867 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003868{
3869 Py_UNICODE *p;
3870 Py_ssize_t n = 0;
3871 int usize = 0;
3872
3873 assert(size >= 0);
3874
3875 /* Skip trailing lead-byte unless 'final' is set */
3876 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003877 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003878
3879 /* First get the size of the result */
3880 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003881 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3882 if (usize == 0) {
3883 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3884 return -1;
3885 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 }
3887
3888 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003889 /* Create unicode object */
3890 *v = _PyUnicode_New(usize);
3891 if (*v == NULL)
3892 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 }
3894 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003895 /* Extend unicode object */
3896 n = PyUnicode_GET_SIZE(*v);
3897 if (_PyUnicode_Resize(v, n + usize) < 0)
3898 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003899 }
3900
3901 /* Do the conversion */
3902 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003903 p = PyUnicode_AS_UNICODE(*v) + n;
3904 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3905 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3906 return -1;
3907 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003908 }
3909
3910 return size;
3911}
3912
3913PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003914 Py_ssize_t size,
3915 const char *errors,
3916 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003917{
3918 PyUnicodeObject *v = NULL;
3919 int done;
3920
3921 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003922 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923
3924#ifdef NEED_RETRY
3925 retry:
3926 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003927 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003928 else
3929#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003930 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003931
3932 if (done < 0) {
3933 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003934 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003935 }
3936
3937 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003938 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003939
3940#ifdef NEED_RETRY
3941 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003942 s += done;
3943 size -= done;
3944 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945 }
3946#endif
3947
3948 return (PyObject *)v;
3949}
3950
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003951PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003952 Py_ssize_t size,
3953 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003954{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003955 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3956}
3957
3958/*
3959 * Convert unicode into string object (MBCS).
3960 * Returns 0 if succeed, -1 otherwise.
3961 */
3962static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003963 const Py_UNICODE *p, /* unicode */
3964 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003965{
3966 int mbcssize = 0;
3967 Py_ssize_t n = 0;
3968
3969 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003970
3971 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3974 if (mbcssize == 0) {
3975 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3976 return -1;
3977 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003978 }
3979
Martin v. Löwisd8251432006-06-14 05:21:04 +00003980 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003981 /* Create string object */
3982 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3983 if (*repr == NULL)
3984 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003985 }
3986 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 /* Extend string object */
3988 n = PyString_Size(*repr);
3989 if (_PyString_Resize(repr, n + mbcssize) < 0)
3990 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003991 }
3992
3993 /* Do the conversion */
3994 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003995 char *s = PyString_AS_STRING(*repr) + n;
3996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3997 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998 return -1;
3999 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004000 }
4001
4002 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004003}
4004
4005PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004006 Py_ssize_t size,
4007 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004008{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004009 PyObject *repr = NULL;
4010 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004011
Martin v. Löwisd8251432006-06-14 05:21:04 +00004012#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004013 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004014 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004016 else
4017#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004018 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004019
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004021 Py_XDECREF(repr);
4022 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004023 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004024
4025#ifdef NEED_RETRY
4026 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 p += INT_MAX;
4028 size -= INT_MAX;
4029 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030 }
4031#endif
4032
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004033 return repr;
4034}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004035
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004036PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4037{
4038 if (!PyUnicode_Check(unicode)) {
4039 PyErr_BadArgument();
4040 return NULL;
4041 }
4042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004043 PyUnicode_GET_SIZE(unicode),
4044 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004045}
4046
Martin v. Löwisd8251432006-06-14 05:21:04 +00004047#undef NEED_RETRY
4048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004049#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051/* --- Character Mapping Codec -------------------------------------------- */
4052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004054 Py_ssize_t size,
4055 PyObject *mapping,
4056 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t startinpos;
4060 Py_ssize_t endinpos;
4061 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 PyUnicodeObject *v;
4064 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 PyObject *errorHandler = NULL;
4067 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004068 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004069 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004070
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 /* Default to Latin-1 */
4072 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004073 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074
4075 v = _PyUnicode_New(size);
4076 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004082 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 mapstring = PyUnicode_AS_UNICODE(mapping);
4084 maplen = PyUnicode_GET_SIZE(mapping);
4085 while (s < e) {
4086 unsigned char ch = *s;
4087 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 if (ch < maplen)
4090 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004092 if (x == 0xfffe) {
4093 /* undefined mapping */
4094 outpos = p-PyUnicode_AS_UNICODE(v);
4095 startinpos = s-starts;
4096 endinpos = startinpos+1;
4097 if (unicode_decode_call_errorhandler(
4098 errors, &errorHandler,
4099 "charmap", "character maps to <undefined>",
4100 starts, size, &startinpos, &endinpos, &exc, &s,
4101 &v, &outpos, &p)) {
4102 goto onError;
4103 }
4104 continue;
4105 }
4106 *p++ = x;
4107 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004108 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004109 }
4110 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004111 while (s < e) {
4112 unsigned char ch = *s;
4113 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004114
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004115 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4116 w = PyInt_FromLong((long)ch);
4117 if (w == NULL)
4118 goto onError;
4119 x = PyObject_GetItem(mapping, w);
4120 Py_DECREF(w);
4121 if (x == NULL) {
4122 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4123 /* No mapping found means: mapping is undefined. */
4124 PyErr_Clear();
4125 x = Py_None;
4126 Py_INCREF(x);
4127 } else
4128 goto onError;
4129 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004130
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004131 /* Apply mapping */
4132 if (PyInt_Check(x)) {
4133 long value = PyInt_AS_LONG(x);
4134 if (value < 0 || value > 65535) {
4135 PyErr_SetString(PyExc_TypeError,
4136 "character mapping must be in range(65536)");
4137 Py_DECREF(x);
4138 goto onError;
4139 }
4140 *p++ = (Py_UNICODE)value;
4141 }
4142 else if (x == Py_None) {
4143 /* undefined mapping */
4144 outpos = p-PyUnicode_AS_UNICODE(v);
4145 startinpos = s-starts;
4146 endinpos = startinpos+1;
4147 if (unicode_decode_call_errorhandler(
4148 errors, &errorHandler,
4149 "charmap", "character maps to <undefined>",
4150 starts, size, &startinpos, &endinpos, &exc, &s,
4151 &v, &outpos, &p)) {
4152 Py_DECREF(x);
4153 goto onError;
4154 }
4155 Py_DECREF(x);
4156 continue;
4157 }
4158 else if (PyUnicode_Check(x)) {
4159 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004160
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004161 if (targetsize == 1)
4162 /* 1-1 mapping */
4163 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004164
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004165 else if (targetsize > 1) {
4166 /* 1-n mapping */
4167 if (targetsize > extrachars) {
4168 /* resize first */
4169 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4170 Py_ssize_t needed = (targetsize - extrachars) + \
4171 (targetsize << 2);
4172 extrachars += needed;
4173 /* XXX overflow detection missing */
4174 if (_PyUnicode_Resize(&v,
4175 PyUnicode_GET_SIZE(v) + needed) < 0) {
4176 Py_DECREF(x);
4177 goto onError;
4178 }
4179 p = PyUnicode_AS_UNICODE(v) + oldpos;
4180 }
4181 Py_UNICODE_COPY(p,
4182 PyUnicode_AS_UNICODE(x),
4183 targetsize);
4184 p += targetsize;
4185 extrachars -= targetsize;
4186 }
4187 /* 1-0 mapping: skip the character */
4188 }
4189 else {
4190 /* wrong return value */
4191 PyErr_SetString(PyExc_TypeError,
4192 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004193 Py_DECREF(x);
4194 goto onError;
4195 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004196 Py_DECREF(x);
4197 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 }
4200 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004201 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4202 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 Py_XDECREF(errorHandler);
4204 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004206
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004207 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 Py_XDECREF(errorHandler);
4209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 Py_XDECREF(v);
4211 return NULL;
4212}
4213
Martin v. Löwis3f767792006-06-04 19:36:28 +00004214/* Charmap encoding: the lookup table */
4215
4216struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004217 PyObject_HEAD
4218 unsigned char level1[32];
4219 int count2, count3;
4220 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004221};
4222
4223static PyObject*
4224encoding_map_size(PyObject *obj, PyObject* args)
4225{
4226 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004227 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004228 128*map->count3);
4229}
4230
4231static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004232 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004233 PyDoc_STR("Return the size (in bytes) of this object") },
4234 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004235};
4236
4237static void
4238encoding_map_dealloc(PyObject* o)
4239{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004240 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004241}
4242
4243static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004244 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004245 "EncodingMap", /*tp_name*/
4246 sizeof(struct encoding_map), /*tp_basicsize*/
4247 0, /*tp_itemsize*/
4248 /* methods */
4249 encoding_map_dealloc, /*tp_dealloc*/
4250 0, /*tp_print*/
4251 0, /*tp_getattr*/
4252 0, /*tp_setattr*/
4253 0, /*tp_compare*/
4254 0, /*tp_repr*/
4255 0, /*tp_as_number*/
4256 0, /*tp_as_sequence*/
4257 0, /*tp_as_mapping*/
4258 0, /*tp_hash*/
4259 0, /*tp_call*/
4260 0, /*tp_str*/
4261 0, /*tp_getattro*/
4262 0, /*tp_setattro*/
4263 0, /*tp_as_buffer*/
4264 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4265 0, /*tp_doc*/
4266 0, /*tp_traverse*/
4267 0, /*tp_clear*/
4268 0, /*tp_richcompare*/
4269 0, /*tp_weaklistoffset*/
4270 0, /*tp_iter*/
4271 0, /*tp_iternext*/
4272 encoding_map_methods, /*tp_methods*/
4273 0, /*tp_members*/
4274 0, /*tp_getset*/
4275 0, /*tp_base*/
4276 0, /*tp_dict*/
4277 0, /*tp_descr_get*/
4278 0, /*tp_descr_set*/
4279 0, /*tp_dictoffset*/
4280 0, /*tp_init*/
4281 0, /*tp_alloc*/
4282 0, /*tp_new*/
4283 0, /*tp_free*/
4284 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004285};
4286
4287PyObject*
4288PyUnicode_BuildEncodingMap(PyObject* string)
4289{
4290 Py_UNICODE *decode;
4291 PyObject *result;
4292 struct encoding_map *mresult;
4293 int i;
4294 int need_dict = 0;
4295 unsigned char level1[32];
4296 unsigned char level2[512];
4297 unsigned char *mlevel1, *mlevel2, *mlevel3;
4298 int count2 = 0, count3 = 0;
4299
4300 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4301 PyErr_BadArgument();
4302 return NULL;
4303 }
4304 decode = PyUnicode_AS_UNICODE(string);
4305 memset(level1, 0xFF, sizeof level1);
4306 memset(level2, 0xFF, sizeof level2);
4307
4308 /* If there isn't a one-to-one mapping of NULL to \0,
4309 or if there are non-BMP characters, we need to use
4310 a mapping dictionary. */
4311 if (decode[0] != 0)
4312 need_dict = 1;
4313 for (i = 1; i < 256; i++) {
4314 int l1, l2;
4315 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004316#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004317 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004318#endif
4319 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004320 need_dict = 1;
4321 break;
4322 }
4323 if (decode[i] == 0xFFFE)
4324 /* unmapped character */
4325 continue;
4326 l1 = decode[i] >> 11;
4327 l2 = decode[i] >> 7;
4328 if (level1[l1] == 0xFF)
4329 level1[l1] = count2++;
4330 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004331 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004332 }
4333
4334 if (count2 >= 0xFF || count3 >= 0xFF)
4335 need_dict = 1;
4336
4337 if (need_dict) {
4338 PyObject *result = PyDict_New();
4339 PyObject *key, *value;
4340 if (!result)
4341 return NULL;
4342 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004343 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004344 key = PyInt_FromLong(decode[i]);
4345 value = PyInt_FromLong(i);
4346 if (!key || !value)
4347 goto failed1;
4348 if (PyDict_SetItem(result, key, value) == -1)
4349 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004350 Py_DECREF(key);
4351 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004352 }
4353 return result;
4354 failed1:
4355 Py_XDECREF(key);
4356 Py_XDECREF(value);
4357 Py_DECREF(result);
4358 return NULL;
4359 }
4360
4361 /* Create a three-level trie */
4362 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4363 16*count2 + 128*count3 - 1);
4364 if (!result)
4365 return PyErr_NoMemory();
4366 PyObject_Init(result, &EncodingMapType);
4367 mresult = (struct encoding_map*)result;
4368 mresult->count2 = count2;
4369 mresult->count3 = count3;
4370 mlevel1 = mresult->level1;
4371 mlevel2 = mresult->level23;
4372 mlevel3 = mresult->level23 + 16*count2;
4373 memcpy(mlevel1, level1, 32);
4374 memset(mlevel2, 0xFF, 16*count2);
4375 memset(mlevel3, 0, 128*count3);
4376 count3 = 0;
4377 for (i = 1; i < 256; i++) {
4378 int o1, o2, o3, i2, i3;
4379 if (decode[i] == 0xFFFE)
4380 /* unmapped character */
4381 continue;
4382 o1 = decode[i]>>11;
4383 o2 = (decode[i]>>7) & 0xF;
4384 i2 = 16*mlevel1[o1] + o2;
4385 if (mlevel2[i2] == 0xFF)
4386 mlevel2[i2] = count3++;
4387 o3 = decode[i] & 0x7F;
4388 i3 = 128*mlevel2[i2] + o3;
4389 mlevel3[i3] = i;
4390 }
4391 return result;
4392}
4393
4394static int
4395encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4396{
4397 struct encoding_map *map = (struct encoding_map*)mapping;
4398 int l1 = c>>11;
4399 int l2 = (c>>7) & 0xF;
4400 int l3 = c & 0x7F;
4401 int i;
4402
4403#ifdef Py_UNICODE_WIDE
4404 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004405 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004406 }
4407#endif
4408 if (c == 0)
4409 return 0;
4410 /* level 1*/
4411 i = map->level1[l1];
4412 if (i == 0xFF) {
4413 return -1;
4414 }
4415 /* level 2*/
4416 i = map->level23[16*i+l2];
4417 if (i == 0xFF) {
4418 return -1;
4419 }
4420 /* level 3 */
4421 i = map->level23[16*map->count2 + 128*i + l3];
4422 if (i == 0) {
4423 return -1;
4424 }
4425 return i;
4426}
4427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428/* Lookup the character ch in the mapping. If the character
4429 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004430 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 PyObject *w = PyInt_FromLong((long)c);
4434 PyObject *x;
4435
4436 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004437 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 x = PyObject_GetItem(mapping, w);
4439 Py_DECREF(w);
4440 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004441 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4442 /* No mapping found means: mapping is undefined. */
4443 PyErr_Clear();
4444 x = Py_None;
4445 Py_INCREF(x);
4446 return x;
4447 } else
4448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004450 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004451 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004453 long value = PyInt_AS_LONG(x);
4454 if (value < 0 || value > 255) {
4455 PyErr_SetString(PyExc_TypeError,
4456 "character mapping must be in range(256)");
4457 Py_DECREF(x);
4458 return NULL;
4459 }
4460 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004462 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004463 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004465 /* wrong return value */
4466 PyErr_SetString(PyExc_TypeError,
4467 "character mapping must return integer, None or str");
4468 Py_DECREF(x);
4469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 }
4471}
4472
Martin v. Löwis3f767792006-06-04 19:36:28 +00004473static int
4474charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4475{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004476 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4477 /* exponentially overallocate to minimize reallocations */
4478 if (requiredsize < 2*outsize)
4479 requiredsize = 2*outsize;
4480 if (_PyString_Resize(outobj, requiredsize)) {
4481 return 0;
4482 }
4483 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004484}
4485
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* the lookup or the output reallocation failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489/* lookup the character, put the result in the output string and adjust
4490 various state variables. Reallocate the output string if not enough
4491 space is available. Return a new reference to the object that
4492 was put in the output buffer, or Py_None, if the mapping was undefined
4493 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004494 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004496charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004497 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004499 PyObject *rep;
4500 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004501 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502
Christian Heimese93237d2007-12-19 02:37:44 +00004503 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004504 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004505 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004506 if (res == -1)
4507 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004508 if (outsize<requiredsize)
4509 if (!charmapencode_resize(outobj, outpos, requiredsize))
4510 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004511 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004512 outstart[(*outpos)++] = (char)res;
4513 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004514 }
4515
4516 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004519 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004520 Py_DECREF(rep);
4521 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004522 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004523 if (PyInt_Check(rep)) {
4524 Py_ssize_t requiredsize = *outpos+1;
4525 if (outsize<requiredsize)
4526 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4527 Py_DECREF(rep);
4528 return enc_EXCEPTION;
4529 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004530 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004531 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004532 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 else {
4534 const char *repchars = PyString_AS_STRING(rep);
4535 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4536 Py_ssize_t requiredsize = *outpos+repsize;
4537 if (outsize<requiredsize)
4538 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4539 Py_DECREF(rep);
4540 return enc_EXCEPTION;
4541 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004542 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004543 memcpy(outstart + *outpos, repchars, repsize);
4544 *outpos += repsize;
4545 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 }
Georg Brandl9f167602006-06-04 21:46:16 +00004547 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004548 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549}
4550
4551/* handle an error in PyUnicode_EncodeCharmap
4552 Return 0 on success, -1 on error */
4553static
4554int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004555 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004557 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004558 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559{
4560 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 Py_ssize_t repsize;
4562 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 Py_UNICODE *uni2;
4564 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 Py_ssize_t collstartpos = *inpos;
4566 Py_ssize_t collendpos = *inpos+1;
4567 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 char *encoding = "charmap";
4569 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004570 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 /* find all unencodable characters */
4573 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004574 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004575 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004576 int res = encoding_map_lookup(p[collendpos], mapping);
4577 if (res != -1)
4578 break;
4579 ++collendpos;
4580 continue;
4581 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004582
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004583 rep = charmapencode_lookup(p[collendpos], mapping);
4584 if (rep==NULL)
4585 return -1;
4586 else if (rep!=Py_None) {
4587 Py_DECREF(rep);
4588 break;
4589 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004590 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004591 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 }
4593 /* cache callback name lookup
4594 * (if not done yet, i.e. it's the first error) */
4595 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004596 if ((errors==NULL) || (!strcmp(errors, "strict")))
4597 *known_errorHandler = 1;
4598 else if (!strcmp(errors, "replace"))
4599 *known_errorHandler = 2;
4600 else if (!strcmp(errors, "ignore"))
4601 *known_errorHandler = 3;
4602 else if (!strcmp(errors, "xmlcharrefreplace"))
4603 *known_errorHandler = 4;
4604 else
4605 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 }
4607 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004608 case 1: /* strict */
4609 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4610 return -1;
4611 case 2: /* replace */
4612 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004613 x = charmapencode_output('?', mapping, res, respos);
4614 if (x==enc_EXCEPTION) {
4615 return -1;
4616 }
4617 else if (x==enc_FAILED) {
4618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619 return -1;
4620 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004621 }
4622 /* fall through */
4623 case 3: /* ignore */
4624 *inpos = collendpos;
4625 break;
4626 case 4: /* xmlcharrefreplace */
4627 /* generate replacement (temporarily (mis)uses p) */
4628 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004629 char buffer[2+29+1+1];
4630 char *cp;
4631 sprintf(buffer, "&#%d;", (int)p[collpos]);
4632 for (cp = buffer; *cp; ++cp) {
4633 x = charmapencode_output(*cp, mapping, res, respos);
4634 if (x==enc_EXCEPTION)
4635 return -1;
4636 else if (x==enc_FAILED) {
4637 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4638 return -1;
4639 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004640 }
4641 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004642 *inpos = collendpos;
4643 break;
4644 default:
4645 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004646 encoding, reason, p, size, exceptionObject,
4647 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004648 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004649 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004650 /* generate replacement */
4651 repsize = PyUnicode_GET_SIZE(repunicode);
4652 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004653 x = charmapencode_output(*uni2, mapping, res, respos);
4654 if (x==enc_EXCEPTION) {
4655 return -1;
4656 }
4657 else if (x==enc_FAILED) {
4658 Py_DECREF(repunicode);
4659 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4660 return -1;
4661 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004662 }
4663 *inpos = newpos;
4664 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 }
4666 return 0;
4667}
4668
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004670 Py_ssize_t size,
4671 PyObject *mapping,
4672 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 /* output object */
4675 PyObject *res = NULL;
4676 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 PyObject *errorHandler = NULL;
4681 PyObject *exc = NULL;
4682 /* the following variable is used for caching string comparisons
4683 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4684 * 3=ignore, 4=xmlcharrefreplace */
4685 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686
4687 /* Default to Latin-1 */
4688 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004689 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 /* allocate enough for a simple encoding without
4692 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004693 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694 if (res == NULL)
4695 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004696 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004697 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004700 /* try to encode it */
4701 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4702 if (x==enc_EXCEPTION) /* error */
4703 goto onError;
4704 if (x==enc_FAILED) { /* unencodable character */
4705 if (charmap_encoding_error(p, size, &inpos, mapping,
4706 &exc,
4707 &known_errorHandler, &errorHandler, errors,
4708 &res, &respos)) {
4709 goto onError;
4710 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004711 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004712 else
4713 /* done with this character => adjust input position */
4714 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004718 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004719 if (_PyString_Resize(&res, respos))
4720 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 }
4722 Py_XDECREF(exc);
4723 Py_XDECREF(errorHandler);
4724 return res;
4725
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004726 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 Py_XDECREF(res);
4728 Py_XDECREF(exc);
4729 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 return NULL;
4731}
4732
4733PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004734 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735{
4736 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004737 PyErr_BadArgument();
4738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 }
4740 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004741 PyUnicode_GET_SIZE(unicode),
4742 mapping,
4743 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744}
4745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746/* create or adjust a UnicodeTranslateError */
4747static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004748 const Py_UNICODE *unicode, Py_ssize_t size,
4749 Py_ssize_t startpos, Py_ssize_t endpos,
4750 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004753 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004754 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 }
4756 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004757 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4758 goto onError;
4759 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4760 goto onError;
4761 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4762 goto onError;
4763 return;
4764 onError:
4765 Py_DECREF(*exceptionObject);
4766 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 }
4768}
4769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770/* raises a UnicodeTranslateError */
4771static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004772 const Py_UNICODE *unicode, Py_ssize_t size,
4773 Py_ssize_t startpos, Py_ssize_t endpos,
4774 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775{
4776 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004777 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004778 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004779 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780}
4781
4782/* error handling callback helper:
4783 build arguments, call the callback and check the arguments,
4784 put the result into newpos and return the replacement string, which
4785 has to be freed by the caller */
4786static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004787 PyObject **errorHandler,
4788 const char *reason,
4789 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4790 Py_ssize_t startpos, Py_ssize_t endpos,
4791 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004793 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794
Martin v. Löwis412fb672006-04-13 06:34:32 +00004795 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 PyObject *restuple;
4797 PyObject *resunicode;
4798
4799 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004800 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004802 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 }
4804
4805 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809
4810 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004813 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004815 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004816 Py_DECREF(restuple);
4817 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 }
4819 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004820 &resunicode, &i_newpos)) {
4821 Py_DECREF(restuple);
4822 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004825 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004826 else
4827 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004828 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004829 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4830 Py_DECREF(restuple);
4831 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 Py_INCREF(resunicode);
4834 Py_DECREF(restuple);
4835 return resunicode;
4836}
4837
4838/* Lookup the character ch in the mapping and put the result in result,
4839 which must be decrefed by the caller.
4840 Return 0 on success, -1 on error */
4841static
4842int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4843{
4844 PyObject *w = PyInt_FromLong((long)c);
4845 PyObject *x;
4846
4847 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004848 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 x = PyObject_GetItem(mapping, w);
4850 Py_DECREF(w);
4851 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004852 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4853 /* No mapping found means: use 1:1 mapping. */
4854 PyErr_Clear();
4855 *result = NULL;
4856 return 0;
4857 } else
4858 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859 }
4860 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004861 *result = x;
4862 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 }
4864 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004865 long value = PyInt_AS_LONG(x);
4866 long max = PyUnicode_GetMax();
4867 if (value < 0 || value > max) {
4868 PyErr_Format(PyExc_TypeError,
4869 "character mapping must be in range(0x%lx)", max+1);
4870 Py_DECREF(x);
4871 return -1;
4872 }
4873 *result = x;
4874 return 0;
4875 }
4876 else if (PyUnicode_Check(x)) {
4877 *result = x;
4878 return 0;
4879 }
4880 else {
4881 /* wrong return value */
4882 PyErr_SetString(PyExc_TypeError,
4883 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004884 Py_DECREF(x);
4885 return -1;
4886 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887}
4888/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004889 if not reallocate and adjust various state variables.
4890 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891static
Walter Dörwald4894c302003-10-24 14:25:28 +00004892int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004893 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004895 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004896 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004897 /* remember old output position */
4898 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4899 /* exponentially overallocate to minimize reallocations */
4900 if (requiredsize < 2 * oldsize)
4901 requiredsize = 2 * oldsize;
4902 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4903 return -1;
4904 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 }
4906 return 0;
4907}
4908/* lookup the character, put the result in the output string and adjust
4909 various state variables. Return a new reference to the object that
4910 was put in the output buffer in *result, or Py_None, if the mapping was
4911 undefined (in which case no character was written).
4912 The called must decref result.
4913 Return 0 on success, -1 on error. */
4914static
Walter Dörwald4894c302003-10-24 14:25:28 +00004915int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004916 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4917 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918{
Walter Dörwald4894c302003-10-24 14:25:28 +00004919 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004920 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004922 /* not found => default to 1:1 mapping */
4923 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 }
4925 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004926 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 /* no overflow check, because we know that the space is enough */
4929 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 }
4931 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004932 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4933 if (repsize==1) {
4934 /* no overflow check, because we know that the space is enough */
4935 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4936 }
4937 else if (repsize!=0) {
4938 /* more than one character */
4939 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4940 (insize - (curinp-startinp)) +
4941 repsize - 1;
4942 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4943 return -1;
4944 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4945 *outp += repsize;
4946 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 }
4948 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004949 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 return 0;
4951}
4952
4953PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004954 Py_ssize_t size,
4955 PyObject *mapping,
4956 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 /* output object */
4959 PyObject *res = NULL;
4960 /* pointers to the beginning and end+1 of input */
4961 const Py_UNICODE *startp = p;
4962 const Py_UNICODE *endp = p + size;
4963 /* pointer into the output */
4964 Py_UNICODE *str;
4965 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004966 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 char *reason = "character maps to <undefined>";
4968 PyObject *errorHandler = NULL;
4969 PyObject *exc = NULL;
4970 /* the following variable is used for caching string comparisons
4971 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4972 * 3=ignore, 4=xmlcharrefreplace */
4973 int known_errorHandler = -1;
4974
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004976 PyErr_BadArgument();
4977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979
4980 /* allocate enough for a simple 1:1 translation without
4981 replacements, if we need more, we'll resize */
4982 res = PyUnicode_FromUnicode(NULL, size);
4983 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004986 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004990 /* try to encode it */
4991 PyObject *x = NULL;
4992 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4993 Py_XDECREF(x);
4994 goto onError;
4995 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004996 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004997 if (x!=Py_None) /* it worked => adjust input pointer */
4998 ++p;
4999 else { /* untranslatable character */
5000 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5001 Py_ssize_t repsize;
5002 Py_ssize_t newpos;
5003 Py_UNICODE *uni2;
5004 /* startpos for collecting untranslatable chars */
5005 const Py_UNICODE *collstart = p;
5006 const Py_UNICODE *collend = p+1;
5007 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005009 /* find all untranslatable characters */
5010 while (collend < endp) {
5011 if (charmaptranslate_lookup(*collend, mapping, &x))
5012 goto onError;
5013 Py_XDECREF(x);
5014 if (x!=Py_None)
5015 break;
5016 ++collend;
5017 }
5018 /* cache callback name lookup
5019 * (if not done yet, i.e. it's the first error) */
5020 if (known_errorHandler==-1) {
5021 if ((errors==NULL) || (!strcmp(errors, "strict")))
5022 known_errorHandler = 1;
5023 else if (!strcmp(errors, "replace"))
5024 known_errorHandler = 2;
5025 else if (!strcmp(errors, "ignore"))
5026 known_errorHandler = 3;
5027 else if (!strcmp(errors, "xmlcharrefreplace"))
5028 known_errorHandler = 4;
5029 else
5030 known_errorHandler = 0;
5031 }
5032 switch (known_errorHandler) {
5033 case 1: /* strict */
5034 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005035 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005036 case 2: /* replace */
5037 /* No need to check for space, this is a 1:1 replacement */
5038 for (coll = collstart; coll<collend; ++coll)
5039 *str++ = '?';
5040 /* fall through */
5041 case 3: /* ignore */
5042 p = collend;
5043 break;
5044 case 4: /* xmlcharrefreplace */
5045 /* generate replacement (temporarily (mis)uses p) */
5046 for (p = collstart; p < collend; ++p) {
5047 char buffer[2+29+1+1];
5048 char *cp;
5049 sprintf(buffer, "&#%d;", (int)*p);
5050 if (charmaptranslate_makespace(&res, &str,
5051 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5052 goto onError;
5053 for (cp = buffer; *cp; ++cp)
5054 *str++ = *cp;
5055 }
5056 p = collend;
5057 break;
5058 default:
5059 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5060 reason, startp, size, &exc,
5061 collstart-startp, collend-startp, &newpos);
5062 if (repunicode == NULL)
5063 goto onError;
5064 /* generate replacement */
5065 repsize = PyUnicode_GET_SIZE(repunicode);
5066 if (charmaptranslate_makespace(&res, &str,
5067 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5068 Py_DECREF(repunicode);
5069 goto onError;
5070 }
5071 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5072 *str++ = *uni2;
5073 p = startp + newpos;
5074 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005075 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005076 }
5077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078 /* Resize if we allocated to much */
5079 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005080 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005081 if (PyUnicode_Resize(&res, respos) < 0)
5082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 }
5084 Py_XDECREF(exc);
5085 Py_XDECREF(errorHandler);
5086 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005088 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 Py_XDECREF(res);
5090 Py_XDECREF(exc);
5091 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 return NULL;
5093}
5094
5095PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005096 PyObject *mapping,
5097 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098{
5099 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 str = PyUnicode_FromObject(str);
5102 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005105 PyUnicode_GET_SIZE(str),
5106 mapping,
5107 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 Py_DECREF(str);
5109 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005110
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005111 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 Py_XDECREF(str);
5113 return NULL;
5114}
Tim Petersced69f82003-09-16 20:30:58 +00005115
Guido van Rossum9e896b32000-04-05 20:11:21 +00005116/* --- Decimal Encoder ---------------------------------------------------- */
5117
5118int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005119 Py_ssize_t length,
5120 char *output,
5121 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005122{
5123 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124 PyObject *errorHandler = NULL;
5125 PyObject *exc = NULL;
5126 const char *encoding = "decimal";
5127 const char *reason = "invalid decimal Unicode string";
5128 /* the following variable is used for caching string comparisons
5129 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5130 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005131
5132 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005133 PyErr_BadArgument();
5134 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005135 }
5136
5137 p = s;
5138 end = s + length;
5139 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005140 register Py_UNICODE ch = *p;
5141 int decimal;
5142 PyObject *repunicode;
5143 Py_ssize_t repsize;
5144 Py_ssize_t newpos;
5145 Py_UNICODE *uni2;
5146 Py_UNICODE *collstart;
5147 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005148
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005149 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005150 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005151 ++p;
5152 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005153 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005154 decimal = Py_UNICODE_TODECIMAL(ch);
5155 if (decimal >= 0) {
5156 *output++ = '0' + decimal;
5157 ++p;
5158 continue;
5159 }
5160 if (0 < ch && ch < 256) {
5161 *output++ = (char)ch;
5162 ++p;
5163 continue;
5164 }
5165 /* All other characters are considered unencodable */
5166 collstart = p;
5167 collend = p+1;
5168 while (collend < end) {
5169 if ((0 < *collend && *collend < 256) ||
5170 !Py_UNICODE_ISSPACE(*collend) ||
5171 Py_UNICODE_TODECIMAL(*collend))
5172 break;
5173 }
5174 /* cache callback name lookup
5175 * (if not done yet, i.e. it's the first error) */
5176 if (known_errorHandler==-1) {
5177 if ((errors==NULL) || (!strcmp(errors, "strict")))
5178 known_errorHandler = 1;
5179 else if (!strcmp(errors, "replace"))
5180 known_errorHandler = 2;
5181 else if (!strcmp(errors, "ignore"))
5182 known_errorHandler = 3;
5183 else if (!strcmp(errors, "xmlcharrefreplace"))
5184 known_errorHandler = 4;
5185 else
5186 known_errorHandler = 0;
5187 }
5188 switch (known_errorHandler) {
5189 case 1: /* strict */
5190 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5191 goto onError;
5192 case 2: /* replace */
5193 for (p = collstart; p < collend; ++p)
5194 *output++ = '?';
5195 /* fall through */
5196 case 3: /* ignore */
5197 p = collend;
5198 break;
5199 case 4: /* xmlcharrefreplace */
5200 /* generate replacement (temporarily (mis)uses p) */
5201 for (p = collstart; p < collend; ++p)
5202 output += sprintf(output, "&#%d;", (int)*p);
5203 p = collend;
5204 break;
5205 default:
5206 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5207 encoding, reason, s, length, &exc,
5208 collstart-s, collend-s, &newpos);
5209 if (repunicode == NULL)
5210 goto onError;
5211 /* generate replacement */
5212 repsize = PyUnicode_GET_SIZE(repunicode);
5213 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5214 Py_UNICODE ch = *uni2;
5215 if (Py_UNICODE_ISSPACE(ch))
5216 *output++ = ' ';
5217 else {
5218 decimal = Py_UNICODE_TODECIMAL(ch);
5219 if (decimal >= 0)
5220 *output++ = '0' + decimal;
5221 else if (0 < ch && ch < 256)
5222 *output++ = (char)ch;
5223 else {
5224 Py_DECREF(repunicode);
5225 raise_encode_exception(&exc, encoding,
5226 s, length, collstart-s, collend-s, reason);
5227 goto onError;
5228 }
5229 }
5230 }
5231 p = s + newpos;
5232 Py_DECREF(repunicode);
5233 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005234 }
5235 /* 0-terminate the output string */
5236 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 Py_XDECREF(exc);
5238 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005239 return 0;
5240
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005241 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 Py_XDECREF(exc);
5243 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005244 return -1;
5245}
5246
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247/* --- Helpers ------------------------------------------------------------ */
5248
Eric Smitha9f7d622008-02-17 19:46:49 +00005249#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005250#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005251
5252#include "stringlib/count.h"
5253#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005254#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005255#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005256
/* Normalise a (start, end) slice pair against a sequence of length len,
   modifying start and end in place:
     - end larger than len is clipped to len;
     - a negative end or start is taken relative to len and then clipped
       to 0 if still negative.
   start is NOT clipped to len, and nothing here forces start <= end.
   The expansion is a bare statement sequence: use it only as a
   stand-alone statement, and note that the arguments are evaluated
   more than once. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Fredrik Lundhc8162812006-05-26 19:33:03 +00005271
Martin v. Löwis18e16552006-02-15 17:27:45 +00005272Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005273 PyObject *substr,
5274 Py_ssize_t start,
5275 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005278 PyUnicodeObject* str_obj;
5279 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005281 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5282 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005283 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005284 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5285 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005286 Py_DECREF(str_obj);
5287 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 }
Tim Petersced69f82003-09-16 20:30:58 +00005289
Antoine Pitrou64672132010-01-13 07:55:48 +00005290 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005291 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005292 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5293 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005294 );
5295
5296 Py_DECREF(sub_obj);
5297 Py_DECREF(str_obj);
5298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 return result;
5300}
5301
Martin v. Löwis18e16552006-02-15 17:27:45 +00005302Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005303 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005304 Py_ssize_t start,
5305 Py_ssize_t end,
5306 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005309
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005310 str = PyUnicode_FromObject(str);
5311 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005312 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005313 sub = PyUnicode_FromObject(sub);
5314 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005315 Py_DECREF(str);
5316 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 }
Tim Petersced69f82003-09-16 20:30:58 +00005318
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005319 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005320 result = stringlib_find_slice(
5321 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5322 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5323 start, end
5324 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005325 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005326 result = stringlib_rfind_slice(
5327 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5328 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5329 start, end
5330 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005331
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005332 Py_DECREF(str);
5333 Py_DECREF(sub);
5334
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 return result;
5336}
5337
Tim Petersced69f82003-09-16 20:30:58 +00005338static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005340 PyUnicodeObject *substring,
5341 Py_ssize_t start,
5342 Py_ssize_t end,
5343 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 if (substring->length == 0)
5346 return 1;
5347
Antoine Pitrou64672132010-01-13 07:55:48 +00005348 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 end -= substring->length;
5350 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005351 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352
5353 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005354 if (Py_UNICODE_MATCH(self, end, substring))
5355 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 } else {
5357 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005358 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 }
5360
5361 return 0;
5362}
5363
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005365 PyObject *substr,
5366 Py_ssize_t start,
5367 Py_ssize_t end,
5368 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005371
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 str = PyUnicode_FromObject(str);
5373 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005374 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 substr = PyUnicode_FromObject(substr);
5376 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005377 Py_DECREF(str);
5378 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 }
Tim Petersced69f82003-09-16 20:30:58 +00005380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005382 (PyUnicodeObject *)substr,
5383 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 Py_DECREF(str);
5385 Py_DECREF(substr);
5386 return result;
5387}
5388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389/* Apply fixfct filter to the Unicode object self and return a
5390 reference to the modified object */
5391
Tim Petersced69f82003-09-16 20:30:58 +00005392static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005394 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
5396
5397 PyUnicodeObject *u;
5398
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005399 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005401 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005402
5403 Py_UNICODE_COPY(u->str, self->str, self->length);
5404
Tim Peters7a29bd52001-09-12 03:03:31 +00005405 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005406 /* fixfct should return TRUE if it modified the buffer. If
5407 FALSE, return a reference to the original buffer instead
5408 (to save space, not time) */
5409 Py_INCREF(self);
5410 Py_DECREF(u);
5411 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 }
5413 return (PyObject*) u;
5414}
5415
Tim Petersced69f82003-09-16 20:30:58 +00005416static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417int fixupper(PyUnicodeObject *self)
5418{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005419 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 Py_UNICODE *s = self->str;
5421 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005424 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005425
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005426 ch = Py_UNICODE_TOUPPER(*s);
5427 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005429 *s = ch;
5430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 s++;
5432 }
5433
5434 return status;
5435}
5436
Tim Petersced69f82003-09-16 20:30:58 +00005437static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438int fixlower(PyUnicodeObject *self)
5439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005440 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 Py_UNICODE *s = self->str;
5442 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005443
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005445 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005446
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005447 ch = Py_UNICODE_TOLOWER(*s);
5448 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005450 *s = ch;
5451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 s++;
5453 }
5454
5455 return status;
5456}
5457
Tim Petersced69f82003-09-16 20:30:58 +00005458static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459int fixswapcase(PyUnicodeObject *self)
5460{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 Py_UNICODE *s = self->str;
5463 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005464
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 while (len-- > 0) {
5466 if (Py_UNICODE_ISUPPER(*s)) {
5467 *s = Py_UNICODE_TOLOWER(*s);
5468 status = 1;
5469 } else if (Py_UNICODE_ISLOWER(*s)) {
5470 *s = Py_UNICODE_TOUPPER(*s);
5471 status = 1;
5472 }
5473 s++;
5474 }
5475
5476 return status;
5477}
5478
Tim Petersced69f82003-09-16 20:30:58 +00005479static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480int fixcapitalize(PyUnicodeObject *self)
5481{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005482 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005483 Py_UNICODE *s = self->str;
5484 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005485
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005486 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005487 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005488 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005489 *s = Py_UNICODE_TOUPPER(*s);
5490 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005492 s++;
5493 while (--len > 0) {
5494 if (Py_UNICODE_ISUPPER(*s)) {
5495 *s = Py_UNICODE_TOLOWER(*s);
5496 status = 1;
5497 }
5498 s++;
5499 }
5500 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501}
5502
5503static
5504int fixtitle(PyUnicodeObject *self)
5505{
5506 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5507 register Py_UNICODE *e;
5508 int previous_is_cased;
5509
5510 /* Shortcut for single character strings */
5511 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005512 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5513 if (*p != ch) {
5514 *p = ch;
5515 return 1;
5516 }
5517 else
5518 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 }
Tim Petersced69f82003-09-16 20:30:58 +00005520
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 e = p + PyUnicode_GET_SIZE(self);
5522 previous_is_cased = 0;
5523 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005524 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005525
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005526 if (previous_is_cased)
5527 *p = Py_UNICODE_TOLOWER(ch);
5528 else
5529 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005530
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005531 if (Py_UNICODE_ISLOWER(ch) ||
5532 Py_UNICODE_ISUPPER(ch) ||
5533 Py_UNICODE_ISTITLE(ch))
5534 previous_is_cased = 1;
5535 else
5536 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 }
5538 return 1;
5539}
5540
Tim Peters8ce9f162004-08-27 01:49:32 +00005541PyObject *
5542PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543{
Tim Peters8ce9f162004-08-27 01:49:32 +00005544 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005545 const Py_UNICODE blank = ' ';
5546 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005547 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005548 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005549 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5550 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005551 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5552 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005553 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005554 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005555 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 fseq = PySequence_Fast(seq, "");
5558 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005559 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005560 }
5561
Tim Peters91879ab2004-08-27 22:35:44 +00005562 /* Grrrr. A codec may be invoked to convert str objects to
5563 * Unicode, and so it's possible to call back into Python code
5564 * during PyUnicode_FromObject(), and so it's possible for a sick
5565 * codec to change the size of fseq (if seq is a list). Therefore
5566 * we have to keep refetching the size -- can't assume seqlen
5567 * is invariant.
5568 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005569 seqlen = PySequence_Fast_GET_SIZE(fseq);
5570 /* If empty sequence, return u"". */
5571 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005572 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5573 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005574 }
5575 /* If singleton sequence with an exact Unicode, return that. */
5576 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005577 item = PySequence_Fast_GET_ITEM(fseq, 0);
5578 if (PyUnicode_CheckExact(item)) {
5579 Py_INCREF(item);
5580 res = (PyUnicodeObject *)item;
5581 goto Done;
5582 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005583 }
5584
Tim Peters05eba1f2004-08-27 21:32:02 +00005585 /* At least two items to join, or one that isn't exact Unicode. */
5586 if (seqlen > 1) {
5587 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005588 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005589 sep = &blank;
5590 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005591 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005592 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005593 internal_separator = PyUnicode_FromObject(separator);
5594 if (internal_separator == NULL)
5595 goto onError;
5596 sep = PyUnicode_AS_UNICODE(internal_separator);
5597 seplen = PyUnicode_GET_SIZE(internal_separator);
5598 /* In case PyUnicode_FromObject() mutated seq. */
5599 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005600 }
5601 }
5602
5603 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005604 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005605 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005606 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005607 res_p = PyUnicode_AS_UNICODE(res);
5608 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005609
Tim Peters05eba1f2004-08-27 21:32:02 +00005610 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005611 Py_ssize_t itemlen;
5612 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005613
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005614 item = PySequence_Fast_GET_ITEM(fseq, i);
5615 /* Convert item to Unicode. */
5616 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5617 PyErr_Format(PyExc_TypeError,
5618 "sequence item %zd: expected string or Unicode,"
5619 " %.80s found",
5620 i, Py_TYPE(item)->tp_name);
5621 goto onError;
5622 }
5623 item = PyUnicode_FromObject(item);
5624 if (item == NULL)
5625 goto onError;
5626 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005627
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005628 /* In case PyUnicode_FromObject() mutated seq. */
5629 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005630
Tim Peters8ce9f162004-08-27 01:49:32 +00005631 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005632 itemlen = PyUnicode_GET_SIZE(item);
5633 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005634 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005635 goto Overflow;
5636 if (i < seqlen - 1) {
5637 new_res_used += seplen;
5638 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005639 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005640 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005641 if (new_res_used > res_alloc) {
5642 /* double allocated size until it's big enough */
5643 do {
5644 res_alloc += res_alloc;
5645 if (res_alloc <= 0)
5646 goto Overflow;
5647 } while (new_res_used > res_alloc);
5648 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5649 Py_DECREF(item);
5650 goto onError;
5651 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005652 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005653 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005654
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005655 /* Copy item, and maybe the separator. */
5656 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5657 res_p += itemlen;
5658 if (i < seqlen - 1) {
5659 Py_UNICODE_COPY(res_p, sep, seplen);
5660 res_p += seplen;
5661 }
5662 Py_DECREF(item);
5663 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005664 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005665
Tim Peters05eba1f2004-08-27 21:32:02 +00005666 /* Shrink res to match the used area; this probably can't fail,
5667 * but it's cheap to check.
5668 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005669 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005670 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005671
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005672 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005673 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 return (PyObject *)res;
5676
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005677 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005678 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005679 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 Py_DECREF(item);
5681 /* fall through */
5682
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005683 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005684 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 return NULL;
5688}
5689
Tim Petersced69f82003-09-16 20:30:58 +00005690static
5691PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005692 Py_ssize_t left,
5693 Py_ssize_t right,
5694 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695{
5696 PyUnicodeObject *u;
5697
5698 if (left < 0)
5699 left = 0;
5700 if (right < 0)
5701 right = 0;
5702
Tim Peters7a29bd52001-09-12 03:03:31 +00005703 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 Py_INCREF(self);
5705 return self;
5706 }
5707
Neal Norwitze7d8be82008-07-31 17:17:14 +00005708 if (left > PY_SSIZE_T_MAX - self->length ||
5709 right > PY_SSIZE_T_MAX - (left + self->length)) {
5710 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5711 return NULL;
5712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 u = _PyUnicode_New(left + self->length + right);
5714 if (u) {
5715 if (left)
5716 Py_UNICODE_FILL(u->str, fill, left);
5717 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5718 if (right)
5719 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5720 }
5721
5722 return u;
5723}
5724
Antoine Pitrou64672132010-01-13 07:55:48 +00005725PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728
5729 string = PyUnicode_FromObject(string);
5730 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732
Antoine Pitrou64672132010-01-13 07:55:48 +00005733 list = stringlib_splitlines(
5734 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5735 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
5737 Py_DECREF(string);
5738 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739}
5740
Tim Petersced69f82003-09-16 20:30:58 +00005741static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005743 PyUnicodeObject *substring,
5744 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005747 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005750 return stringlib_split_whitespace(
5751 (PyObject*) self, self->str, self->length, maxcount
5752 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
Antoine Pitrou64672132010-01-13 07:55:48 +00005754 return stringlib_split(
5755 (PyObject*) self, self->str, self->length,
5756 substring->str, substring->length,
5757 maxcount
5758 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759}
5760
Tim Petersced69f82003-09-16 20:30:58 +00005761static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005763 PyUnicodeObject *substring,
5764 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005766 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005767 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005769 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005770 return stringlib_rsplit_whitespace(
5771 (PyObject*) self, self->str, self->length, maxcount
5772 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773
Antoine Pitrou64672132010-01-13 07:55:48 +00005774 return stringlib_rsplit(
5775 (PyObject*) self, self->str, self->length,
5776 substring->str, substring->length,
5777 maxcount
5778 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779}
5780
5781static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005783 PyUnicodeObject *str1,
5784 PyUnicodeObject *str2,
5785 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
5787 PyUnicodeObject *u;
5788
5789 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005790 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005791 else if (maxcount == 0 || self->length == 0)
5792 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
Fredrik Lundh347ee272006-05-24 16:35:18 +00005794 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005795 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005796 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005797 if (str1->length == 0)
5798 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005799 if (str1->length == 1) {
5800 /* replace characters */
5801 Py_UNICODE u1, u2;
5802 if (!findchar(self->str, self->length, str1->str[0]))
5803 goto nothing;
5804 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5805 if (!u)
5806 return NULL;
5807 Py_UNICODE_COPY(u->str, self->str, self->length);
5808 u1 = str1->str[0];
5809 u2 = str2->str[0];
5810 for (i = 0; i < u->length; i++)
5811 if (u->str[i] == u1) {
5812 if (--maxcount < 0)
5813 break;
5814 u->str[i] = u2;
5815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005817 i = stringlib_find(
5818 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005820 if (i < 0)
5821 goto nothing;
5822 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5823 if (!u)
5824 return NULL;
5825 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005826
5827 /* change everything in-place, starting with this one */
5828 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5829 i += str1->length;
5830
5831 while ( --maxcount > 0) {
5832 i = stringlib_find(self->str+i, self->length-i,
5833 str1->str, str1->length,
5834 i);
5835 if (i == -1)
5836 break;
5837 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5838 i += str1->length;
5839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005842
Brett Cannona7f13ee2010-05-04 01:16:51 +00005843 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005844 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 Py_UNICODE *p;
5846
5847 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005848 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5849 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005850 if (n == 0)
5851 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005852 /* new_size = self->length + n * (str2->length - str1->length)); */
5853 delta = (str2->length - str1->length);
5854 if (delta == 0) {
5855 new_size = self->length;
5856 } else {
5857 product = n * (str2->length - str1->length);
5858 if ((product / (str2->length - str1->length)) != n) {
5859 PyErr_SetString(PyExc_OverflowError,
5860 "replace string is too long");
5861 return NULL;
5862 }
5863 new_size = self->length + product;
5864 if (new_size < 0) {
5865 PyErr_SetString(PyExc_OverflowError,
5866 "replace string is too long");
5867 return NULL;
5868 }
5869 }
5870 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005871 if (!u)
5872 return NULL;
5873 i = 0;
5874 p = u->str;
5875 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005876 while (n-- > 0) {
5877 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005878 j = stringlib_find(self->str+i, self->length-i,
5879 str1->str, str1->length,
5880 i);
5881 if (j == -1)
5882 break;
5883 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005884 /* copy unchanged part [i:j] */
5885 Py_UNICODE_COPY(p, self->str+i, j-i);
5886 p += j - i;
5887 }
5888 /* copy substitution string */
5889 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005890 Py_UNICODE_COPY(p, str2->str, str2->length);
5891 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005892 }
5893 i = j + str1->length;
5894 }
5895 if (i < self->length)
5896 /* copy tail [i:] */
5897 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005898 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005899 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005900 while (n > 0) {
5901 Py_UNICODE_COPY(p, str2->str, str2->length);
5902 p += str2->length;
5903 if (--n <= 0)
5904 break;
5905 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005907 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 }
5909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005911
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005912 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005913 /* nothing to replace; return original string (when possible) */
5914 if (PyUnicode_CheckExact(self)) {
5915 Py_INCREF(self);
5916 return (PyObject *) self;
5917 }
5918 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919}
5920
5921/* --- Unicode Object Methods --------------------------------------------- */
5922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005923PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005924 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925\n\
5926Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005927characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
5929static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005930unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 return fixup(self, fixtitle);
5933}
5934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005936 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937\n\
5938Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005939have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
5941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005942unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 return fixup(self, fixcapitalize);
5945}
5946
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into a list of words, replaces every list
   item with its fixup(..., fixcapitalize) result, and joins the list
   back together with PyUnicode_Join(NULL, ...).  Returns NULL if the
   split or any capitalization step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old item before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5984
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005985/* Argument converter. Coerces to a single unicode character */
5986
5987static int
5988convert_uc(PyObject *obj, void *addr)
5989{
Benjamin Peterson857ce152009-01-31 16:29:18 +00005990 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5991 PyObject *uniobj;
5992 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005993
Benjamin Peterson857ce152009-01-31 16:29:18 +00005994 uniobj = PyUnicode_FromObject(obj);
5995 if (uniobj == NULL) {
5996 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005997 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00005998 return 0;
5999 }
6000 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6001 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006002 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006003 Py_DECREF(uniobj);
6004 return 0;
6005 }
6006 unistr = PyUnicode_AS_UNICODE(uniobj);
6007 *fillcharloc = unistr[0];
6008 Py_DECREF(uniobj);
6009 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006010}
6011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006012PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006013 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006015Return S centered in a Unicode string of length width. Padding is\n\
6016done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
6018static PyObject *
6019unicode_center(PyUnicodeObject *self, PyObject *args)
6020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006021 Py_ssize_t marg, left;
6022 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006023 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
Thomas Woutersde017742006-02-16 19:34:37 +00006025 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 return NULL;
6027
Tim Peters7a29bd52001-09-12 03:03:31 +00006028 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 Py_INCREF(self);
6030 return (PyObject*) self;
6031 }
6032
6033 marg = width - self->length;
6034 left = marg / 2 + (marg & width & 1);
6035
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006036 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037}
6038
Marc-André Lemburge5034372000-08-08 08:04:29 +00006039#if 0
6040
6041/* This code should go into some future Unicode collation support
6042 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006043 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006044
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006045/* speedy UTF-16 code point order comparison */
6046/* gleaned from: */
6047/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6048
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006049static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006050{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006051 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006052 0, 0, 0, 0, 0, 0, 0, 0,
6053 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006054 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006055};
6056
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057static int
6058unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6059{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006060 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 Py_UNICODE *s1 = str1->str;
6063 Py_UNICODE *s2 = str2->str;
6064
6065 len1 = str1->length;
6066 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006069 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006070
6071 c1 = *s1++;
6072 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006073
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006074 if (c1 > (1<<11) * 26)
6075 c1 += utf16Fixup[c1>>11];
6076 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006079
6080 if (c1 != c2)
6081 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006082
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 }
6085
6086 return (len1 < len2) ? -1 : (len1 != len2);
6087}
6088
Marc-André Lemburge5034372000-08-08 08:04:29 +00006089#else
6090
6091static int
6092unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6093{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006094 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006095
6096 Py_UNICODE *s1 = str1->str;
6097 Py_UNICODE *s2 = str2->str;
6098
6099 len1 = str1->length;
6100 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006101
Marc-André Lemburge5034372000-08-08 08:04:29 +00006102 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006103 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006104
Fredrik Lundh45714e92001-06-26 16:39:36 +00006105 c1 = *s1++;
6106 c2 = *s2++;
6107
6108 if (c1 != c2)
6109 return (c1 < c2) ? -1 : 1;
6110
Marc-André Lemburge5034372000-08-08 08:04:29 +00006111 len1--; len2--;
6112 }
6113
6114 return (len1 < len2) ? -1 : (len1 != len2);
6115}
6116
6117#endif
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006120 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
6122 PyUnicodeObject *u = NULL, *v = NULL;
6123 int result;
6124
6125 /* Coerce the two arguments */
6126 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6127 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006128 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6130 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Thomas Wouters7e474022000-07-16 12:04:32 +00006133 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006135 Py_DECREF(u);
6136 Py_DECREF(v);
6137 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
6139
6140 result = unicode_compare(u, v);
6141
6142 Py_DECREF(u);
6143 Py_DECREF(v);
6144 return result;
6145
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 Py_XDECREF(u);
6148 Py_XDECREF(v);
6149 return -1;
6150}
6151
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006152PyObject *PyUnicode_RichCompare(PyObject *left,
6153 PyObject *right,
6154 int op)
6155{
6156 int result;
6157
6158 result = PyUnicode_Compare(left, right);
6159 if (result == -1 && PyErr_Occurred())
6160 goto onError;
6161
6162 /* Convert the return value to a Boolean */
6163 switch (op) {
6164 case Py_EQ:
6165 result = (result == 0);
6166 break;
6167 case Py_NE:
6168 result = (result != 0);
6169 break;
6170 case Py_LE:
6171 result = (result <= 0);
6172 break;
6173 case Py_GE:
6174 result = (result >= 0);
6175 break;
6176 case Py_LT:
6177 result = (result == -1);
6178 break;
6179 case Py_GT:
6180 result = (result == 1);
6181 break;
6182 }
6183 return PyBool_FromLong(result);
6184
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006185 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006186
6187 /* Standard case
6188
6189 Type errors mean that PyUnicode_FromObject() could not convert
6190 one of the arguments (usually the right hand side) to Unicode,
6191 ie. we can't handle the comparison request. However, it is
6192 possible that the other object knows a comparison method, which
6193 is why we return Py_NotImplemented to give the other object a
6194 chance.
6195
6196 */
6197 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6198 PyErr_Clear();
6199 Py_INCREF(Py_NotImplemented);
6200 return Py_NotImplemented;
6201 }
6202 if (op != Py_EQ && op != Py_NE)
6203 return NULL;
6204
6205 /* Equality comparison.
6206
6207 This is a special case: we silence any PyExc_UnicodeDecodeError
6208 and instead turn it into a PyErr_UnicodeWarning.
6209
6210 */
6211 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6212 return NULL;
6213 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006214 if (PyErr_Warn(PyExc_UnicodeWarning,
6215 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006216 "Unicode equal comparison "
6217 "failed to convert both arguments to Unicode - "
6218 "interpreting them as being unequal" :
6219 "Unicode unequal comparison "
6220 "failed to convert both arguments to Unicode - "
6221 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006222 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006223 return NULL;
6224 result = (op == Py_NE);
6225 return PyBool_FromLong(result);
6226}
6227
Guido van Rossum403d68b2000-03-13 15:55:09 +00006228int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006229 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006230{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006231 PyObject *str, *sub;
6232 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006233
6234 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006235 sub = PyUnicode_FromObject(element);
6236 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006237 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006238 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006239
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006240 str = PyUnicode_FromObject(container);
6241 if (!str) {
6242 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006243 return -1;
6244 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006245
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006246 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006247
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006248 Py_DECREF(str);
6249 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006250
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006251 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252}
6253
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254/* Concat to string or Unicode object giving a new Unicode object. */
6255
6256PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006257 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258{
6259 PyUnicodeObject *u = NULL, *v = NULL, *w;
6260
6261 /* Coerce the two arguments */
6262 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6263 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006264 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6266 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
6269 /* Shortcuts */
6270 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006271 Py_DECREF(v);
6272 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 }
6274 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006275 Py_DECREF(u);
6276 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 }
6278
6279 /* Concat the two Unicode strings */
6280 w = _PyUnicode_New(u->length + v->length);
6281 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 Py_UNICODE_COPY(w->str, u->str, u->length);
6284 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6285
6286 Py_DECREF(u);
6287 Py_DECREF(v);
6288 return (PyObject *)w;
6289
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006290 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 Py_XDECREF(u);
6292 Py_XDECREF(v);
6293 return NULL;
6294}
6295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006296PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006297 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006299Return the number of non-overlapping occurrences of substring sub in\n\
6300Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006301interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
6303static PyObject *
6304unicode_count(PyUnicodeObject *self, PyObject *args)
6305{
6306 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006307 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006308 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 PyObject *result;
6310
Guido van Rossumb8872e62000-05-09 14:14:27 +00006311 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006312 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
6314
6315 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006316 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006318 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006319
Antoine Pitrou64672132010-01-13 07:55:48 +00006320 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006321 result = PyInt_FromSsize_t(
6322 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006323 substring->str, substring->length,
6324 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006325 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006328
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 return result;
6330}
6331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006332PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006333 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006335Encodes S using the codec registered for encoding. encoding defaults\n\
6336to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006337handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6339'xmlcharrefreplace' as well as any other name registered with\n\
6340codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341
6342static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006343unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006345 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 char *encoding = NULL;
6347 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006348 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006349
Benjamin Peterson332d7212009-09-18 21:14:55 +00006350 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6351 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006353 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006354 if (v == NULL)
6355 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006356 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006357 PyErr_Format(PyExc_TypeError,
6358 "encoder did not return a string/unicode object "
6359 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006360 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006361 Py_DECREF(v);
6362 return NULL;
6363 }
6364 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006365
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006366 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006367 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006368}
6369
6370PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006371 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006372\n\
6373Decodes S using the codec registered for encoding. encoding defaults\n\
6374to the default encoding. errors may be given to set a different error\n\
6375handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6376a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6377as well as any other name registerd with codecs.register_error that is\n\
6378able to handle UnicodeDecodeErrors.");
6379
6380static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006381unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006382{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006383 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384 char *encoding = NULL;
6385 char *errors = NULL;
6386 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006387
Benjamin Peterson332d7212009-09-18 21:14:55 +00006388 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6389 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006390 return NULL;
6391 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006392 if (v == NULL)
6393 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006394 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006395 PyErr_Format(PyExc_TypeError,
6396 "decoder did not return a string/unicode object "
6397 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006398 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006399 Py_DECREF(v);
6400 return NULL;
6401 }
6402 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006403
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006404 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006408PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006409 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410\n\
6411Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006412If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
6414static PyObject*
6415unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6416{
6417 Py_UNICODE *e;
6418 Py_UNICODE *p;
6419 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006420 Py_UNICODE *qe;
6421 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 PyUnicodeObject *u;
6423 int tabsize = 8;
6424
6425 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
Thomas Wouters7e474022000-07-16 12:04:32 +00006428 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006429 i = 0; /* chars up to and including most recent \n or \r */
6430 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6431 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 for (p = self->str; p < e; p++)
6433 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006434 if (tabsize > 0) {
6435 incr = tabsize - (j % tabsize); /* cannot overflow */
6436 if (j > PY_SSIZE_T_MAX - incr)
6437 goto overflow1;
6438 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006439 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006442 if (j > PY_SSIZE_T_MAX - 1)
6443 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 j++;
6445 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006446 if (i > PY_SSIZE_T_MAX - j)
6447 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006449 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
6451 }
6452
Guido van Rossum5bdff602008-03-11 21:18:06 +00006453 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006454 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 /* Second pass: create output string and fill it */
6457 u = _PyUnicode_New(i + j);
6458 if (!u)
6459 return NULL;
6460
Guido van Rossum5bdff602008-03-11 21:18:06 +00006461 j = 0; /* same as in first pass */
6462 q = u->str; /* next output char */
6463 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465 for (p = self->str; p < e; p++)
6466 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006467 if (tabsize > 0) {
6468 i = tabsize - (j % tabsize);
6469 j += i;
6470 while (i--) {
6471 if (q >= qe)
6472 goto overflow2;
6473 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006474 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006475 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006476 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006477 else {
6478 if (q >= qe)
6479 goto overflow2;
6480 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006481 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 if (*p == '\n' || *p == '\r')
6483 j = 0;
6484 }
6485
6486 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006487
6488 overflow2:
6489 Py_DECREF(u);
6490 overflow1:
6491 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006495PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497\n\
6498Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006499such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500arguments start and end are interpreted as in slice notation.\n\
6501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
6504static PyObject *
6505unicode_find(PyUnicodeObject *self, PyObject *args)
6506{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006507 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006508 Py_ssize_t start;
6509 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006510 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
Facundo Batista57d56692007-11-16 18:04:14 +00006512 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006515 result = stringlib_find_slice(
6516 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6517 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6518 start, end
6519 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
6521 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006522
6523 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524}
6525
6526static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
6529 if (index < 0 || index >= self->length) {
6530 PyErr_SetString(PyExc_IndexError, "string index out of range");
6531 return NULL;
6532 }
6533
6534 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6535}
6536
6537static long
6538unicode_hash(PyUnicodeObject *self)
6539{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006540 /* Since Unicode objects compare equal to their ASCII string
6541 counterparts, they should use the individual character values
6542 as basis for their hash value. This is needed to assure that
6543 strings and Unicode objects behave in the same way as
6544 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
Martin v. Löwis18e16552006-02-15 17:27:45 +00006546 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006547 register Py_UNICODE *p;
6548 register long x;
6549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006551 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006552 len = PyUnicode_GET_SIZE(self);
6553 p = PyUnicode_AS_UNICODE(self);
6554 x = *p << 7;
6555 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006556 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006557 x ^= PyUnicode_GET_SIZE(self);
6558 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006559 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006560 self->hash = x;
6561 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006565 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569static PyObject *
6570unicode_index(PyUnicodeObject *self, PyObject *args)
6571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006573 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006574 Py_ssize_t start;
6575 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Facundo Batista57d56692007-11-16 18:04:14 +00006577 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006580 result = stringlib_find_slice(
6581 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6582 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6583 start, end
6584 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
6586 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006587
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 if (result < 0) {
6589 PyErr_SetString(PyExc_ValueError, "substring not found");
6590 return NULL;
6591 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006592
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006597 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006599Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
6605 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6606 register const Py_UNICODE *e;
6607 int cased;
6608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 /* Shortcut for single character strings */
6610 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006611 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006613 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006614 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006615 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 e = p + PyUnicode_GET_SIZE(self);
6618 cased = 0;
6619 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006620 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006621
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006622 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6623 return PyBool_FromLong(0);
6624 else if (!cased && Py_UNICODE_ISLOWER(ch))
6625 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006627 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628}
6629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006630PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006631 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006633Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635
6636static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006637unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
6639 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6640 register const Py_UNICODE *e;
6641 int cased;
6642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 /* Shortcut for single character strings */
6644 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006645 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006647 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006648 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006649 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 e = p + PyUnicode_GET_SIZE(self);
6652 cased = 0;
6653 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006654 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6657 return PyBool_FromLong(0);
6658 else if (!cased && Py_UNICODE_ISUPPER(ch))
6659 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006665 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006667Return True if S is a titlecased string and there is at least one\n\
6668character in S, i.e. upper- and titlecase characters may only\n\
6669follow uncased characters and lowercase characters only cased ones.\n\
6670Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006673unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
6675 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6676 register const Py_UNICODE *e;
6677 int cased, previous_is_cased;
6678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 /* Shortcut for single character strings */
6680 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006681 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6682 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006684 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006685 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006686 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006687
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 e = p + PyUnicode_GET_SIZE(self);
6689 cased = 0;
6690 previous_is_cased = 0;
6691 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006692 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006693
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006694 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6695 if (previous_is_cased)
6696 return PyBool_FromLong(0);
6697 previous_is_cased = 1;
6698 cased = 1;
6699 }
6700 else if (Py_UNICODE_ISLOWER(ch)) {
6701 if (!previous_is_cased)
6702 return PyBool_FromLong(0);
6703 previous_is_cased = 1;
6704 cased = 1;
6705 }
6706 else
6707 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710}
6711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006713 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006715Return True if all characters in S are whitespace\n\
6716and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
6718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006719unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6722 register const Py_UNICODE *e;
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 /* Shortcut for single character strings */
6725 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006726 Py_UNICODE_ISSPACE(*p))
6727 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006730 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006731 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006732
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 e = p + PyUnicode_GET_SIZE(self);
6734 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006735 if (!Py_UNICODE_ISSPACE(*p))
6736 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739}
6740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006742 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006743\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006744Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006746
6747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006748unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006749{
6750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6751 register const Py_UNICODE *e;
6752
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006753 /* Shortcut for single character strings */
6754 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 Py_UNICODE_ISALPHA(*p))
6756 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006757
6758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006760 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006761
6762 e = p + PyUnicode_GET_SIZE(self);
6763 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006764 if (!Py_UNICODE_ISALPHA(*p))
6765 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006766 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006768}
6769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006770PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006771 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006772\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006773Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006774and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006775
6776static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006777unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778{
6779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6780 register const Py_UNICODE *e;
6781
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782 /* Shortcut for single character strings */
6783 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006784 Py_UNICODE_ISALNUM(*p))
6785 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786
6787 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006788 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006789 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006790
6791 e = p + PyUnicode_GET_SIZE(self);
6792 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006793 if (!Py_UNICODE_ISALNUM(*p))
6794 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797}
6798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006800 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
6805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006806unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807{
6808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6809 register const Py_UNICODE *e;
6810
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006813 Py_UNICODE_ISDECIMAL(*p))
6814 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006817 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006818 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 e = p + PyUnicode_GET_SIZE(self);
6821 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006822 if (!Py_UNICODE_ISDECIMAL(*p))
6823 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006831Return True if all characters in S are digits\n\
6832and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
6834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006835unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
6837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6838 register const Py_UNICODE *e;
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 /* Shortcut for single character strings */
6841 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006842 Py_UNICODE_ISDIGIT(*p))
6843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006846 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 e = p + PyUnicode_GET_SIZE(self);
6850 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006851 if (!Py_UNICODE_ISDIGIT(*p))
6852 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006871 Py_UNICODE_ISNUMERIC(*p))
6872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 e = p + PyUnicode_GET_SIZE(self);
6879 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006880 if (!Py_UNICODE_ISNUMERIC(*p))
6881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006887 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888\n\
6889Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006890iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006893unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006895 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899unicode_length(PyUnicodeObject *self)
6900{
6901 return self->length;
6902}
6903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006905 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006907Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006908done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
6910static PyObject *
6911unicode_ljust(PyUnicodeObject *self, PyObject *args)
6912{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006913 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006914 Py_UNICODE fillchar = ' ';
6915
Martin v. Löwis412fb672006-04-13 06:34:32 +00006916 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 return NULL;
6918
Tim Peters7a29bd52001-09-12 03:03:31 +00006919 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 Py_INCREF(self);
6921 return (PyObject*) self;
6922 }
6923
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006924 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006927PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006928 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006930Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006933unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 return fixup(self, fixlower);
6936}
6937
/* Strip modes.  The numeric values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6946
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006947/* externally visible for str.strip(unicode) */
6948PyObject *
6949_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6950{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006951 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6952 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6953 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6954 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6955 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006956
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006957 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006958
Benjamin Peterson857ce152009-01-31 16:29:18 +00006959 i = 0;
6960 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006961 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6962 i++;
6963 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006964 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006965
Benjamin Peterson857ce152009-01-31 16:29:18 +00006966 j = len;
6967 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006968 do {
6969 j--;
6970 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6971 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006972 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006973
Benjamin Peterson857ce152009-01-31 16:29:18 +00006974 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006975 Py_INCREF(self);
6976 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006977 }
6978 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006979 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006980}
6981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006984do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006986 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6987 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006988
Benjamin Peterson857ce152009-01-31 16:29:18 +00006989 i = 0;
6990 if (striptype != RIGHTSTRIP) {
6991 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6992 i++;
6993 }
6994 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006995
Benjamin Peterson857ce152009-01-31 16:29:18 +00006996 j = len;
6997 if (striptype != LEFTSTRIP) {
6998 do {
6999 j--;
7000 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7001 j++;
7002 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007003
Benjamin Peterson857ce152009-01-31 16:29:18 +00007004 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7005 Py_INCREF(self);
7006 return (PyObject*)self;
7007 }
7008 else
7009 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010}
7011
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007012
7013static PyObject *
7014do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007016 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017
Benjamin Peterson857ce152009-01-31 16:29:18 +00007018 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7019 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020
Benjamin Peterson857ce152009-01-31 16:29:18 +00007021 if (sep != NULL && sep != Py_None) {
7022 if (PyUnicode_Check(sep))
7023 return _PyUnicode_XStrip(self, striptype, sep);
7024 else if (PyString_Check(sep)) {
7025 PyObject *res;
7026 sep = PyUnicode_FromObject(sep);
7027 if (sep==NULL)
7028 return NULL;
7029 res = _PyUnicode_XStrip(self, striptype, sep);
7030 Py_DECREF(sep);
7031 return res;
7032 }
7033 else {
7034 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007035 "%s arg must be None, unicode or str",
7036 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007037 return NULL;
7038 }
7039 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040
Benjamin Peterson857ce152009-01-31 16:29:18 +00007041 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007042}
7043
7044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007045PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007046 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007047\n\
7048Return a copy of the string S with leading and trailing\n\
7049whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007050If chars is given and not None, remove characters in chars instead.\n\
7051If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007052
7053static PyObject *
7054unicode_strip(PyUnicodeObject *self, PyObject *args)
7055{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007056 if (PyTuple_GET_SIZE(args) == 0)
7057 return do_strip(self, BOTHSTRIP); /* Common case */
7058 else
7059 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060}
7061
7062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007063PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007064 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065\n\
7066Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007067If chars is given and not None, remove characters in chars instead.\n\
7068If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
7070static PyObject *
7071unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007073 if (PyTuple_GET_SIZE(args) == 0)
7074 return do_strip(self, LEFTSTRIP); /* Common case */
7075 else
7076 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007077}
7078
7079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007080PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082\n\
7083Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007084If chars is given and not None, remove characters in chars instead.\n\
7085If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
7087static PyObject *
7088unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7089{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007090 if (PyTuple_GET_SIZE(args) == 0)
7091 return do_strip(self, RIGHTSTRIP); /* Common case */
7092 else
7093 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094}
7095
7096
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007098unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
7100 PyUnicodeObject *u;
7101 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007102 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007103 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105 if (len < 0)
7106 len = 0;
7107
Tim Peters7a29bd52001-09-12 03:03:31 +00007108 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 /* no repeat, return original string */
7110 Py_INCREF(str);
7111 return (PyObject*) str;
7112 }
Tim Peters8f422462000-09-09 06:13:41 +00007113
7114 /* ensure # of chars needed doesn't overflow int and # of bytes
7115 * needed doesn't overflow size_t
7116 */
7117 nchars = len * str->length;
7118 if (len && nchars / len != str->length) {
7119 PyErr_SetString(PyExc_OverflowError,
7120 "repeated string is too long");
7121 return NULL;
7122 }
7123 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7124 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7125 PyErr_SetString(PyExc_OverflowError,
7126 "repeated string is too long");
7127 return NULL;
7128 }
7129 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 if (!u)
7131 return NULL;
7132
7133 p = u->str;
7134
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007135 if (str->length == 1 && len > 0) {
7136 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007137 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007138 Py_ssize_t done = 0; /* number of characters copied this far */
7139 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007140 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007141 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007142 }
7143 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007144 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007145 Py_UNICODE_COPY(p+done, p, n);
7146 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007147 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150 return (PyObject*) u;
7151}
7152
7153PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007154 PyObject *subobj,
7155 PyObject *replobj,
7156 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157{
7158 PyObject *self;
7159 PyObject *str1;
7160 PyObject *str2;
7161 PyObject *result;
7162
7163 self = PyUnicode_FromObject(obj);
7164 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 str1 = PyUnicode_FromObject(subobj);
7167 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007168 Py_DECREF(self);
7169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 }
7171 str2 = PyUnicode_FromObject(replobj);
7172 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007173 Py_DECREF(self);
7174 Py_DECREF(str1);
7175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
Tim Petersced69f82003-09-16 20:30:58 +00007177 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007178 (PyUnicodeObject *)str1,
7179 (PyUnicodeObject *)str2,
7180 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 Py_DECREF(self);
7182 Py_DECREF(str1);
7183 Py_DECREF(str2);
7184 return result;
7185}
7186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007188 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189\n\
7190Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007191old replaced by new. If the optional argument count is\n\
7192given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193
7194static PyObject*
7195unicode_replace(PyUnicodeObject *self, PyObject *args)
7196{
7197 PyUnicodeObject *str1;
7198 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007199 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 PyObject *result;
7201
Martin v. Löwis18e16552006-02-15 17:27:45 +00007202 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 return NULL;
7204 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7205 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007208 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007209 Py_DECREF(str1);
7210 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213 result = replace(self, str1, str2, maxcount);
7214
7215 Py_DECREF(str1);
7216 Py_DECREF(str2);
7217 return result;
7218}
7219
7220static
7221PyObject *unicode_repr(PyObject *unicode)
7222{
7223 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007224 PyUnicode_GET_SIZE(unicode),
7225 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007228PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007229 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230\n\
7231Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007232such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233arguments start and end are interpreted as in slice notation.\n\
7234\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject *
7238unicode_rfind(PyUnicodeObject *self, PyObject *args)
7239{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007240 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007241 Py_ssize_t start;
7242 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007243 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
Facundo Batista57d56692007-11-16 18:04:14 +00007245 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007248 result = stringlib_rfind_slice(
7249 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7250 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7251 start, end
7252 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
7254 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007255
7256 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257}
7258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007259PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007260 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007262Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
7264static PyObject *
7265unicode_rindex(PyUnicodeObject *self, PyObject *args)
7266{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007267 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007268 Py_ssize_t start;
7269 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007270 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
Facundo Batista57d56692007-11-16 18:04:14 +00007272 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007275 result = stringlib_rfind_slice(
7276 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7277 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7278 start, end
7279 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
7281 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007282
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 if (result < 0) {
7284 PyErr_SetString(PyExc_ValueError, "substring not found");
7285 return NULL;
7286 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007287 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007291 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007293Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007294done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295
7296static PyObject *
7297unicode_rjust(PyUnicodeObject *self, PyObject *args)
7298{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007299 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007300 Py_UNICODE fillchar = ' ';
7301
Martin v. Löwis412fb672006-04-13 06:34:32 +00007302 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 return NULL;
7304
Tim Peters7a29bd52001-09-12 03:03:31 +00007305 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 Py_INCREF(self);
7307 return (PyObject*) self;
7308 }
7309
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007310 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311}
7312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007314unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315{
7316 /* standard clamping */
7317 if (start < 0)
7318 start = 0;
7319 if (end < 0)
7320 end = 0;
7321 if (end > self->length)
7322 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007323 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 /* full slice, return original string */
7325 Py_INCREF(self);
7326 return (PyObject*) self;
7327 }
7328 if (start > end)
7329 start = end;
7330 /* copy slice */
7331 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007332 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
7335PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007336 PyObject *sep,
7337 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338{
7339 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007340
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 s = PyUnicode_FromObject(s);
7342 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007343 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007344 if (sep != NULL) {
7345 sep = PyUnicode_FromObject(sep);
7346 if (sep == NULL) {
7347 Py_DECREF(s);
7348 return NULL;
7349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
7351
7352 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7353
7354 Py_DECREF(s);
7355 Py_XDECREF(sep);
7356 return result;
7357}
7358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007359PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007360 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361\n\
7362Return a list of the words in S, using sep as the\n\
7363delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007364splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007365whitespace string is a separator and empty strings are\n\
7366removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject*
7369unicode_split(PyUnicodeObject *self, PyObject *args)
7370{
7371 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007372 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Martin v. Löwis18e16552006-02-15 17:27:45 +00007374 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 return NULL;
7376
7377 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007378 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007380 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007382 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383}
7384
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007385PyObject *
7386PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7387{
7388 PyObject* str_obj;
7389 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007390 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007391
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007392 str_obj = PyUnicode_FromObject(str_in);
7393 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007394 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007395 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007396 if (!sep_obj) {
7397 Py_DECREF(str_obj);
7398 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007399 }
7400
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007401 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007402 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7403 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7404 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007405
Fredrik Lundhb9479482006-05-26 17:22:38 +00007406 Py_DECREF(sep_obj);
7407 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007408
7409 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007410}
7411
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007412
7413PyObject *
7414PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7415{
7416 PyObject* str_obj;
7417 PyObject* sep_obj;
7418 PyObject* out;
7419
7420 str_obj = PyUnicode_FromObject(str_in);
7421 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007422 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007423 sep_obj = PyUnicode_FromObject(sep_in);
7424 if (!sep_obj) {
7425 Py_DECREF(str_obj);
7426 return NULL;
7427 }
7428
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007429 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007430 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7431 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7432 );
7433
7434 Py_DECREF(sep_obj);
7435 Py_DECREF(str_obj);
7436
7437 return out;
7438}
7439
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007440PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007441 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007442\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007443Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007444the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007445found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007446
7447static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007448unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007449{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007450 return PyUnicode_Partition((PyObject *)self, separator);
7451}
7452
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007453PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007454 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007455\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007456Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007457the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007458separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007459
7460static PyObject*
7461unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7462{
7463 return PyUnicode_RPartition((PyObject *)self, separator);
7464}
7465
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007466PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007467 PyObject *sep,
7468 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007469{
7470 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007471
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007472 s = PyUnicode_FromObject(s);
7473 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007474 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007475 if (sep != NULL) {
7476 sep = PyUnicode_FromObject(sep);
7477 if (sep == NULL) {
7478 Py_DECREF(s);
7479 return NULL;
7480 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007481 }
7482
7483 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7484
7485 Py_DECREF(s);
7486 Py_XDECREF(sep);
7487 return result;
7488}
7489
7490PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007491 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007492\n\
7493Return a list of the words in S, using sep as the\n\
7494delimiter string, starting at the end of the string and\n\
7495working to the front. If maxsplit is given, at most maxsplit\n\
7496splits are done. If sep is not specified, any whitespace string\n\
7497is a separator.");
7498
7499static PyObject*
7500unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7501{
7502 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007503 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007504
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007506 return NULL;
7507
7508 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007509 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007510 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007511 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007512 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007513 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007514}
7515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007516PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007517 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518\n\
7519Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007520Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523static PyObject*
7524unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7525{
Guido van Rossum86662912000-04-11 15:38:46 +00007526 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
Guido van Rossum86662912000-04-11 15:38:46 +00007528 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 return NULL;
7530
Guido van Rossum86662912000-04-11 15:38:46 +00007531 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532}
7533
7534static
7535PyObject *unicode_str(PyUnicodeObject *self)
7536{
Fred Drakee4315f52000-05-09 19:53:39 +00007537 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538}
7539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007541 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542\n\
7543Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007544and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
7546static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007547unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 return fixup(self, fixswapcase);
7550}
7551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007553 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554\n\
7555Return a copy of the string S, where all characters have been mapped\n\
7556through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007557Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7558Unmapped characters are left untouched. Characters mapped to None\n\
7559are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
7561static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007562unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563{
Tim Petersced69f82003-09-16 20:30:58 +00007564 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007565 self->length,
7566 table,
7567 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568}
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007571 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007573Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
7575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007576unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 return fixup(self, fixupper);
7579}
7580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007581PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007582 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583\n\
Georg Brandl98064072008-09-09 19:26:00 +00007584Pad a numeric string S with zeros on the left, to fill a field\n\
7585of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
7587static PyObject *
7588unicode_zfill(PyUnicodeObject *self, PyObject *args)
7589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007590 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 PyUnicodeObject *u;
7592
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 Py_ssize_t width;
7594 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 return NULL;
7596
7597 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007598 if (PyUnicode_CheckExact(self)) {
7599 Py_INCREF(self);
7600 return (PyObject*) self;
7601 }
7602 else
7603 return PyUnicode_FromUnicode(
7604 PyUnicode_AS_UNICODE(self),
7605 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007606 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 }
7608
7609 fill = width - self->length;
7610
7611 u = pad(self, fill, 0, '0');
7612
Walter Dörwald068325e2002-04-15 13:36:47 +00007613 if (u == NULL)
7614 return NULL;
7615
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 if (u->str[fill] == '+' || u->str[fill] == '-') {
7617 /* move sign to beginning of string */
7618 u->str[0] = u->str[fill];
7619 u->str[fill] = '0';
7620 }
7621
7622 return (PyObject*) u;
7623}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
   numfree counter, which is declared outside this function, as a Python
   int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007634 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007636Return True if S starts with the specified prefix, False otherwise.\n\
7637With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007638With optional end, stop comparing S at that position.\n\
7639prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641static PyObject *
7642unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007643 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644{
Georg Brandl24250812006-06-09 18:45:48 +00007645 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007647 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007648 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007649 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
Georg Brandl24250812006-06-09 18:45:48 +00007651 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007652 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7653 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007654 if (PyTuple_Check(subobj)) {
7655 Py_ssize_t i;
7656 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7657 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007658 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007659 if (substring == NULL)
7660 return NULL;
7661 result = tailmatch(self, substring, start, end, -1);
7662 Py_DECREF(substring);
7663 if (result) {
7664 Py_RETURN_TRUE;
7665 }
7666 }
7667 /* nothing matched */
7668 Py_RETURN_FALSE;
7669 }
7670 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007672 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007673 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007675 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676}
7677
7678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007679PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007680 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007682Return True if S ends with the specified suffix, False otherwise.\n\
7683With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007684With optional end, stop comparing S at that position.\n\
7685suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
7687static PyObject *
7688unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007689 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Georg Brandl24250812006-06-09 18:45:48 +00007691 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007694 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007695 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
Georg Brandl24250812006-06-09 18:45:48 +00007697 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007698 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7699 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007700 if (PyTuple_Check(subobj)) {
7701 Py_ssize_t i;
7702 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7703 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007704 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007705 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007706 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007707 result = tailmatch(self, substring, start, end, +1);
7708 Py_DECREF(substring);
7709 if (result) {
7710 Py_RETURN_TRUE;
7711 }
7712 }
7713 Py_RETURN_FALSE;
7714 }
7715 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Georg Brandl24250812006-06-09 18:45:48 +00007719 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007721 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
7724
Eric Smitha9f7d622008-02-17 19:46:49 +00007725/* Implements do_string_format, which is unicode because of stringlib */
7726#include "stringlib/string_format.h"
7727
7728PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007729 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007730\n\
Eric Smith6c840852010-11-06 19:43:44 +00007731Return a formatted version of S, using substitutions from args and kwargs.\n\
7732The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007733
Eric Smithdc13b792008-05-30 18:10:04 +00007734static PyObject *
7735unicode__format__(PyObject *self, PyObject *args)
7736{
7737 PyObject *format_spec;
7738 PyObject *result = NULL;
7739 PyObject *tmp = NULL;
7740
7741 /* If 2.x, convert format_spec to the same type as value */
7742 /* This is to allow things like u''.format('') */
7743 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7744 goto done;
7745 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7746 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007747 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007748 goto done;
7749 }
7750 tmp = PyObject_Unicode(format_spec);
7751 if (tmp == NULL)
7752 goto done;
7753 format_spec = tmp;
7754
7755 result = _PyUnicode_FormatAdvanced(self,
7756 PyUnicode_AS_UNICODE(format_spec),
7757 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007758 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007759 Py_XDECREF(tmp);
7760 return result;
7761}
7762
Eric Smitha9f7d622008-02-17 19:46:49 +00007763PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007764 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007765\n\
Eric Smith6c840852010-11-06 19:43:44 +00007766Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007767
Robert Schuppenies901c9972008-06-10 10:10:31 +00007768static PyObject *
7769unicode__sizeof__(PyUnicodeObject *v)
7770{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007771 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7772 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007773}
7774
7775PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007776 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007777\n\
7778");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007779
7780static PyObject *
7781unicode_getnewargs(PyUnicodeObject *v)
7782{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007783 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007784}
7785
7786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787static PyMethodDef unicode_methods[] = {
7788
7789 /* Order is according to common usage: often used methods should
7790 appear first, since lookup is done sequentially. */
7791
Benjamin Peterson332d7212009-09-18 21:14:55 +00007792 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007793 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7794 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007795 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7797 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7798 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7799 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7800 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7801 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7802 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007803 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007804 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7805 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7806 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007807 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007808 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007809/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7810 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7811 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7812 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007813 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007814 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007815 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007817 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7818 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7819 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7820 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7821 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7822 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7823 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7824 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7825 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7826 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7827 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7828 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7829 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7830 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007831 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007832 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7833 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7834 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7835 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007836 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007837#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007838 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839#endif
7840
7841#if 0
7842 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007843 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844#endif
7845
Benjamin Peterson857ce152009-01-31 16:29:18 +00007846 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 {NULL, NULL}
7848};
7849
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007850static PyObject *
7851unicode_mod(PyObject *v, PyObject *w)
7852{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007853 if (!PyUnicode_Check(v)) {
7854 Py_INCREF(Py_NotImplemented);
7855 return Py_NotImplemented;
7856 }
7857 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007858}
7859
7860static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007861 0, /*nb_add*/
7862 0, /*nb_subtract*/
7863 0, /*nb_multiply*/
7864 0, /*nb_divide*/
7865 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007866};
7867
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007869 (lenfunc) unicode_length, /* sq_length */
7870 PyUnicode_Concat, /* sq_concat */
7871 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7872 (ssizeargfunc) unicode_getitem, /* sq_item */
7873 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7874 0, /* sq_ass_item */
7875 0, /* sq_ass_slice */
7876 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877};
7878
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007879static PyObject*
7880unicode_subscript(PyUnicodeObject* self, PyObject* item)
7881{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007882 if (PyIndex_Check(item)) {
7883 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007884 if (i == -1 && PyErr_Occurred())
7885 return NULL;
7886 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007887 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007888 return unicode_getitem(self, i);
7889 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007890 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007891 Py_UNICODE* source_buf;
7892 Py_UNICODE* result_buf;
7893 PyObject* result;
7894
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007895 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007896 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007897 return NULL;
7898 }
7899
7900 if (slicelength <= 0) {
7901 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007902 } else if (start == 0 && step == 1 && slicelength == self->length &&
7903 PyUnicode_CheckExact(self)) {
7904 Py_INCREF(self);
7905 return (PyObject *)self;
7906 } else if (step == 1) {
7907 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007908 } else {
7909 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007910 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7911 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007912
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007913 if (result_buf == NULL)
7914 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007915
7916 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7917 result_buf[i] = source_buf[cur];
7918 }
Tim Petersced69f82003-09-16 20:30:58 +00007919
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007920 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007921 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007922 return result;
7923 }
7924 } else {
7925 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7926 return NULL;
7927 }
7928}
7929
7930static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007931 (lenfunc)unicode_length, /* mp_length */
7932 (binaryfunc)unicode_subscript, /* mp_subscript */
7933 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007934};
7935
Martin v. Löwis18e16552006-02-15 17:27:45 +00007936static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007938 Py_ssize_t index,
7939 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940{
7941 if (index != 0) {
7942 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007943 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944 return -1;
7945 }
7946 *ptr = (void *) self->str;
7947 return PyUnicode_GET_DATA_SIZE(self);
7948}
7949
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950static Py_ssize_t
7951unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007952 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953{
7954 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007955 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 return -1;
7957}
7958
7959static int
7960unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007961 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
7963 if (lenp)
7964 *lenp = PyUnicode_GET_DATA_SIZE(self);
7965 return 1;
7966}
7967
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007968static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007970 Py_ssize_t index,
7971 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
7973 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007974
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 if (index != 0) {
7976 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007977 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 return -1;
7979 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007980 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007982 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007983 *ptr = (void *) PyString_AS_STRING(str);
7984 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985}
7986
/* Helpers for PyUnicode_Format() */
7988
7989static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007990getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007992 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007994 (*p_argidx)++;
7995 if (arglen < 0)
7996 return args;
7997 else
7998 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
8000 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008001 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 return NULL;
8003}
8004
/* Conversion flag bits collected by PyUnicode_Format() while it parses a
   format specifier; each one corresponds to a flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008012strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008014 register Py_ssize_t i;
8015 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008017 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 return len;
8020}
8021
Neal Norwitzfc76d632006-01-10 06:03:13 +00008022static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008023longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8024{
Tim Peters15231542006-02-16 01:08:01 +00008025 Py_ssize_t result;
8026
Neal Norwitzfc76d632006-01-10 06:03:13 +00008027 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008028 result = strtounicode(buffer, (char *)buffer);
8029 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008030}
8031
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
8035
Mark Dickinson18cfada2009-11-23 18:46:41 +00008036/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8037
8038static PyObject *
8039formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008041 char *p;
8042 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008044
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 x = PyFloat_AsDouble(v);
8046 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008047 return NULL;
8048
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008050 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008051
Mark Dickinson18cfada2009-11-23 18:46:41 +00008052 p = PyOS_double_to_string(x, type, prec,
8053 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8054 if (p == NULL)
8055 return NULL;
8056 result = PyUnicode_FromStringAndSize(p, strlen(p));
8057 PyMem_Free(p);
8058 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059}
8060
Tim Peters38fd5b62000-09-21 05:43:11 +00008061static PyObject*
8062formatlong(PyObject *val, int flags, int prec, int type)
8063{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008064 char *buf;
8065 int i, len;
8066 PyObject *str; /* temporary string object. */
8067 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008068
Benjamin Peterson857ce152009-01-31 16:29:18 +00008069 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8070 if (!str)
8071 return NULL;
8072 result = _PyUnicode_New(len);
8073 if (!result) {
8074 Py_DECREF(str);
8075 return NULL;
8076 }
8077 for (i = 0; i < len; i++)
8078 result->str[i] = buf[i];
8079 result->str[len] = 0;
8080 Py_DECREF(str);
8081 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008082}
8083
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084static int
8085formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008086 size_t buflen,
8087 int flags,
8088 int prec,
8089 int type,
8090 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008092 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008093 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8094 * + 1 + 1
8095 * = 24
8096 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008097 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008098 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 long x;
8100
8101 x = PyInt_AsLong(v);
8102 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008103 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008104 if (x < 0 && type == 'u') {
8105 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008106 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008107 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8108 sign = "-";
8109 else
8110 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008112 prec = 1;
8113
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008114 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8115 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008116 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008117 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008118 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008119 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008120 return -1;
8121 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008122
8123 if ((flags & F_ALT) &&
8124 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008125 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008126 * of issues that cause pain:
8127 * - when 0 is being converted, the C standard leaves off
8128 * the '0x' or '0X', which is inconsistent with other
8129 * %#x/%#X conversions and inconsistent with Python's
8130 * hex() function
8131 * - there are platforms that violate the standard and
8132 * convert 0 with the '0x' or '0X'
8133 * (Metrowerks, Compaq Tru64)
8134 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008135 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008136 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008137 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008138 * We can achieve the desired consistency by inserting our
8139 * own '0x' or '0X' prefix, and substituting %x/%X in place
8140 * of %#x/%#X.
8141 *
8142 * Note that this is the same approach as used in
8143 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008144 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008145 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8146 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008147 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008148 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008149 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8150 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008151 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008152 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008153 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008154 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008155 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008156 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157}
8158
8159static int
8160formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008161 size_t buflen,
8162 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163{
Ezio Melotti32125152010-02-25 17:36:04 +00008164 PyObject *unistr;
8165 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008166 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008167 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008168 if (PyUnicode_GET_SIZE(v) != 1)
8169 goto onError;
8170 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008173 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008174 if (PyString_GET_SIZE(v) != 1)
8175 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008176 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8177 with a UnicodeDecodeError if 'char' is not decodable with the
8178 default encoding (usually ASCII, but it might be something else) */
8179 str = PyString_AS_STRING(v);
8180 if ((unsigned char)str[0] > 0x7F) {
8181 /* the char is not ASCII; try to decode the string using the
8182 default encoding and return -1 to let the UnicodeDecodeError
8183 be raised if the string can't be decoded */
8184 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8185 if (unistr == NULL)
8186 return -1;
8187 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8188 Py_DECREF(unistr);
8189 }
8190 else
8191 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193
8194 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008195 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008197 x = PyInt_AsLong(v);
8198 if (x == -1 && PyErr_Occurred())
8199 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008200#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008201 if (x < 0 || x > 0x10ffff) {
8202 PyErr_SetString(PyExc_OverflowError,
8203 "%c arg not in range(0x110000) "
8204 "(wide Python build)");
8205 return -1;
8206 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008207#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008208 if (x < 0 || x > 0xffff) {
8209 PyErr_SetString(PyExc_OverflowError,
8210 "%c arg not in range(0x10000) "
8211 "(narrow Python build)");
8212 return -1;
8213 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008214#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008215 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 }
8217 buf[1] = '\0';
8218 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008219
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008221 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008222 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008223 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224}
8225
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.
*/
/* Number of Py_UNICODE slots in the scratch buffer that PyUnicode_Format()
   declares for formatint() and formatchar().  The whole expansion is
   parenthesized so that the cast cannot bind to operators next to the
   macro at the point of use (e.g. "sizeof FORMATBUFLEN"). */
#define FORMATBUFLEN ((size_t)120)
8235
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008237 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238{
8239 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008240 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 int args_owned = 0;
8242 PyUnicodeObject *result = NULL;
8243 PyObject *dict = NULL;
8244 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008245
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 PyErr_BadInternalCall();
8248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 }
8250 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008251 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 fmt = PyUnicode_AS_UNICODE(uformat);
8254 fmtcnt = PyUnicode_GET_SIZE(uformat);
8255
8256 reslen = rescnt = fmtcnt + 100;
8257 result = _PyUnicode_New(reslen);
8258 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 res = PyUnicode_AS_UNICODE(result);
8261
8262 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008263 arglen = PyTuple_Size(args);
8264 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 }
8266 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008267 arglen = -1;
8268 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
Christian Heimese93237d2007-12-19 02:37:44 +00008270 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008271 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008272 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
8274 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008275 if (*fmt != '%') {
8276 if (--rescnt < 0) {
8277 rescnt = fmtcnt + 100;
8278 reslen += rescnt;
8279 if (_PyUnicode_Resize(&result, reslen) < 0)
8280 goto onError;
8281 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8282 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008283 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008284 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008285 }
8286 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 /* Got a format specifier */
8288 int flags = 0;
8289 Py_ssize_t width = -1;
8290 int prec = -1;
8291 Py_UNICODE c = '\0';
8292 Py_UNICODE fill;
8293 int isnumok;
8294 PyObject *v = NULL;
8295 PyObject *temp = NULL;
8296 Py_UNICODE *pbuf;
8297 Py_UNICODE sign;
8298 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008299 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008300
8301 fmt++;
8302 if (*fmt == '(') {
8303 Py_UNICODE *keystart;
8304 Py_ssize_t keylen;
8305 PyObject *key;
8306 int pcount = 1;
8307
8308 if (dict == NULL) {
8309 PyErr_SetString(PyExc_TypeError,
8310 "format requires a mapping");
8311 goto onError;
8312 }
8313 ++fmt;
8314 --fmtcnt;
8315 keystart = fmt;
8316 /* Skip over balanced parentheses */
8317 while (pcount > 0 && --fmtcnt >= 0) {
8318 if (*fmt == ')')
8319 --pcount;
8320 else if (*fmt == '(')
8321 ++pcount;
8322 fmt++;
8323 }
8324 keylen = fmt - keystart - 1;
8325 if (fmtcnt < 0 || pcount > 0) {
8326 PyErr_SetString(PyExc_ValueError,
8327 "incomplete format key");
8328 goto onError;
8329 }
8330#if 0
8331 /* keys are converted to strings using UTF-8 and
8332 then looked up since Python uses strings to hold
8333 variables names etc. in its namespaces and we
8334 wouldn't want to break common idioms. */
8335 key = PyUnicode_EncodeUTF8(keystart,
8336 keylen,
8337 NULL);
8338#else
8339 key = PyUnicode_FromUnicode(keystart, keylen);
8340#endif
8341 if (key == NULL)
8342 goto onError;
8343 if (args_owned) {
8344 Py_DECREF(args);
8345 args_owned = 0;
8346 }
8347 args = PyObject_GetItem(dict, key);
8348 Py_DECREF(key);
8349 if (args == NULL) {
8350 goto onError;
8351 }
8352 args_owned = 1;
8353 arglen = -1;
8354 argidx = -2;
8355 }
8356 while (--fmtcnt >= 0) {
8357 switch (c = *fmt++) {
8358 case '-': flags |= F_LJUST; continue;
8359 case '+': flags |= F_SIGN; continue;
8360 case ' ': flags |= F_BLANK; continue;
8361 case '#': flags |= F_ALT; continue;
8362 case '0': flags |= F_ZERO; continue;
8363 }
8364 break;
8365 }
8366 if (c == '*') {
8367 v = getnextarg(args, arglen, &argidx);
8368 if (v == NULL)
8369 goto onError;
8370 if (!PyInt_Check(v)) {
8371 PyErr_SetString(PyExc_TypeError,
8372 "* wants int");
8373 goto onError;
8374 }
8375 width = PyInt_AsLong(v);
8376 if (width < 0) {
8377 flags |= F_LJUST;
8378 width = -width;
8379 }
8380 if (--fmtcnt >= 0)
8381 c = *fmt++;
8382 }
8383 else if (c >= '0' && c <= '9') {
8384 width = c - '0';
8385 while (--fmtcnt >= 0) {
8386 c = *fmt++;
8387 if (c < '0' || c > '9')
8388 break;
8389 if ((width*10) / 10 != width) {
8390 PyErr_SetString(PyExc_ValueError,
8391 "width too big");
8392 goto onError;
8393 }
8394 width = width*10 + (c - '0');
8395 }
8396 }
8397 if (c == '.') {
8398 prec = 0;
8399 if (--fmtcnt >= 0)
8400 c = *fmt++;
8401 if (c == '*') {
8402 v = getnextarg(args, arglen, &argidx);
8403 if (v == NULL)
8404 goto onError;
8405 if (!PyInt_Check(v)) {
8406 PyErr_SetString(PyExc_TypeError,
8407 "* wants int");
8408 goto onError;
8409 }
8410 prec = PyInt_AsLong(v);
8411 if (prec < 0)
8412 prec = 0;
8413 if (--fmtcnt >= 0)
8414 c = *fmt++;
8415 }
8416 else if (c >= '0' && c <= '9') {
8417 prec = c - '0';
8418 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008419 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008420 if (c < '0' || c > '9')
8421 break;
8422 if ((prec*10) / 10 != prec) {
8423 PyErr_SetString(PyExc_ValueError,
8424 "prec too big");
8425 goto onError;
8426 }
8427 prec = prec*10 + (c - '0');
8428 }
8429 }
8430 } /* prec */
8431 if (fmtcnt >= 0) {
8432 if (c == 'h' || c == 'l' || c == 'L') {
8433 if (--fmtcnt >= 0)
8434 c = *fmt++;
8435 }
8436 }
8437 if (fmtcnt < 0) {
8438 PyErr_SetString(PyExc_ValueError,
8439 "incomplete format");
8440 goto onError;
8441 }
8442 if (c != '%') {
8443 v = getnextarg(args, arglen, &argidx);
8444 if (v == NULL)
8445 goto onError;
8446 }
8447 sign = 0;
8448 fill = ' ';
8449 switch (c) {
8450
8451 case '%':
8452 pbuf = formatbuf;
8453 /* presume that buffer length is at least 1 */
8454 pbuf[0] = '%';
8455 len = 1;
8456 break;
8457
8458 case 's':
8459 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008460 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008461 temp = v;
8462 Py_INCREF(temp);
8463 }
8464 else {
8465 PyObject *unicode;
8466 if (c == 's')
8467 temp = PyObject_Unicode(v);
8468 else
8469 temp = PyObject_Repr(v);
8470 if (temp == NULL)
8471 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008472 if (PyUnicode_Check(temp))
8473 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008474 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008475 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008476 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8477 PyString_GET_SIZE(temp),
8478 NULL,
8479 "strict");
8480 Py_DECREF(temp);
8481 temp = unicode;
8482 if (temp == NULL)
8483 goto onError;
8484 }
8485 else {
8486 Py_DECREF(temp);
8487 PyErr_SetString(PyExc_TypeError,
8488 "%s argument has non-string str()");
8489 goto onError;
8490 }
8491 }
8492 pbuf = PyUnicode_AS_UNICODE(temp);
8493 len = PyUnicode_GET_SIZE(temp);
8494 if (prec >= 0 && len > prec)
8495 len = prec;
8496 break;
8497
8498 case 'i':
8499 case 'd':
8500 case 'u':
8501 case 'o':
8502 case 'x':
8503 case 'X':
8504 if (c == 'i')
8505 c = 'd';
8506 isnumok = 0;
8507 if (PyNumber_Check(v)) {
8508 PyObject *iobj=NULL;
8509
8510 if (PyInt_Check(v) || (PyLong_Check(v))) {
8511 iobj = v;
8512 Py_INCREF(iobj);
8513 }
8514 else {
8515 iobj = PyNumber_Int(v);
8516 if (iobj==NULL) iobj = PyNumber_Long(v);
8517 }
8518 if (iobj!=NULL) {
8519 if (PyInt_Check(iobj)) {
8520 isnumok = 1;
8521 pbuf = formatbuf;
8522 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8523 flags, prec, c, iobj);
8524 Py_DECREF(iobj);
8525 if (len < 0)
8526 goto onError;
8527 sign = 1;
8528 }
8529 else if (PyLong_Check(iobj)) {
8530 isnumok = 1;
8531 temp = formatlong(iobj, flags, prec, c);
8532 Py_DECREF(iobj);
8533 if (!temp)
8534 goto onError;
8535 pbuf = PyUnicode_AS_UNICODE(temp);
8536 len = PyUnicode_GET_SIZE(temp);
8537 sign = 1;
8538 }
8539 else {
8540 Py_DECREF(iobj);
8541 }
8542 }
8543 }
8544 if (!isnumok) {
8545 PyErr_Format(PyExc_TypeError,
8546 "%%%c format: a number is required, "
8547 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8548 goto onError;
8549 }
8550 if (flags & F_ZERO)
8551 fill = '0';
8552 break;
8553
8554 case 'e':
8555 case 'E':
8556 case 'f':
8557 case 'F':
8558 case 'g':
8559 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008560 temp = formatfloat(v, flags, prec, c);
8561 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008562 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008563 pbuf = PyUnicode_AS_UNICODE(temp);
8564 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008565 sign = 1;
8566 if (flags & F_ZERO)
8567 fill = '0';
8568 break;
8569
8570 case 'c':
8571 pbuf = formatbuf;
8572 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8573 if (len < 0)
8574 goto onError;
8575 break;
8576
8577 default:
8578 PyErr_Format(PyExc_ValueError,
8579 "unsupported format character '%c' (0x%x) "
8580 "at index %zd",
8581 (31<=c && c<=126) ? (char)c : '?',
8582 (int)c,
8583 (Py_ssize_t)(fmt - 1 -
8584 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008585 goto onError;
8586 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008587 if (sign) {
8588 if (*pbuf == '-' || *pbuf == '+') {
8589 sign = *pbuf++;
8590 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008591 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008592 else if (flags & F_SIGN)
8593 sign = '+';
8594 else if (flags & F_BLANK)
8595 sign = ' ';
8596 else
8597 sign = 0;
8598 }
8599 if (width < len)
8600 width = len;
8601 if (rescnt - (sign != 0) < width) {
8602 reslen -= rescnt;
8603 rescnt = width + fmtcnt + 100;
8604 reslen += rescnt;
8605 if (reslen < 0) {
8606 Py_XDECREF(temp);
8607 PyErr_NoMemory();
8608 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008609 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008610 if (_PyUnicode_Resize(&result, reslen) < 0) {
8611 Py_XDECREF(temp);
8612 goto onError;
8613 }
8614 res = PyUnicode_AS_UNICODE(result)
8615 + reslen - rescnt;
8616 }
8617 if (sign) {
8618 if (fill != ' ')
8619 *res++ = sign;
8620 rescnt--;
8621 if (width > len)
8622 width--;
8623 }
8624 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8625 assert(pbuf[0] == '0');
8626 assert(pbuf[1] == c);
8627 if (fill != ' ') {
8628 *res++ = *pbuf++;
8629 *res++ = *pbuf++;
8630 }
8631 rescnt -= 2;
8632 width -= 2;
8633 if (width < 0)
8634 width = 0;
8635 len -= 2;
8636 }
8637 if (width > len && !(flags & F_LJUST)) {
8638 do {
8639 --rescnt;
8640 *res++ = fill;
8641 } while (--width > len);
8642 }
8643 if (fill == ' ') {
8644 if (sign)
8645 *res++ = sign;
8646 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8647 assert(pbuf[0] == '0');
8648 assert(pbuf[1] == c);
8649 *res++ = *pbuf++;
8650 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008651 }
8652 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008653 Py_UNICODE_COPY(res, pbuf, len);
8654 res += len;
8655 rescnt -= len;
8656 while (--width >= len) {
8657 --rescnt;
8658 *res++ = ' ';
8659 }
8660 if (dict && (argidx < arglen) && c != '%') {
8661 PyErr_SetString(PyExc_TypeError,
8662 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008663 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008664 goto onError;
8665 }
8666 Py_XDECREF(temp);
8667 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 } /* until end */
8669 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008670 PyErr_SetString(PyExc_TypeError,
8671 "not all arguments converted during string formatting");
8672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 }
8674
Thomas Woutersa96affe2006-03-12 00:29:36 +00008675 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008678 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 }
8680 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 return (PyObject *)result;
8682
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 Py_XDECREF(result);
8685 Py_DECREF(uformat);
8686 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008687 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 }
8689 return NULL;
8690}
8691
8692static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 (readbufferproc) unicode_buffer_getreadbuf,
8694 (writebufferproc) unicode_buffer_getwritebuf,
8695 (segcountproc) unicode_buffer_getsegcount,
8696 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697};
8698
Jeremy Hylton938ace62002-07-17 16:30:39 +00008699static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008700unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8701
Tim Peters6d6c1a32001-08-02 04:15:00 +00008702static PyObject *
8703unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8704{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008705 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008706 static char *kwlist[] = {"string", "encoding", "errors", 0};
8707 char *encoding = NULL;
8708 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008709
Benjamin Peterson857ce152009-01-31 16:29:18 +00008710 if (type != &PyUnicode_Type)
8711 return unicode_subtype_new(type, args, kwds);
8712 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008713 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008714 return NULL;
8715 if (x == NULL)
8716 return (PyObject *)_PyUnicode_New(0);
8717 if (encoding == NULL && errors == NULL)
8718 return PyObject_Unicode(x);
8719 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008720 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008721}
8722
Guido van Rossume023fe02001-08-30 03:12:59 +00008723static PyObject *
8724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8725{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008726 PyUnicodeObject *tmp, *pnew;
8727 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008728
Benjamin Peterson857ce152009-01-31 16:29:18 +00008729 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8730 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8731 if (tmp == NULL)
8732 return NULL;
8733 assert(PyUnicode_Check(tmp));
8734 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8735 if (pnew == NULL) {
8736 Py_DECREF(tmp);
8737 return NULL;
8738 }
8739 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8740 if (pnew->str == NULL) {
8741 _Py_ForgetReference((PyObject *)pnew);
8742 PyObject_Del(pnew);
8743 Py_DECREF(tmp);
8744 return PyErr_NoMemory();
8745 }
8746 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8747 pnew->length = n;
8748 pnew->hash = tmp->hash;
8749 Py_DECREF(tmp);
8750 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008751}
8752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008753PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008754 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008755\n\
8756Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008757encoding defaults to the current default string encoding.\n\
8758errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008761 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008762 "unicode", /* tp_name */
8763 sizeof(PyUnicodeObject), /* tp_size */
8764 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008766 (destructor)unicode_dealloc, /* tp_dealloc */
8767 0, /* tp_print */
8768 0, /* tp_getattr */
8769 0, /* tp_setattr */
8770 0, /* tp_compare */
8771 unicode_repr, /* tp_repr */
8772 &unicode_as_number, /* tp_as_number */
8773 &unicode_as_sequence, /* tp_as_sequence */
8774 &unicode_as_mapping, /* tp_as_mapping */
8775 (hashfunc) unicode_hash, /* tp_hash*/
8776 0, /* tp_call*/
8777 (reprfunc) unicode_str, /* tp_str */
8778 PyObject_GenericGetAttr, /* tp_getattro */
8779 0, /* tp_setattro */
8780 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008781 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008782 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008783 unicode_doc, /* tp_doc */
8784 0, /* tp_traverse */
8785 0, /* tp_clear */
8786 PyUnicode_RichCompare, /* tp_richcompare */
8787 0, /* tp_weaklistoffset */
8788 0, /* tp_iter */
8789 0, /* tp_iternext */
8790 unicode_methods, /* tp_methods */
8791 0, /* tp_members */
8792 0, /* tp_getset */
8793 &PyBaseString_Type, /* tp_base */
8794 0, /* tp_dict */
8795 0, /* tp_descr_get */
8796 0, /* tp_descr_set */
8797 0, /* tp_dictoffset */
8798 0, /* tp_init */
8799 0, /* tp_alloc */
8800 unicode_new, /* tp_new */
8801 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802};
8803
8804/* Initialize the Unicode implementation */
8805
Thomas Wouters78890102000-07-22 19:25:51 +00008806void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008808 int i;
8809
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008810 /* XXX - move this array to unicodectype.c ? */
8811 Py_UNICODE linebreak[] = {
8812 0x000A, /* LINE FEED */
8813 0x000D, /* CARRIAGE RETURN */
8814 0x001C, /* FILE SEPARATOR */
8815 0x001D, /* GROUP SEPARATOR */
8816 0x001E, /* RECORD SEPARATOR */
8817 0x0085, /* NEXT LINE */
8818 0x2028, /* LINE SEPARATOR */
8819 0x2029, /* PARAGRAPH SEPARATOR */
8820 };
8821
Fred Drakee4315f52000-05-09 19:53:39 +00008822 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008823 free_list = NULL;
8824 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008826 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008827 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008828
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008829 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008830 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008831 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008832 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008833 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008834
8835 /* initialize the linebreak bloom filter */
8836 bloom_linebreak = make_bloom_mask(
8837 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8838 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008839
8840 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841}
8842
8843/* Finalize the Unicode implementation */
8844
Christian Heimes3b718a72008-02-14 12:47:33 +00008845int
8846PyUnicode_ClearFreeList(void)
8847{
8848 int freelist_size = numfree;
8849 PyUnicodeObject *u;
8850
8851 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008852 PyUnicodeObject *v = u;
8853 u = *(PyUnicodeObject **)u;
8854 if (v->str)
8855 PyObject_DEL(v->str);
8856 Py_XDECREF(v->defenc);
8857 PyObject_Del(v);
8858 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008859 }
8860 free_list = NULL;
8861 assert(numfree == 0);
8862 return freelist_size;
8863}
8864
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865void
Thomas Wouters78890102000-07-22 19:25:51 +00008866_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008868 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008870 Py_XDECREF(unicode_empty);
8871 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008872
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008873 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008874 if (unicode_latin1[i]) {
8875 Py_DECREF(unicode_latin1[i]);
8876 unicode_latin1[i] = NULL;
8877 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008878 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008879 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008881
Anthony Baxterac6bd462006-04-13 02:06:09 +00008882#ifdef __cplusplus
8883}
8884#endif