blob: 47249cb7fe25b8730635c6f7db9e8dbbe30416f9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson339f8c62009-01-31 22:25:08 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: a table
   indexed by ASCII code point (0..127) holding 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same idea for line breaks: a table indexed by ASCII code point
   (0..127) holding 1 for characters treated as line breaks. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
190
191/* the linebreak mask is set up by Unicode_Init below */
192
/* Integer type that holds one bloom bitmask (bits 0..31 are used). */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is done on an unsigned long constant: shifting the
   plain int `1` left by 31 would overflow a signed int, which is
   undefined behaviour. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ASCII characters are answered from the
   ascii_linebreak table; anything else must pass the bloom mask before
   the full Py_UNICODE_ISLINEBREAK() check is made. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* True when `chr` occurs in `set`: the cheap bloom test on `mask` runs
   first and the exact scan is only made if it passes.  The expansion is
   wrapped in parentheses so that the macro behaves as a single operand,
   e.g. under a leading `!`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000247 if (unicode == unicode_empty ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
319 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000337 PyErr_NoMemory();
338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Benjamin Peterson828a7062008-12-27 17:05:29 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Benjamin Peterson828a7062008-12-27 17:05:29 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Benjamin Peterson828a7062008-12-27 17:05:29 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000443 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000492 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
532PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534{
535 PyUnicodeObject *unicode;
536
537 if (w == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000538 PyErr_BadInternalCall();
539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540 }
541
542 unicode = _PyUnicode_New(size);
543 if (!unicode)
544 return NULL;
545
546 /* Copy the wchar_t data into the new object */
547#ifdef HAVE_USABLE_WCHAR_T
548 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000549#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000551 register Py_UNICODE *u;
552 register Py_ssize_t i;
553 u = PyUnicode_AS_UNICODE(unicode);
554 for (i = size; i > 0; i--)
555 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000556 }
557#endif
558
559 return (PyObject *)unicode;
560}
561
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000562static void
563makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
564{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000565 *fmt++ = '%';
566 if (width) {
567 if (zeropad)
568 *fmt++ = '0';
569 fmt += sprintf(fmt, "%d", width);
570 }
571 if (precision)
572 fmt += sprintf(fmt, ".%d", precision);
573 if (longflag)
574 *fmt++ = 'l';
575 else if (size_tflag) {
576 char *f = PY_FORMAT_SIZE_T;
577 while (*f)
578 *fmt++ = *f++;
579 }
580 *fmt++ = c;
581 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000582}
583
/* Append the NUL-terminated `string` at the output cursor `s`, using
   `copy` as the read cursor; both names must be variables in scope at
   the point of use.  `s` ends up just past the last character written
   and no terminator is stored. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
585
586PyObject *
587PyUnicode_FromFormatV(const char *format, va_list vargs)
588{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000589 va_list count;
590 Py_ssize_t callcount = 0;
591 PyObject **callresults = NULL;
592 PyObject **callresult = NULL;
593 Py_ssize_t n = 0;
594 int width = 0;
595 int precision = 0;
596 int zeropad;
597 const char* f;
598 Py_UNICODE *s;
599 PyObject *string;
600 /* used by sprintf */
601 char buffer[21];
602 /* use abuffer instead of buffer, if we need more space
603 * (which can happen if there's a format specifier with width). */
604 char *abuffer = NULL;
605 char *realbuffer;
606 Py_ssize_t abuffersize = 0;
607 char fmt[60]; /* should be enough for %0width.precisionld */
608 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000609
610#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000611 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000612#else
613#ifdef __va_copy
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000614 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000615#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000616 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000617#endif
618#endif
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000619 /* step 1: count the number of %S/%R/%s format specifications
620 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
621 * objects once during step 3 and put the result in an array) */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000622 for (f = format; *f; f++) {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000623 if (*f == '%') {
624 if (*(f+1)=='%')
625 continue;
Walter Dörwald67032252009-05-03 22:46:50 +0000626 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000627 ++callcount;
628 while (isdigit((unsigned)*f))
629 width = (width*10) + *f++ - '0';
630 while (*++f && *f != '%' && !isalpha((unsigned)*f))
631 ;
632 if (*f == 's')
633 ++callcount;
634 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000635 }
636 /* step 2: allocate memory for the results of
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000637 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000638 if (callcount) {
639 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
640 if (!callresults) {
641 PyErr_NoMemory();
642 return NULL;
643 }
644 callresult = callresults;
645 }
646 /* step 3: figure out how large a buffer we need */
647 for (f = format; *f; f++) {
648 if (*f == '%') {
649 const char* p = f;
650 width = 0;
651 while (isdigit((unsigned)*f))
652 width = (width*10) + *f++ - '0';
653 while (*++f && *f != '%' && !isalpha((unsigned)*f))
654 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000655
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000656 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
657 * they don't affect the amount of space we reserve.
658 */
659 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000660 (f[1] == 'd' || f[1] == 'u'))
661 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000662
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000663 switch (*f) {
664 case 'c':
665 (void)va_arg(count, int);
666 /* fall through... */
667 case '%':
668 n++;
669 break;
670 case 'd': case 'u': case 'i': case 'x':
671 (void) va_arg(count, int);
672 /* 20 bytes is enough to hold a 64-bit
673 integer. Decimal takes the most space.
674 This isn't enough for octal.
675 If a width is specified we need more
676 (which we allocate later). */
677 if (width < 20)
678 width = 20;
679 n += width;
680 if (abuffersize < width)
681 abuffersize = width;
682 break;
683 case 's':
684 {
685 /* UTF-8 */
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000686 unsigned char *s = va_arg(count, unsigned char*);
687 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
688 if (!str)
689 goto fail;
690 n += PyUnicode_GET_SIZE(str);
691 /* Remember the str and switch to the next slot */
692 *callresult++ = str;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000693 break;
694 }
695 case 'U':
696 {
697 PyObject *obj = va_arg(count, PyObject *);
698 assert(obj && PyUnicode_Check(obj));
699 n += PyUnicode_GET_SIZE(obj);
700 break;
701 }
702 case 'V':
703 {
704 PyObject *obj = va_arg(count, PyObject *);
705 const char *str = va_arg(count, const char *);
706 assert(obj || str);
707 assert(!obj || PyUnicode_Check(obj));
708 if (obj)
709 n += PyUnicode_GET_SIZE(obj);
710 else
711 n += strlen(str);
712 break;
713 }
714 case 'S':
715 {
716 PyObject *obj = va_arg(count, PyObject *);
717 PyObject *str;
718 assert(obj);
719 str = PyObject_Str(obj);
720 if (!str)
721 goto fail;
722 n += PyUnicode_GET_SIZE(str);
723 /* Remember the str and switch to the next slot */
724 *callresult++ = str;
725 break;
726 }
727 case 'R':
728 {
729 PyObject *obj = va_arg(count, PyObject *);
730 PyObject *repr;
731 assert(obj);
732 repr = PyObject_Repr(obj);
733 if (!repr)
734 goto fail;
735 n += PyUnicode_GET_SIZE(repr);
736 /* Remember the repr and switch to the next slot */
737 *callresult++ = repr;
738 break;
739 }
740 case 'p':
741 (void) va_arg(count, int);
742 /* maximum 64-bit pointer representation:
743 * 0xffffffffffffffff
744 * so 19 characters is enough.
745 * XXX I count 18 -- what's the extra for?
746 */
747 n += 19;
748 break;
749 default:
750 /* if we stumble upon an unknown
751 formatting code, copy the rest of
752 the format string to the output
753 string. (we cannot just skip the
754 code, since there's no way to know
755 what's in the argument list) */
756 n += strlen(p);
757 goto expand;
758 }
759 } else
760 n++;
761 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000762 expand:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000763 if (abuffersize > 20) {
764 abuffer = PyObject_Malloc(abuffersize);
765 if (!abuffer) {
766 PyErr_NoMemory();
767 goto fail;
768 }
769 realbuffer = abuffer;
770 }
771 else
772 realbuffer = buffer;
773 /* step 4: fill the buffer */
774 /* Since we've analyzed how much space we need for the worst case,
775 we don't have to resize the string.
776 There can be no errors beyond this point. */
777 string = PyUnicode_FromUnicode(NULL, n);
778 if (!string)
779 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000780
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000781 s = PyUnicode_AS_UNICODE(string);
782 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000783
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000784 for (f = format; *f; f++) {
785 if (*f == '%') {
786 const char* p = f++;
787 int longflag = 0;
788 int size_tflag = 0;
789 zeropad = (*f == '0');
790 /* parse the width.precision part */
791 width = 0;
792 while (isdigit((unsigned)*f))
793 width = (width*10) + *f++ - '0';
794 precision = 0;
795 if (*f == '.') {
796 f++;
797 while (isdigit((unsigned)*f))
798 precision = (precision*10) + *f++ - '0';
799 }
800 /* handle the long flag, but only for %ld and %lu.
801 others can be added when necessary. */
802 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
803 longflag = 1;
804 ++f;
805 }
806 /* handle the size_t flag. */
807 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
808 size_tflag = 1;
809 ++f;
810 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000811
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000812 switch (*f) {
813 case 'c':
814 *s++ = va_arg(vargs, int);
815 break;
816 case 'd':
817 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
818 if (longflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, long));
820 else if (size_tflag)
821 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
822 else
823 sprintf(realbuffer, fmt, va_arg(vargs, int));
824 appendstring(realbuffer);
825 break;
826 case 'u':
827 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
828 if (longflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
830 else if (size_tflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
832 else
833 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
834 appendstring(realbuffer);
835 break;
836 case 'i':
837 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
838 sprintf(realbuffer, fmt, va_arg(vargs, int));
839 appendstring(realbuffer);
840 break;
841 case 'x':
842 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
843 sprintf(realbuffer, fmt, va_arg(vargs, int));
844 appendstring(realbuffer);
845 break;
846 case 's':
847 {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000848 /* unused, since we already have the result */
849 (void) va_arg(vargs, char *);
850 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
851 PyUnicode_GET_SIZE(*callresult));
852 s += PyUnicode_GET_SIZE(*callresult);
853 /* We're done with the unicode()/repr() => forget it */
854 Py_DECREF(*callresult);
855 /* switch to next unicode()/repr() result */
856 ++callresult;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000857 break;
858 }
859 case 'U':
860 {
861 PyObject *obj = va_arg(vargs, PyObject *);
862 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
863 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
864 s += size;
865 break;
866 }
867 case 'V':
868 {
869 PyObject *obj = va_arg(vargs, PyObject *);
870 const char *str = va_arg(vargs, const char *);
871 if (obj) {
872 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
873 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
874 s += size;
875 } else {
876 appendstring(str);
877 }
878 break;
879 }
880 case 'S':
881 case 'R':
882 {
883 Py_UNICODE *ucopy;
884 Py_ssize_t usize;
885 Py_ssize_t upos;
886 /* unused, since we already have the result */
887 (void) va_arg(vargs, PyObject *);
888 ucopy = PyUnicode_AS_UNICODE(*callresult);
889 usize = PyUnicode_GET_SIZE(*callresult);
890 for (upos = 0; upos<usize;)
891 *s++ = ucopy[upos++];
892 /* We're done with the unicode()/repr() => forget it */
893 Py_DECREF(*callresult);
894 /* switch to next unicode()/repr() result */
895 ++callresult;
896 break;
897 }
898 case 'p':
899 sprintf(buffer, "%p", va_arg(vargs, void*));
900 /* %p is ill-defined: ensure leading 0x. */
901 if (buffer[1] == 'X')
902 buffer[1] = 'x';
903 else if (buffer[1] != 'x') {
904 memmove(buffer+2, buffer, strlen(buffer)+1);
905 buffer[0] = '0';
906 buffer[1] = 'x';
907 }
908 appendstring(buffer);
909 break;
910 case '%':
911 *s++ = '%';
912 break;
913 default:
914 appendstring(p);
915 goto end;
916 }
917 } else
918 *s++ = *f;
919 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000920
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000921 end:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000922 if (callresults)
923 PyObject_Free(callresults);
924 if (abuffer)
925 PyObject_Free(abuffer);
926 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
927 return string;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000928 fail:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000929 if (callresults) {
930 PyObject **callresult2 = callresults;
931 while (callresult2 < callresult) {
932 Py_DECREF(*callresult2);
933 ++callresult2;
934 }
935 PyObject_Free(callresults);
936 }
937 if (abuffer)
938 PyObject_Free(abuffer);
939 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000940}
941
942#undef appendstring
943
944PyObject *
945PyUnicode_FromFormat(const char *format, ...)
946{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000947 PyObject* ret;
948 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000949
950#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000951 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000952#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000953 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000954#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000955 ret = PyUnicode_FromFormatV(format, vargs);
956 va_end(vargs);
957 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000958}
959
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000961 wchar_t *w,
962 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963{
964 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000965 PyErr_BadInternalCall();
966 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000968
969 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000971 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000972
Guido van Rossumd57fd912000-03-10 22:53:23 +0000973#ifdef HAVE_USABLE_WCHAR_T
974 memcpy(w, unicode->str, size * sizeof(wchar_t));
975#else
976 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000977 register Py_UNICODE *u;
978 register Py_ssize_t i;
979 u = PyUnicode_AS_UNICODE(unicode);
980 for (i = size; i > 0; i--)
981 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982 }
983#endif
984
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000985 if (size > PyUnicode_GET_SIZE(unicode))
986 return PyUnicode_GET_SIZE(unicode);
987 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000988 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989}
990
991#endif
992
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000993PyObject *PyUnicode_FromOrdinal(int ordinal)
994{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000995 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000996
997#ifdef Py_UNICODE_WIDE
998 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000999 PyErr_SetString(PyExc_ValueError,
1000 "unichr() arg not in range(0x110000) "
1001 "(wide Python build)");
1002 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001003 }
1004#else
1005 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001006 PyErr_SetString(PyExc_ValueError,
1007 "unichr() arg not in range(0x10000) "
1008 "(narrow Python build)");
1009 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001010 }
1011#endif
1012
Hye-Shik Chang40574832004-04-06 07:24:51 +00001013 s[0] = (Py_UNICODE)ordinal;
1014 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001015}
1016
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017PyObject *PyUnicode_FromObject(register PyObject *obj)
1018{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001019 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001020 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001022 Py_INCREF(obj);
1023 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001024 }
1025 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001026 /* For a Unicode subtype that's not a Unicode object,
1027 return a true Unicode object with the same data. */
1028 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1029 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001030 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001031 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1032}
1033
1034PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001035 const char *encoding,
1036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001037{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001038 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001039 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001040 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (obj == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001043 PyErr_BadInternalCall();
1044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001046
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001047#if 0
1048 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001049 that no encodings is given and then redirect to
1050 PyObject_Unicode() which then applies the additional logic for
1051 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001052
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001053 NOTE: This API should really only be used for object which
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001054 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001055
1056 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001057 if (PyUnicode_Check(obj)) {
1058 if (encoding) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001059 PyErr_SetString(PyExc_TypeError,
1060 "decoding Unicode is not supported");
1061 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001062 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001063 return PyObject_Unicode(obj);
1064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065#else
1066 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001067 PyErr_SetString(PyExc_TypeError,
1068 "decoding Unicode is not supported");
1069 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001070 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001071#endif
1072
1073 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001074 if (PyString_Check(obj)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001075 s = PyString_AS_STRING(obj);
1076 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001077 }
Christian Heimes3497f942008-05-26 12:29:14 +00001078 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001079 /* Python 2.x specific */
1080 PyErr_Format(PyExc_TypeError,
1081 "decoding bytearray is not supported");
1082 return NULL;
1083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001085 /* Overwrite the error message with something more useful in
1086 case of a TypeError. */
1087 if (PyErr_ExceptionMatches(PyExc_TypeError))
1088 PyErr_Format(PyExc_TypeError,
1089 "coercing to Unicode: need string or buffer, "
1090 "%.80s found",
1091 Py_TYPE(obj)->tp_name);
1092 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001093 }
Tim Petersced69f82003-09-16 20:30:58 +00001094
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (len == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001097 Py_INCREF(unicode_empty);
1098 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 }
Tim Petersced69f82003-09-16 20:30:58 +00001100 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001101 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001102
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001103 return v;
1104
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001105 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107}
1108
1109PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001110 Py_ssize_t size,
1111 const char *encoding,
1112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113{
1114 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001115
1116 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001117 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001118
1119 /* Shortcuts for common default encodings */
1120 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001122 else if (strcmp(encoding, "latin-1") == 0)
1123 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001124#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1125 else if (strcmp(encoding, "mbcs") == 0)
1126 return PyUnicode_DecodeMBCS(s, size, errors);
1127#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001128 else if (strcmp(encoding, "ascii") == 0)
1129 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130
1131 /* Decode via the codec registry */
1132 buffer = PyBuffer_FromMemory((void *)s, size);
1133 if (buffer == NULL)
1134 goto onError;
1135 unicode = PyCodec_Decode(buffer, encoding, errors);
1136 if (unicode == NULL)
1137 goto onError;
1138 if (!PyUnicode_Check(unicode)) {
1139 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001140 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001141 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 Py_DECREF(unicode);
1143 goto onError;
1144 }
1145 Py_DECREF(buffer);
1146 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001147
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 Py_XDECREF(buffer);
1150 return NULL;
1151}
1152
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001153PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1154 const char *encoding,
1155 const char *errors)
1156{
1157 PyObject *v;
1158
1159 if (!PyUnicode_Check(unicode)) {
1160 PyErr_BadArgument();
1161 goto onError;
1162 }
1163
1164 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001165 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001166
1167 /* Decode via the codec registry */
1168 v = PyCodec_Decode(unicode, encoding, errors);
1169 if (v == NULL)
1170 goto onError;
1171 return v;
1172
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001173 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001174 return NULL;
1175}
1176
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001178 Py_ssize_t size,
1179 const char *encoding,
1180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181{
1182 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001183
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 unicode = PyUnicode_FromUnicode(s, size);
1185 if (unicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1188 Py_DECREF(unicode);
1189 return v;
1190}
1191
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001192PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1193 const char *encoding,
1194 const char *errors)
1195{
1196 PyObject *v;
1197
1198 if (!PyUnicode_Check(unicode)) {
1199 PyErr_BadArgument();
1200 goto onError;
1201 }
1202
1203 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001204 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001205
1206 /* Encode via the codec registry */
1207 v = PyCodec_Encode(unicode, encoding, errors);
1208 if (v == NULL)
1209 goto onError;
1210 return v;
1211
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001212 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001213 return NULL;
1214}
1215
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1217 const char *encoding,
1218 const char *errors)
1219{
1220 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 goto onError;
1225 }
Fred Drakee4315f52000-05-09 19:53:39 +00001226
Tim Petersced69f82003-09-16 20:30:58 +00001227 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001228 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001229
1230 /* Shortcuts for common default encodings */
1231 if (errors == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001232 if (strcmp(encoding, "utf-8") == 0)
1233 return PyUnicode_AsUTF8String(unicode);
1234 else if (strcmp(encoding, "latin-1") == 0)
1235 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001236#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001237 else if (strcmp(encoding, "mbcs") == 0)
1238 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001239#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001240 else if (strcmp(encoding, "ascii") == 0)
1241 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 /* Encode via the codec registry */
1245 v = PyCodec_Encode(unicode, encoding, errors);
1246 if (v == NULL)
1247 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001248 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001250 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001251 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 Py_DECREF(v);
1253 goto onError;
1254 }
1255 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001256
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001257 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 return NULL;
1259}
1260
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001261PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001262 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001263{
1264 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1265
1266 if (v)
1267 return v;
1268 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1269 if (v && errors == NULL)
1270 ((PyUnicodeObject *)unicode)->defenc = v;
1271 return v;
1272}
1273
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1275{
1276 if (!PyUnicode_Check(unicode)) {
1277 PyErr_BadArgument();
1278 goto onError;
1279 }
1280 return PyUnicode_AS_UNICODE(unicode);
1281
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001282 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 return NULL;
1284}
1285
Martin v. Löwis18e16552006-02-15 17:27:45 +00001286Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 if (!PyUnicode_Check(unicode)) {
1289 PyErr_BadArgument();
1290 goto onError;
1291 }
1292 return PyUnicode_GET_SIZE(unicode);
1293
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 return -1;
1296}
1297
Thomas Wouters78890102000-07-22 19:25:51 +00001298const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001299{
1300 return unicode_default_encoding;
1301}
1302
1303int PyUnicode_SetDefaultEncoding(const char *encoding)
1304{
1305 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001306
Fred Drakee4315f52000-05-09 19:53:39 +00001307 /* Make sure the encoding is valid. As side effect, this also
1308 loads the encoding into the codec registry cache. */
1309 v = _PyCodec_Lookup(encoding);
1310 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001311 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001312 Py_DECREF(v);
1313 strncpy(unicode_default_encoding,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001314 encoding,
1315 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001316 return 0;
1317
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001318 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001319 return -1;
1320}
1321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001322/* error handling callback helper:
1323 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001324 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001325 and adjust various state variables.
1326 return 0 on success, -1 on error
1327*/
1328
1329static
1330int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001331 const char *encoding, const char *reason,
1332 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1333 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1334 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337
1338 PyObject *restuple = NULL;
1339 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001340 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1341 Py_ssize_t requiredsize;
1342 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001344 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345 int res = -1;
1346
1347 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001348 *errorHandler = PyCodec_LookupError(errors);
1349 if (*errorHandler == NULL)
1350 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 }
1352
1353 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001354 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001355 encoding, input, insize, *startinpos, *endinpos, reason);
1356 if (*exceptionObject == NULL)
1357 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 }
1359 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001360 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1361 goto onError;
1362 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1363 goto onError;
1364 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1365 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 }
1367
1368 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1369 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001370 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00001372 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 }
1375 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001376 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 if (newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001378 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001379 if (newpos<0 || newpos>insize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001380 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1381 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383
1384 /* need more space? (at least enough for what we
1385 have+the replacement+the rest of the string (starting
1386 at the new input position), so we won't have to check space
1387 when there are no errors in the rest of the string) */
1388 repptr = PyUnicode_AS_UNICODE(repunicode);
1389 repsize = PyUnicode_GET_SIZE(repunicode);
1390 requiredsize = *outpos + repsize + insize-newpos;
1391 if (requiredsize > outsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001392 if (requiredsize<2*outsize)
1393 requiredsize = 2*outsize;
1394 if (_PyUnicode_Resize(output, requiredsize) < 0)
1395 goto onError;
1396 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 }
1398 *endinpos = newpos;
1399 *inptr = input + newpos;
1400 Py_UNICODE_COPY(*outptr, repptr, repsize);
1401 *outptr += repsize;
1402 *outpos += repsize;
1403 /* we made it! */
1404 res = 0;
1405
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407 Py_XDECREF(restuple);
1408 return res;
1409}
1410
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001411/* --- UTF-7 Codec -------------------------------------------------------- */
1412
1413/* see RFC2152 for details */
1414
static
char utf7_special[128] = {
    /* Classification of each 7-bit character for the UTF-7 codec,
       i.e. whether it is special and cannot be directly encoded:
         0 - not special
         1 - special
         2 - whitespace (optional)
         3 - RFC2152 Set O (optional)
       One row per 16 consecutive character codes. */
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1433
/* SPECIAL(c, encodeO, encodeWS): true when character c must not be
   emitted directly.  That is the case for anything outside 1..127, for
   table class 1, for class 2 when encodeWS is set and for class 3 when
   encodeO is set (classes as stored in utf7_special[]).

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     (encodeWS && (utf7_special[(c)] == 2)) ||                  \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is alphanumeric (per isalnum()), '+' or '/'. */
#define B64CHAR(c)                                              \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c.  The arithmetic maps
   'a'.. to 26.., 'A'.. to 0.. and the remaining case (digits) to 52..;
   c is not validated here. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the base64 character for the topmost pending six bits through
   out (advancing it) and reduce bits by six. */
#define ENCODE(out, ch, bits)                                   \
    while (bits >= 6) {                                         \
        *out++ = B64(ch >> (bits-6));                           \
        bits -= 6;                                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one Py_UNICODE and reduce
   bits by sixteen.
     - If surrogate is set, the unit is dropped and the flag cleared:
       an error has already been generated for the high surrogate, so
       the low surrogate is not checked.
     - If the unit lies in 0xDC00..0xDFFF it is treated as part of a
       surrogate pair, which cannot be represented in a 16-bit
       character: surrogate is set, errmsg is assigned and control
       jumps to utf7Error.
     - Otherwise the unit is stored through out (advancing it).
   The expansion site must provide an `errmsg` variable and a
   `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001478 Py_ssize_t size,
1479 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001481 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1482}
1483
/* Decode `size` bytes of UTF-7 data starting at `s`.
 *
 *   s        - input bytes
 *   size     - number of input bytes
 *   errors   - error handling name, passed through to
 *              unicode_decode_call_errorhandler()
 *   consumed - may be NULL.  When non-NULL it receives the number of input
 *              bytes that were processed; if the input stops inside a shift
 *              sequence, that is the offset of the '+' which opened the
 *              sequence and no "unterminated shift sequence" error is
 *              raised.
 *
 * Returns the decoded unicode object, or NULL if the allocation fails, the
 * error handler call reports failure, or the final resize fails.
 *
 * The result is allocated with _PyUnicode_New(size) up front and resized to
 * the number of characters actually written at the end.
 */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;      /* set when a '+' or a special char is met */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;              /* end of input */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;              /* next free slot in the output */
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a '+...' sequence */
    unsigned int bitsleft = 0;  /* pending bits held in charsleft */
    unsigned long charsleft = 0;/* bit buffer for the base64 payload */
    int surrogate = 0;          /* state flag maintained by DECODE() */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below has moved s back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* Any non-base64 byte (including '-') ends the sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself produces no output.  If
                       another '-' follows, a '-' is emitted and shift mode
                       is re-entered without consuming that second byte. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* The terminating byte is ordinary data: copy it. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: append its six bits to the buffer and emit
                   any complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts for error reporting and
               for the `consumed` result. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
        /* Error path: reached only through goto.  The handler receives
           pointers to s, unicode, outpos and p and may update them. */
      utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* Input exhausted while still in shift mode: this is an error only when
       the caller did not ask for the consumed count. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        /* The handler may have repositioned s; resume decoding if so. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the result to the number of characters written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1629
1630
1631PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001632 Py_ssize_t size,
1633 int encodeSetO,
1634 int encodeWhiteSpace,
1635 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636{
1637 PyObject *v;
1638 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001639 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 unsigned int bitsleft = 0;
1643 unsigned long charsleft = 0;
1644 char * out;
1645 char * start;
1646
Neal Norwitze7d8be82008-07-31 17:17:14 +00001647 if (cbAllocated / 5 != size)
1648 return PyErr_NoMemory();
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 if (size == 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001651 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001653 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 if (v == NULL)
1655 return NULL;
1656
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001657 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 for (;i < size; ++i) {
1659 Py_UNICODE ch = s[i];
1660
1661 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001662 if (ch == '+') {
1663 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 *out++ = '-';
1665 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1666 charsleft = ch;
1667 bitsleft = 16;
1668 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001669 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001671 } else {
1672 *out++ = (char) ch;
1673 }
1674 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1676 *out++ = B64(charsleft << (6-bitsleft));
1677 charsleft = 0;
1678 bitsleft = 0;
1679 /* Characters not in the BASE64 set implicitly unshift the sequence
1680 so no '-' is required, except if the character is itself a '-' */
1681 if (B64CHAR(ch) || ch == '-') {
1682 *out++ = '-';
1683 }
1684 inShift = 0;
1685 *out++ = (char) ch;
1686 } else {
1687 bitsleft += 16;
1688 charsleft = (charsleft << 16) | ch;
1689 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1690
Jesus Cea585ad8a2009-07-02 15:37:21 +00001691 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001692 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001693 or '-' then the shift sequence will be terminated implicitly and we
1694 don't have to insert a '-'. */
1695
1696 if (bitsleft == 0) {
1697 if (i + 1 < size) {
1698 Py_UNICODE ch2 = s[i+1];
1699
1700 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001701
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 } else if (B64CHAR(ch2) || ch2 == '-') {
1703 *out++ = '-';
1704 inShift = 0;
1705 } else {
1706 inShift = 0;
1707 }
1708
1709 }
1710 else {
1711 *out++ = '-';
1712 inShift = 0;
1713 }
1714 }
Tim Petersced69f82003-09-16 20:30:58 +00001715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001717 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 if (bitsleft) {
1719 *out++= B64(charsleft << (6-bitsleft) );
1720 *out++ = '-';
1721 }
1722
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001723 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 return v;
1725}
1726
/* Undefine the helper macros used by the UTF-7 encoder and decoder above. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1733
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734/* --- UTF-8 Codec -------------------------------------------------------- */
1735
/* Map a UTF-8 lead byte (used as the index) to the total length in bytes of
   the sequence it introduces.  Zero means the byte is not a legal prefix.
   Lengths 5 and 6 are the long forms described by RFC 2279.  The table is
   only ever read, hence const. */
static const
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1757
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001759 Py_ssize_t size,
1760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761{
Walter Dörwald69652032004-09-07 20:24:22 +00001762 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1763}
1764
1765PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001766 Py_ssize_t size,
1767 const char *errors,
1768 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t startinpos;
1773 Py_ssize_t endinpos;
1774 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 const char *e;
1776 PyUnicodeObject *unicode;
1777 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 PyObject *errorHandler = NULL;
1780 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781
1782 /* Note: size will always be longer than the resulting Unicode
1783 character count */
1784 unicode = _PyUnicode_New(size);
1785 if (!unicode)
1786 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001787 if (size == 0) {
1788 if (consumed)
1789 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792
1793 /* Unpack UTF-8 encoded data */
1794 p = unicode->str;
1795 e = s + size;
1796
1797 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001798 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
1800 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001801 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 s++;
1803 continue;
1804 }
1805
1806 n = utf8_code_length[ch];
1807
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 if (s + n > e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001809 if (consumed)
1810 break;
1811 else {
1812 errmsg = "unexpected end of data";
1813 startinpos = s-starts;
1814 endinpos = size;
1815 goto utf8Error;
1816 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818
1819 switch (n) {
1820
1821 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001822 errmsg = "unexpected code byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001823 startinpos = s-starts;
1824 endinpos = startinpos+1;
1825 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
1827 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001828 errmsg = "internal error";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001829 startinpos = s-starts;
1830 endinpos = startinpos+1;
1831 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832
1833 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 if ((s[1] & 0xc0) != 0x80) {
1835 errmsg = "invalid data";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001836 startinpos = s-starts;
1837 endinpos = startinpos+2;
1838 goto utf8Error;
1839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 if (ch < 0x80) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001842 startinpos = s-starts;
1843 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "illegal encoding";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001845 goto utf8Error;
1846 }
1847 else
1848 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 break;
1850
1851 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001852 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 (s[2] & 0xc0) != 0x80) {
1854 errmsg = "invalid data";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001855 startinpos = s-starts;
1856 endinpos = startinpos+3;
1857 goto utf8Error;
1858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001860 if (ch < 0x0800) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001861 /* Note: UTF-8 encodings of surrogates are considered
1862 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001863
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001864 XXX For wide builds (UCS-4) we should probably try
1865 to recombine the surrogates into a single code
1866 unit.
1867 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001868 errmsg = "illegal encoding";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001869 startinpos = s-starts;
1870 endinpos = startinpos+3;
1871 goto utf8Error;
1872 }
1873 else
1874 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001875 break;
1876
1877 case 4:
1878 if ((s[1] & 0xc0) != 0x80 ||
1879 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 (s[3] & 0xc0) != 0x80) {
1881 errmsg = "invalid data";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001882 startinpos = s-starts;
1883 endinpos = startinpos+4;
1884 goto utf8Error;
1885 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001886 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001887 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001888 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001889 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001890 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001891 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001892 UTF-16 */
1893 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001894 errmsg = "illegal encoding";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001895 startinpos = s-starts;
1896 endinpos = startinpos+4;
1897 goto utf8Error;
1898 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001899#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001900 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001903
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001904 /* translate from 10000..10FFFF to 0..FFFF */
1905 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001906
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001907 /* high surrogate = top 10 bits added to D800 */
1908 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001909
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001910 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 break;
1914
1915 default:
1916 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001917 errmsg = "unsupported Unicode code range";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001918 startinpos = s-starts;
1919 endinpos = startinpos+n;
1920 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 }
1922 s += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001923 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001924
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001925 utf8Error:
1926 outpos = p-PyUnicode_AS_UNICODE(unicode);
1927 if (unicode_decode_call_errorhandler(
1928 errors, &errorHandler,
1929 "utf8", errmsg,
1930 starts, size, &startinpos, &endinpos, &exc, &s,
1931 &unicode, &outpos, &p))
1932 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
Walter Dörwald69652032004-09-07 20:24:22 +00001934 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001935 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936
1937 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001938 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 goto onError;
1940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 Py_XDECREF(errorHandler);
1942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 return (PyObject *)unicode;
1944
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001945 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 Py_DECREF(unicode);
1949 return NULL;
1950}
1951
Tim Peters602f7402002-04-27 18:03:26 +00001952/* Allocation strategy: if the string is short, convert into a stack buffer
1953 and allocate exactly as much space needed at the end. Else allocate the
1954 maximum possible needed (4 result bytes per Unicode character), and return
1955 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001956*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001957PyObject *
1958PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001959 Py_ssize_t size,
1960 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961{
Tim Peters602f7402002-04-27 18:03:26 +00001962#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001963
Martin v. Löwis18e16552006-02-15 17:27:45 +00001964 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001965 PyObject *v; /* result string object */
1966 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001968 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001969 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001970
Tim Peters602f7402002-04-27 18:03:26 +00001971 assert(s != NULL);
1972 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
Tim Peters602f7402002-04-27 18:03:26 +00001974 if (size <= MAX_SHORT_UNICHARS) {
1975 /* Write into the stack buffer; nallocated can't overflow.
1976 * At the end, we'll allocate exactly as much heap space as it
1977 * turns out we need.
1978 */
1979 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1980 v = NULL; /* will allocate after we're done */
1981 p = stackbuf;
1982 }
1983 else {
1984 /* Overallocate on the heap, and give the excess back at the end. */
1985 nallocated = size * 4;
1986 if (nallocated / 4 != size) /* overflow! */
1987 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001988 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001989 if (v == NULL)
1990 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001991 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001992 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001993
Tim Peters602f7402002-04-27 18:03:26 +00001994 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001995 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001996
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001997 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001998 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002000
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002002 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002003 *p++ = (char)(0xc0 | (ch >> 6));
2004 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002005 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002006 else {
Tim Peters602f7402002-04-27 18:03:26 +00002007 /* Encode UCS2 Unicode ordinals */
2008 if (ch < 0x10000) {
2009 /* Special case: check for high surrogate */
2010 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2011 Py_UCS4 ch2 = s[i];
2012 /* Check for low surrogate and combine the two to
2013 form a UCS4 value */
2014 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002015 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002016 i++;
2017 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002018 }
Tim Peters602f7402002-04-27 18:03:26 +00002019 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002020 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002021 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002022 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2023 *p++ = (char)(0x80 | (ch & 0x3f));
2024 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002025 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002026 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002027 /* Encode UCS4 Unicode ordinals */
2028 *p++ = (char)(0xf0 | (ch >> 18));
2029 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2030 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2031 *p++ = (char)(0x80 | (ch & 0x3f));
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002034
Tim Peters602f7402002-04-27 18:03:26 +00002035 if (v == NULL) {
2036 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002037 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002038 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002039 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002040 }
2041 else {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002042 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002043 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002044 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002045 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002048
Tim Peters602f7402002-04-27 18:03:26 +00002049#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050}
2051
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2053{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 if (!PyUnicode_Check(unicode)) {
2055 PyErr_BadArgument();
2056 return NULL;
2057 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002058 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002059 PyUnicode_GET_SIZE(unicode),
2060 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061}
2062
Walter Dörwald6e390802007-08-17 16:41:28 +00002063/* --- UTF-32 Codec ------------------------------------------------------- */
2064
2065PyObject *
2066PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002067 Py_ssize_t size,
2068 const char *errors,
2069 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002070{
2071 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2072}
2073
2074PyObject *
2075PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002076 Py_ssize_t size,
2077 const char *errors,
2078 int *byteorder,
2079 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002080{
2081 const char *starts = s;
2082 Py_ssize_t startinpos;
2083 Py_ssize_t endinpos;
2084 Py_ssize_t outpos;
2085 PyUnicodeObject *unicode;
2086 Py_UNICODE *p;
2087#ifndef Py_UNICODE_WIDE
Antoine Pitrou4595e512010-06-11 21:48:02 +00002088 int pairs = 0;
Walter Dörwald6e390802007-08-17 16:41:28 +00002089#else
2090 const int pairs = 0;
2091#endif
Antoine Pitrou4595e512010-06-11 21:48:02 +00002092 const unsigned char *q, *e, *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002093 int bo = 0; /* assume native ordering by default */
2094 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002095 /* Offsets from q for retrieving bytes in the right order. */
2096#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2097 int iorder[] = {0, 1, 2, 3};
2098#else
2099 int iorder[] = {3, 2, 1, 0};
2100#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002101 PyObject *errorHandler = NULL;
2102 PyObject *exc = NULL;
Antoine Pitrou4595e512010-06-11 21:48:02 +00002103
Walter Dörwald6e390802007-08-17 16:41:28 +00002104 q = (unsigned char *)s;
2105 e = q + size;
2106
2107 if (byteorder)
2108 bo = *byteorder;
2109
2110 /* Check for BOM marks (U+FEFF) in the input and adjust current
2111 byte order setting accordingly. In native mode, the leading BOM
2112 mark is skipped, in all other modes, it is copied to the output
2113 stream as-is (giving a ZWNBSP character). */
2114 if (bo == 0) {
2115 if (size >= 4) {
2116 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002117 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002118#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002119 if (bom == 0x0000FEFF) {
2120 q += 4;
2121 bo = -1;
2122 }
2123 else if (bom == 0xFFFE0000) {
2124 q += 4;
2125 bo = 1;
2126 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002127#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002128 if (bom == 0x0000FEFF) {
2129 q += 4;
2130 bo = 1;
2131 }
2132 else if (bom == 0xFFFE0000) {
2133 q += 4;
2134 bo = -1;
2135 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002136#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002137 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002138 }
2139
2140 if (bo == -1) {
2141 /* force LE */
2142 iorder[0] = 0;
2143 iorder[1] = 1;
2144 iorder[2] = 2;
2145 iorder[3] = 3;
2146 }
2147 else if (bo == 1) {
2148 /* force BE */
2149 iorder[0] = 3;
2150 iorder[1] = 2;
2151 iorder[2] = 1;
2152 iorder[3] = 0;
2153 }
2154
Antoine Pitrou4595e512010-06-11 21:48:02 +00002155 /* On narrow builds we split characters outside the BMP into two
2156 codepoints => count how much extra space we need. */
2157#ifndef Py_UNICODE_WIDE
2158 for (qq = q; qq < e; qq += 4)
2159 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2160 pairs++;
2161#endif
2162
2163 /* This might be one to much, because of a BOM */
2164 unicode = _PyUnicode_New((size+3)/4+pairs);
2165 if (!unicode)
2166 return NULL;
2167 if (size == 0)
2168 return (PyObject *)unicode;
2169
2170 /* Unpack UTF-32 encoded data */
2171 p = unicode->str;
2172
Walter Dörwald6e390802007-08-17 16:41:28 +00002173 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002174 Py_UCS4 ch;
2175 /* remaining bytes at the end? (size should be divisible by 4) */
2176 if (e-q<4) {
2177 if (consumed)
2178 break;
2179 errmsg = "truncated data";
2180 startinpos = ((const char *)q)-starts;
2181 endinpos = ((const char *)e)-starts;
2182 goto utf32Error;
2183 /* The remaining input chars are ignored if the callback
2184 chooses to skip the input */
2185 }
2186 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2187 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002188
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002189 if (ch >= 0x110000)
2190 {
2191 errmsg = "codepoint not in range(0x110000)";
2192 startinpos = ((const char *)q)-starts;
2193 endinpos = startinpos+4;
2194 goto utf32Error;
2195 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002196#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002197 if (ch >= 0x10000)
2198 {
2199 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2200 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2201 }
2202 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002203#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002204 *p++ = ch;
2205 q += 4;
2206 continue;
2207 utf32Error:
2208 outpos = p-PyUnicode_AS_UNICODE(unicode);
2209 if (unicode_decode_call_errorhandler(
2210 errors, &errorHandler,
2211 "utf32", errmsg,
Georg Brandlf7a09be2009-09-17 11:33:31 +00002212 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002213 &unicode, &outpos, &p))
2214 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002215 }
2216
2217 if (byteorder)
2218 *byteorder = bo;
2219
2220 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002221 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002222
2223 /* Adjust length */
2224 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2225 goto onError;
2226
2227 Py_XDECREF(errorHandler);
2228 Py_XDECREF(exc);
2229 return (PyObject *)unicode;
2230
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002231 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002232 Py_DECREF(unicode);
2233 Py_XDECREF(errorHandler);
2234 Py_XDECREF(exc);
2235 return NULL;
2236}
2237
2238PyObject *
2239PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002240 Py_ssize_t size,
2241 const char *errors,
2242 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002243{
2244 PyObject *v;
2245 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002246 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002247#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002248 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002249#else
2250 const int pairs = 0;
2251#endif
2252 /* Offsets from p for storing byte pairs in the right order. */
2253#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2254 int iorder[] = {0, 1, 2, 3};
2255#else
2256 int iorder[] = {3, 2, 1, 0};
2257#endif
2258
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002259#define STORECHAR(CH) \
2260 do { \
2261 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2262 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2263 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2264 p[iorder[0]] = (CH) & 0xff; \
2265 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002266 } while(0)
2267
2268 /* In narrow builds we can output surrogate pairs as one codepoint,
2269 so we need less space. */
2270#ifndef Py_UNICODE_WIDE
2271 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002272 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2273 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2274 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002275#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002276 nsize = (size - pairs + (byteorder == 0));
2277 bytesize = nsize * 4;
2278 if (bytesize / 4 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002279 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002280 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002281 if (v == NULL)
2282 return NULL;
2283
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002284 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002285 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002286 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002287 if (size == 0)
2288 return v;
2289
2290 if (byteorder == -1) {
2291 /* force LE */
2292 iorder[0] = 0;
2293 iorder[1] = 1;
2294 iorder[2] = 2;
2295 iorder[3] = 3;
2296 }
2297 else if (byteorder == 1) {
2298 /* force BE */
2299 iorder[0] = 3;
2300 iorder[1] = 2;
2301 iorder[2] = 1;
2302 iorder[3] = 0;
2303 }
2304
2305 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002306 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002307#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002308 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2309 Py_UCS4 ch2 = *s;
2310 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2311 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2312 s++;
2313 size--;
2314 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002315 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002316#endif
2317 STORECHAR(ch);
2318 }
2319 return v;
2320#undef STORECHAR
2321}
2322
2323PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2324{
2325 if (!PyUnicode_Check(unicode)) {
2326 PyErr_BadArgument();
2327 return NULL;
2328 }
2329 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002330 PyUnicode_GET_SIZE(unicode),
2331 NULL,
2332 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002333}
2334
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335/* --- UTF-16 Codec ------------------------------------------------------- */
2336
Tim Peters772747b2001-08-09 22:21:55 +00002337PyObject *
2338PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002339 Py_ssize_t size,
2340 const char *errors,
2341 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342{
Walter Dörwald69652032004-09-07 20:24:22 +00002343 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2344}
2345
2346PyObject *
2347PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002348 Py_ssize_t size,
2349 const char *errors,
2350 int *byteorder,
2351 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002354 Py_ssize_t startinpos;
2355 Py_ssize_t endinpos;
2356 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 PyUnicodeObject *unicode;
2358 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002359 const unsigned char *q, *e;
2360 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002361 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002362 /* Offsets from q for retrieving byte pairs in the right order. */
2363#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2364 int ihi = 1, ilo = 0;
2365#else
2366 int ihi = 0, ilo = 1;
2367#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002368 PyObject *errorHandler = NULL;
2369 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370
2371 /* Note: size will always be longer than the resulting Unicode
2372 character count */
2373 unicode = _PyUnicode_New(size);
2374 if (!unicode)
2375 return NULL;
2376 if (size == 0)
2377 return (PyObject *)unicode;
2378
2379 /* Unpack UTF-16 encoded data */
2380 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002381 q = (unsigned char *)s;
2382 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383
2384 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002385 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002387 /* Check for BOM marks (U+FEFF) in the input and adjust current
2388 byte order setting accordingly. In native mode, the leading BOM
2389 mark is skipped, in all other modes, it is copied to the output
2390 stream as-is (giving a ZWNBSP character). */
2391 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002392 if (size >= 2) {
2393 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002394#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002395 if (bom == 0xFEFF) {
2396 q += 2;
2397 bo = -1;
2398 }
2399 else if (bom == 0xFFFE) {
2400 q += 2;
2401 bo = 1;
2402 }
Tim Petersced69f82003-09-16 20:30:58 +00002403#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002404 if (bom == 0xFEFF) {
2405 q += 2;
2406 bo = 1;
2407 }
2408 else if (bom == 0xFFFE) {
2409 q += 2;
2410 bo = -1;
2411 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002412#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002413 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415
Tim Peters772747b2001-08-09 22:21:55 +00002416 if (bo == -1) {
2417 /* force LE */
2418 ihi = 1;
2419 ilo = 0;
2420 }
2421 else if (bo == 1) {
2422 /* force BE */
2423 ihi = 0;
2424 ilo = 1;
2425 }
2426
2427 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002428 Py_UNICODE ch;
2429 /* remaining bytes at the end? (size should be even) */
2430 if (e-q<2) {
2431 if (consumed)
2432 break;
2433 errmsg = "truncated data";
2434 startinpos = ((const char *)q)-starts;
2435 endinpos = ((const char *)e)-starts;
2436 goto utf16Error;
2437 /* The remaining input chars are ignored if the callback
2438 chooses to skip the input */
2439 }
2440 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002441
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002442 q += 2;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002443
2444 if (ch < 0xD800 || ch > 0xDFFF) {
2445 *p++ = ch;
2446 continue;
2447 }
2448
2449 /* UTF-16 code pair: */
2450 if (q >= e) {
2451 errmsg = "unexpected end of data";
2452 startinpos = (((const char *)q)-2)-starts;
2453 endinpos = ((const char *)e)-starts;
2454 goto utf16Error;
2455 }
2456 if (0xD800 <= ch && ch <= 0xDBFF) {
2457 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2458 q += 2;
2459 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002460#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002461 *p++ = ch;
2462 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002463#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002464 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002465#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002466 continue;
2467 }
2468 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002469 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002470 startinpos = (((const char *)q)-4)-starts;
2471 endinpos = startinpos+2;
2472 goto utf16Error;
2473 }
2474
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002475 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002476 errmsg = "illegal encoding";
2477 startinpos = (((const char *)q)-2)-starts;
2478 endinpos = startinpos+2;
2479 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002480
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002481 utf16Error:
2482 outpos = p-PyUnicode_AS_UNICODE(unicode);
2483 if (unicode_decode_call_errorhandler(
2484 errors, &errorHandler,
2485 "utf16", errmsg,
2486 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2487 &unicode, &outpos, &p))
2488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 }
2490
2491 if (byteorder)
2492 *byteorder = bo;
2493
Walter Dörwald69652032004-09-07 20:24:22 +00002494 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002495 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002496
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002498 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 goto onError;
2500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 return (PyObject *)unicode;
2504
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002505 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 Py_XDECREF(errorHandler);
2508 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 return NULL;
2510}
2511
Tim Peters772747b2001-08-09 22:21:55 +00002512PyObject *
2513PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002514 Py_ssize_t size,
2515 const char *errors,
2516 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517{
2518 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002519 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002520 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002521#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002522 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002523#else
2524 const int pairs = 0;
2525#endif
Tim Peters772747b2001-08-09 22:21:55 +00002526 /* Offsets from p for storing byte pairs in the right order. */
2527#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2528 int ihi = 1, ilo = 0;
2529#else
2530 int ihi = 0, ilo = 1;
2531#endif
2532
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002533#define STORECHAR(CH) \
2534 do { \
2535 p[ihi] = ((CH) >> 8) & 0xff; \
2536 p[ilo] = (CH) & 0xff; \
2537 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002538 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002540#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002541 for (i = pairs = 0; i < size; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002542 if (s[i] >= 0x10000)
2543 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002544#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002545 /* 2 * (size + pairs + (byteorder == 0)) */
2546 if (size > PY_SSIZE_T_MAX ||
2547 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002548 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002549 nsize = size + pairs + (byteorder == 0);
2550 bytesize = nsize * 2;
2551 if (bytesize / 2 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002552 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002553 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 if (v == NULL)
2555 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002557 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002559 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002560 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002561 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002562
2563 if (byteorder == -1) {
2564 /* force LE */
2565 ihi = 1;
2566 ilo = 0;
2567 }
2568 else if (byteorder == 1) {
2569 /* force BE */
2570 ihi = 0;
2571 ilo = 1;
2572 }
2573
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002575 Py_UNICODE ch = *s++;
2576 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002577#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002578 if (ch >= 0x10000) {
2579 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2580 ch = 0xD800 | ((ch-0x10000) >> 10);
2581 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002582#endif
Tim Peters772747b2001-08-09 22:21:55 +00002583 STORECHAR(ch);
2584 if (ch2)
2585 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002588#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589}
2590
2591PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2592{
2593 if (!PyUnicode_Check(unicode)) {
2594 PyErr_BadArgument();
2595 return NULL;
2596 }
2597 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002598 PyUnicode_GET_SIZE(unicode),
2599 NULL,
2600 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601}
2602
2603/* --- Unicode Escape Codec ----------------------------------------------- */
2604
Fredrik Lundh06d12682001-01-24 07:59:11 +00002605static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002606
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002608 Py_ssize_t size,
2609 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002612 Py_ssize_t startinpos;
2613 Py_ssize_t endinpos;
2614 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002619 char* message;
2620 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 PyObject *errorHandler = NULL;
2622 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002623
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 /* Escaped strings will always be longer than the resulting
2625 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 length after conversion to the true value.
2627 (but if the error callback returns a long replacement string
2628 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 v = _PyUnicode_New(size);
2630 if (v == NULL)
2631 goto onError;
2632 if (size == 0)
2633 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002637
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 while (s < end) {
2639 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002640 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002641 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642
2643 /* Non-escape characters are interpreted as Unicode ordinals */
2644 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002645 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 continue;
2647 }
2648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 /* \ - Escapes */
2651 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002652 c = *s++;
2653 if (s > end)
2654 c = '\0'; /* Invalid after \ */
2655 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002657 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 case '\n': break;
2659 case '\\': *p++ = '\\'; break;
2660 case '\'': *p++ = '\''; break;
2661 case '\"': *p++ = '\"'; break;
2662 case 'b': *p++ = '\b'; break;
2663 case 'f': *p++ = '\014'; break; /* FF */
2664 case 't': *p++ = '\t'; break;
2665 case 'n': *p++ = '\n'; break;
2666 case 'r': *p++ = '\r'; break;
2667 case 'v': *p++ = '\013'; break; /* VT */
2668 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2669
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002670 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 case '0': case '1': case '2': case '3':
2672 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002673 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002674 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002675 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002676 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002677 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002679 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 break;
2681
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002682 /* hex escapes */
2683 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002685 digits = 2;
2686 message = "truncated \\xXX escape";
2687 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002689 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002691 digits = 4;
2692 message = "truncated \\uXXXX escape";
2693 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002695 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002696 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002697 digits = 8;
2698 message = "truncated \\UXXXXXXXX escape";
2699 hexescape:
2700 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 outpos = p-PyUnicode_AS_UNICODE(v);
2702 if (s+digits>end) {
2703 endinpos = size;
2704 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002705 errors, &errorHandler,
2706 "unicodeescape", "end of string in escape sequence",
2707 starts, size, &startinpos, &endinpos, &exc, &s,
2708 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 goto onError;
2710 goto nextByte;
2711 }
2712 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002713 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002714 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 endinpos = (s+i+1)-starts;
2716 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002717 errors, &errorHandler,
2718 "unicodeescape", message,
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002723 }
2724 chr = (chr<<4) & ~0xF;
2725 if (c >= '0' && c <= '9')
2726 chr += c - '0';
2727 else if (c >= 'a' && c <= 'f')
2728 chr += 10 + c - 'a';
2729 else
2730 chr += 10 + c - 'A';
2731 }
2732 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002733 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 /* _decoding_error will have already written into the
2735 target buffer. */
2736 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002737 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002738 /* when we get here, chr is a 32-bit unicode character */
2739 if (chr <= 0xffff)
2740 /* UCS-2 character */
2741 *p++ = (Py_UNICODE) chr;
2742 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002743 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002744 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002745#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002746 *p++ = chr;
2747#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002748 chr -= 0x10000L;
2749 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002750 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002751#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 endinpos = s-starts;
2754 outpos = p-PyUnicode_AS_UNICODE(v);
2755 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002756 errors, &errorHandler,
2757 "unicodeescape", "illegal Unicode character",
2758 starts, size, &startinpos, &endinpos, &exc, &s,
2759 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002760 goto onError;
2761 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 break;
2763
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002764 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 case 'N':
2766 message = "malformed \\N character escape";
2767 if (ucnhash_CAPI == NULL) {
2768 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002769 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002770 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 if (m == NULL)
2772 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002773 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002774 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002775 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002776 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002777 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002778 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002779 if (ucnhash_CAPI == NULL)
2780 goto ucnhashError;
2781 }
2782 if (*s == '{') {
2783 const char *start = s+1;
2784 /* look for the closing brace */
2785 while (*s != '}' && s < end)
2786 s++;
2787 if (s > start && s < end && *s == '}') {
2788 /* found a name. look it up in the unicode database */
2789 message = "unknown Unicode character name";
2790 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002791 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 goto store;
2793 }
2794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002795 endinpos = s-starts;
2796 outpos = p-PyUnicode_AS_UNICODE(v);
2797 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002798 errors, &errorHandler,
2799 "unicodeescape", message,
2800 starts, size, &startinpos, &endinpos, &exc, &s,
2801 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002802 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002803 break;
2804
2805 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002806 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 message = "\\ at end of string";
2808 s--;
2809 endinpos = s-starts;
2810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002812 errors, &errorHandler,
2813 "unicodeescape", message,
2814 starts, size, &startinpos, &endinpos, &exc, &s,
2815 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002816 goto onError;
2817 }
2818 else {
2819 *p++ = '\\';
2820 *p++ = (unsigned char)s[-1];
2821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002824 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002827 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002829 Py_XDECREF(errorHandler);
2830 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002832
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002833 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002834 PyErr_SetString(
2835 PyExc_UnicodeError,
2836 "\\N escapes not supported (can't load unicodedata module)"
2837 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839 Py_XDECREF(errorHandler);
2840 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002841 return NULL;
2842
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 Py_XDECREF(errorHandler);
2846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 return NULL;
2848}
2849
2850/* Return a Unicode-Escape string version of the Unicode object.
2851
2852 If quotes is true, the string is enclosed in u"" or u'' quotes as
2853 appropriate.
2854
2855*/
2856
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002857Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002858 Py_ssize_t size,
2859 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002860{
2861 /* like wcschr, but doesn't stop at NULL characters */
2862
2863 while (size-- > 0) {
2864 if (*s == ch)
2865 return s;
2866 s++;
2867 }
2868
2869 return NULL;
2870}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002871
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872static
2873PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002874 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 int quotes)
2876{
2877 PyObject *repr;
2878 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002880 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002881#ifdef Py_UNICODE_WIDE
2882 const Py_ssize_t expandsize = 10;
2883#else
2884 const Py_ssize_t expandsize = 6;
2885#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
Neal Norwitz17753ec2006-08-21 22:21:19 +00002887 /* XXX(nnorwitz): rather than over-allocating, it would be
2888 better to choose a different scheme. Perhaps scan the
2889 first N-chars of the string and allocate based on that size.
2890 */
2891 /* Initial allocation is based on the longest-possible unichr
2892 escape.
2893
2894 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2895 unichr, so in this case it's the longest unichr escape. In
2896 narrow (UTF-16) builds this is five chars per source unichr
2897 since there are two unichrs in the surrogate pair, so in narrow
2898 (UTF-16) builds it's not the longest unichr escape.
2899
2900 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2901 so in the narrow (UTF-16) build case it's the longest unichr
2902 escape.
2903 */
2904
Neal Norwitze7d8be82008-07-31 17:17:14 +00002905 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002906 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002907
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002908 repr = PyString_FromStringAndSize(NULL,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002909 2
2910 + expandsize*size
2911 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 if (repr == NULL)
2913 return NULL;
2914
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002915 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916
2917 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002919 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 !findchar(s, size, '"')) ? '"' : '\'';
2921 }
2922 while (size-- > 0) {
2923 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002924
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002925 /* Escape quotes and backslashes */
2926 if ((quotes &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002927 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 *p++ = '\\';
2929 *p++ = (char) ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002930 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002931 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002932
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002933#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002934 /* Map 21-bit characters to '\U00xxxxxx' */
2935 else if (ch >= 0x10000) {
2936 *p++ = '\\';
2937 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002938 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2939 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2940 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2941 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2942 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2943 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2944 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002945 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002946 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002947 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002948#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002949 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2950 else if (ch >= 0xD800 && ch < 0xDC00) {
2951 Py_UNICODE ch2;
2952 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002953
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002954 ch2 = *s++;
2955 size--;
2956 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2957 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2958 *p++ = '\\';
2959 *p++ = 'U';
2960 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2961 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2962 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2963 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2964 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2965 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2966 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2967 *p++ = hexdigit[ucs & 0x0000000F];
2968 continue;
2969 }
2970 /* Fall through: isolated surrogates are copied as-is */
2971 s--;
2972 size++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002973 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002974#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002975
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002977 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 *p++ = '\\';
2979 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002980 *p++ = hexdigit[(ch >> 12) & 0x000F];
2981 *p++ = hexdigit[(ch >> 8) & 0x000F];
2982 *p++ = hexdigit[(ch >> 4) & 0x000F];
2983 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002985
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002986 /* Map special whitespace to '\t', \n', '\r' */
2987 else if (ch == '\t') {
2988 *p++ = '\\';
2989 *p++ = 't';
2990 }
2991 else if (ch == '\n') {
2992 *p++ = '\\';
2993 *p++ = 'n';
2994 }
2995 else if (ch == '\r') {
2996 *p++ = '\\';
2997 *p++ = 'r';
2998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002999
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003000 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003001 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003003 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003004 *p++ = hexdigit[(ch >> 4) & 0x000F];
3005 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003006 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003007
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 /* Copy everything else as-is */
3009 else
3010 *p++ = (char) ch;
3011 }
3012 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003013 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
3015 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003016 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 return repr;
3018}
3019
3020PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003021 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022{
3023 return unicodeescape_string(s, size, 0);
3024}
3025
3026PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3027{
3028 if (!PyUnicode_Check(unicode)) {
3029 PyErr_BadArgument();
3030 return NULL;
3031 }
3032 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003033 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034}
3035
3036/* --- Raw Unicode Escape Codec ------------------------------------------- */
3037
3038PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003039 Py_ssize_t size,
3040 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003043 Py_ssize_t startinpos;
3044 Py_ssize_t endinpos;
3045 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 const char *end;
3049 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 PyObject *errorHandler = NULL;
3051 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 /* Escaped strings will always be longer than the resulting
3054 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 length after conversion to the true value. (But decoding error
3056 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 v = _PyUnicode_New(size);
3058 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003059 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003061 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 end = s + size;
3064 while (s < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003065 unsigned char c;
3066 Py_UCS4 x;
3067 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003068 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003070 /* Non-escape characters are interpreted as Unicode ordinals */
3071 if (*s != '\\') {
3072 *p++ = (unsigned char)*s++;
3073 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003074 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003075 startinpos = s-starts;
3076
3077 /* \u-escapes are only interpreted iff the number of leading
3078 backslashes if odd */
3079 bs = s;
3080 for (;s < end;) {
3081 if (*s != '\\')
3082 break;
3083 *p++ = (unsigned char)*s++;
3084 }
3085 if (((s - bs) & 1) == 0 ||
3086 s >= end ||
3087 (*s != 'u' && *s != 'U')) {
3088 continue;
3089 }
3090 p--;
3091 count = *s=='u' ? 4 : 8;
3092 s++;
3093
3094 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3095 outpos = p-PyUnicode_AS_UNICODE(v);
3096 for (x = 0, i = 0; i < count; ++i, ++s) {
3097 c = (unsigned char)*s;
3098 if (!isxdigit(c)) {
3099 endinpos = s-starts;
3100 if (unicode_decode_call_errorhandler(
3101 errors, &errorHandler,
3102 "rawunicodeescape", "truncated \\uXXXX",
3103 starts, size, &startinpos, &endinpos, &exc, &s,
3104 &v, &outpos, &p))
3105 goto onError;
3106 goto nextByte;
3107 }
3108 x = (x<<4) & ~0xF;
3109 if (c >= '0' && c <= '9')
3110 x += c - '0';
3111 else if (c >= 'a' && c <= 'f')
3112 x += 10 + c - 'a';
3113 else
3114 x += 10 + c - 'A';
3115 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003116 if (x <= 0xffff)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003117 /* UCS-2 character */
3118 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003119 else if (x <= 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003120 /* UCS-4 character. Either store directly, or as
3121 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003122#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003123 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003124#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003125 x -= 0x10000L;
3126 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3127 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003128#endif
3129 } else {
3130 endinpos = s-starts;
3131 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003132 if (unicode_decode_call_errorhandler(
3133 errors, &errorHandler,
3134 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003135 starts, size, &startinpos, &endinpos, &exc, &s,
3136 &v, &outpos, &p))
3137 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003138 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003139 nextByte:
3140 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003142 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003143 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 Py_XDECREF(errorHandler);
3145 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003147
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 Py_XDECREF(errorHandler);
3151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 return NULL;
3153}
3154
3155PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003156 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157{
3158 PyObject *repr;
3159 char *p;
3160 char *q;
3161
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003162 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003163#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003164 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003165#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003166 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003167#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003168
Neal Norwitze7d8be82008-07-31 17:17:14 +00003169 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003170 return PyErr_NoMemory();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003171
Neal Norwitze7d8be82008-07-31 17:17:14 +00003172 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (repr == NULL)
3174 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003175 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003176 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003178 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 while (size-- > 0) {
3180 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003182 /* Map 32-bit characters to '\Uxxxxxxxx' */
3183 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003184 *p++ = '\\';
3185 *p++ = 'U';
3186 *p++ = hexdigit[(ch >> 28) & 0xf];
3187 *p++ = hexdigit[(ch >> 24) & 0xf];
3188 *p++ = hexdigit[(ch >> 20) & 0xf];
3189 *p++ = hexdigit[(ch >> 16) & 0xf];
3190 *p++ = hexdigit[(ch >> 12) & 0xf];
3191 *p++ = hexdigit[(ch >> 8) & 0xf];
3192 *p++ = hexdigit[(ch >> 4) & 0xf];
3193 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003194 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003195 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003196#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003197 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3198 if (ch >= 0xD800 && ch < 0xDC00) {
3199 Py_UNICODE ch2;
3200 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003201
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003202 ch2 = *s++;
3203 size--;
3204 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3205 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3206 *p++ = '\\';
3207 *p++ = 'U';
3208 *p++ = hexdigit[(ucs >> 28) & 0xf];
3209 *p++ = hexdigit[(ucs >> 24) & 0xf];
3210 *p++ = hexdigit[(ucs >> 20) & 0xf];
3211 *p++ = hexdigit[(ucs >> 16) & 0xf];
3212 *p++ = hexdigit[(ucs >> 12) & 0xf];
3213 *p++ = hexdigit[(ucs >> 8) & 0xf];
3214 *p++ = hexdigit[(ucs >> 4) & 0xf];
3215 *p++ = hexdigit[ucs & 0xf];
3216 continue;
3217 }
3218 /* Fall through: isolated surrogates are copied as-is */
3219 s--;
3220 size++;
3221 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003222#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003223 /* Map 16-bit characters to '\uxxxx' */
3224 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 *p++ = '\\';
3226 *p++ = 'u';
3227 *p++ = hexdigit[(ch >> 12) & 0xf];
3228 *p++ = hexdigit[(ch >> 8) & 0xf];
3229 *p++ = hexdigit[(ch >> 4) & 0xf];
3230 *p++ = hexdigit[ch & 15];
3231 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003232 /* Copy everything else as-is */
3233 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 *p++ = (char) ch;
3235 }
3236 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003237 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 return repr;
3239}
3240
3241PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3242{
3243 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003244 PyErr_BadArgument();
3245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
3247 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003248 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249}
3250
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003251/* --- Unicode Internal Codec ------------------------------------------- */
3252
3253PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003254 Py_ssize_t size,
3255 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003256{
3257 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003258 Py_ssize_t startinpos;
3259 Py_ssize_t endinpos;
3260 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003261 PyUnicodeObject *v;
3262 Py_UNICODE *p;
3263 const char *end;
3264 const char *reason;
3265 PyObject *errorHandler = NULL;
3266 PyObject *exc = NULL;
3267
Neal Norwitzd43069c2006-01-08 01:12:10 +00003268#ifdef Py_UNICODE_WIDE
3269 Py_UNICODE unimax = PyUnicode_GetMax();
3270#endif
3271
Armin Rigo7ccbca92006-10-04 12:17:45 +00003272 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003273 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3274 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003275 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003276 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003277 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003278 p = PyUnicode_AS_UNICODE(v);
3279 end = s + size;
3280
3281 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003282 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003283 /* We have to sanity check the raw data, otherwise doom looms for
3284 some malformed UCS-4 data. */
3285 if (
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003286#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003287 *p > unimax || *p < 0 ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003288#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003289 end-s < Py_UNICODE_SIZE
3290 )
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003291 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003292 startinpos = s - starts;
3293 if (end-s < Py_UNICODE_SIZE) {
3294 endinpos = end-starts;
3295 reason = "truncated input";
3296 }
3297 else {
3298 endinpos = s - starts + Py_UNICODE_SIZE;
3299 reason = "illegal code point (> 0x10FFFF)";
3300 }
3301 outpos = p - PyUnicode_AS_UNICODE(v);
3302 if (unicode_decode_call_errorhandler(
3303 errors, &errorHandler,
3304 "unicode_internal", reason,
3305 starts, size, &startinpos, &endinpos, &exc, &s,
Benjamin Peterson828a7062008-12-27 17:05:29 +00003306 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003307 goto onError;
3308 }
3309 }
3310 else {
3311 p++;
3312 s += Py_UNICODE_SIZE;
3313 }
3314 }
3315
Martin v. Löwis412fb672006-04-13 06:34:32 +00003316 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003317 goto onError;
3318 Py_XDECREF(errorHandler);
3319 Py_XDECREF(exc);
3320 return (PyObject *)v;
3321
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003322 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003323 Py_XDECREF(v);
3324 Py_XDECREF(errorHandler);
3325 Py_XDECREF(exc);
3326 return NULL;
3327}
3328
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329/* --- Latin-1 Codec ------------------------------------------------------ */
3330
3331PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003332 Py_ssize_t size,
3333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334{
3335 PyUnicodeObject *v;
3336 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003339 if (size == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003340 Py_UNICODE r = *(unsigned char*)s;
3341 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003342 }
3343
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 v = _PyUnicode_New(size);
3345 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003346 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003348 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 p = PyUnicode_AS_UNICODE(v);
3350 while (size-- > 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003351 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003353
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003354 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 Py_XDECREF(v);
3356 return NULL;
3357}
3358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359/* create or adjust a UnicodeEncodeError */
3360static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003361 const char *encoding,
3362 const Py_UNICODE *unicode, Py_ssize_t size,
3363 Py_ssize_t startpos, Py_ssize_t endpos,
3364 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 if (*exceptionObject == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003367 *exceptionObject = PyUnicodeEncodeError_Create(
3368 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 }
3370 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003371 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3372 goto onError;
3373 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3374 goto onError;
3375 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3376 goto onError;
3377 return;
3378 onError:
3379 Py_DECREF(*exceptionObject);
3380 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 }
3382}
3383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384/* raises a UnicodeEncodeError */
3385static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003386 const char *encoding,
3387 const Py_UNICODE *unicode, Py_ssize_t size,
3388 Py_ssize_t startpos, Py_ssize_t endpos,
3389 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390{
3391 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003392 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003394 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395}
3396
3397/* error handling callback helper:
3398 build arguments, call the callback and check the arguments,
3399 put the result into newpos and return the replacement string, which
3400 has to be freed by the caller */
3401static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003402 PyObject **errorHandler,
3403 const char *encoding, const char *reason,
3404 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3405 Py_ssize_t startpos, Py_ssize_t endpos,
3406 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003408 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409
3410 PyObject *restuple;
3411 PyObject *resunicode;
3412
3413 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003414 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003416 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 }
3418
3419 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003420 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423
3424 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003425 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00003429 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003430 Py_DECREF(restuple);
3431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 }
3433 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003434 &resunicode, newpos)) {
3435 Py_DECREF(restuple);
3436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 }
3438 if (*newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003439 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003440 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003441 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3442 Py_DECREF(restuple);
3443 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 Py_INCREF(resunicode);
3446 Py_DECREF(restuple);
3447 return resunicode;
3448}
3449
3450static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003451 Py_ssize_t size,
3452 const char *errors,
3453 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454{
3455 /* output object */
3456 PyObject *res;
3457 /* pointers to the beginning and end+1 of input */
3458 const Py_UNICODE *startp = p;
3459 const Py_UNICODE *endp = p + size;
3460 /* pointer to the beginning of the unencodable characters */
3461 /* const Py_UNICODE *badp = NULL; */
3462 /* pointer into the output */
3463 char *str;
3464 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003465 Py_ssize_t respos = 0;
3466 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003467 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3468 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 PyObject *errorHandler = NULL;
3470 PyObject *exc = NULL;
3471 /* the following variable is used for caching string comparisons
3472 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3473 int known_errorHandler = -1;
3474
3475 /* allocate enough for a simple encoding without
3476 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003477 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (res == NULL)
3479 goto onError;
3480 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003481 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003482 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 ressize = size;
3484
3485 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003486 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003488 /* can we encode this? */
3489 if (c<limit) {
3490 /* no overflow check, because we know that the space is enough */
3491 *str++ = (char)c;
3492 ++p;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003493 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003494 else {
3495 Py_ssize_t unicodepos = p-startp;
3496 Py_ssize_t requiredsize;
3497 PyObject *repunicode;
3498 Py_ssize_t repsize;
3499 Py_ssize_t newpos;
3500 Py_ssize_t respos;
3501 Py_UNICODE *uni2;
3502 /* startpos for collecting unencodable chars */
3503 const Py_UNICODE *collstart = p;
3504 const Py_UNICODE *collend = p;
3505 /* find all unecodable characters */
3506 while ((collend < endp) && ((*collend)>=limit))
3507 ++collend;
3508 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3509 if (known_errorHandler==-1) {
3510 if ((errors==NULL) || (!strcmp(errors, "strict")))
3511 known_errorHandler = 1;
3512 else if (!strcmp(errors, "replace"))
3513 known_errorHandler = 2;
3514 else if (!strcmp(errors, "ignore"))
3515 known_errorHandler = 3;
3516 else if (!strcmp(errors, "xmlcharrefreplace"))
3517 known_errorHandler = 4;
3518 else
3519 known_errorHandler = 0;
3520 }
3521 switch (known_errorHandler) {
3522 case 1: /* strict */
3523 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3524 goto onError;
3525 case 2: /* replace */
3526 while (collstart++<collend)
3527 *str++ = '?'; /* fall through */
3528 case 3: /* ignore */
3529 p = collend;
3530 break;
3531 case 4: /* xmlcharrefreplace */
3532 respos = str-PyString_AS_STRING(res);
3533 /* determine replacement size (temporarily (mis)uses p) */
3534 for (p = collstart, repsize = 0; p < collend; ++p) {
3535 if (*p<10)
3536 repsize += 2+1+1;
3537 else if (*p<100)
3538 repsize += 2+2+1;
3539 else if (*p<1000)
3540 repsize += 2+3+1;
3541 else if (*p<10000)
3542 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003543#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003544 else
3545 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003546#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003547 else if (*p<100000)
3548 repsize += 2+5+1;
3549 else if (*p<1000000)
3550 repsize += 2+6+1;
3551 else
3552 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003553#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003554 }
3555 requiredsize = respos+repsize+(endp-collend);
3556 if (requiredsize > ressize) {
3557 if (requiredsize<2*ressize)
3558 requiredsize = 2*ressize;
3559 if (_PyString_Resize(&res, requiredsize))
3560 goto onError;
3561 str = PyString_AS_STRING(res) + respos;
3562 ressize = requiredsize;
3563 }
3564 /* generate replacement (temporarily (mis)uses p) */
3565 for (p = collstart; p < collend; ++p) {
3566 str += sprintf(str, "&#%d;", (int)*p);
3567 }
3568 p = collend;
3569 break;
3570 default:
3571 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3572 encoding, reason, startp, size, &exc,
3573 collstart-startp, collend-startp, &newpos);
3574 if (repunicode == NULL)
3575 goto onError;
3576 /* need more space? (at least enough for what we
3577 have+the replacement+the rest of the string, so
3578 we won't have to check space for encodable characters) */
3579 respos = str-PyString_AS_STRING(res);
3580 repsize = PyUnicode_GET_SIZE(repunicode);
3581 requiredsize = respos+repsize+(endp-collend);
3582 if (requiredsize > ressize) {
3583 if (requiredsize<2*ressize)
3584 requiredsize = 2*ressize;
3585 if (_PyString_Resize(&res, requiredsize)) {
3586 Py_DECREF(repunicode);
3587 goto onError;
3588 }
3589 str = PyString_AS_STRING(res) + respos;
3590 ressize = requiredsize;
3591 }
3592 /* check if there is anything unencodable in the replacement
3593 and copy it to the output */
3594 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3595 c = *uni2;
3596 if (c >= limit) {
3597 raise_encode_exception(&exc, encoding, startp, size,
3598 unicodepos, unicodepos+1, reason);
3599 Py_DECREF(repunicode);
3600 goto onError;
3601 }
3602 *str = (char)c;
3603 }
3604 p = startp + newpos;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003605 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003606 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003607 }
3608 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003610 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 if (respos<ressize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003612 /* If this falls res will be NULL */
3613 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 Py_XDECREF(errorHandler);
3615 Py_XDECREF(exc);
3616 return res;
3617
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003618 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 Py_XDECREF(res);
3620 Py_XDECREF(errorHandler);
3621 Py_XDECREF(exc);
3622 return NULL;
3623}
3624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003626 Py_ssize_t size,
3627 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630}
3631
3632PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3633{
3634 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003635 PyErr_BadArgument();
3636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 }
3638 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003639 PyUnicode_GET_SIZE(unicode),
3640 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641}
3642
3643/* --- 7-bit ASCII Codec -------------------------------------------------- */
3644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003646 Py_ssize_t size,
3647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 PyUnicodeObject *v;
3651 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003652 Py_ssize_t startinpos;
3653 Py_ssize_t endinpos;
3654 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 const char *e;
3656 PyObject *errorHandler = NULL;
3657 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003660 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003661 Py_UNICODE r = *(unsigned char*)s;
3662 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003663 }
Tim Petersced69f82003-09-16 20:30:58 +00003664
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 v = _PyUnicode_New(size);
3666 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003667 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003669 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 e = s + size;
3672 while (s < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003673 register unsigned char c = (unsigned char)*s;
3674 if (c < 128) {
3675 *p++ = c;
3676 ++s;
3677 }
3678 else {
3679 startinpos = s-starts;
3680 endinpos = startinpos + 1;
3681 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3682 if (unicode_decode_call_errorhandler(
3683 errors, &errorHandler,
3684 "ascii", "ordinal not in range(128)",
3685 starts, size, &startinpos, &endinpos, &exc, &s,
3686 &v, &outpos, &p))
3687 goto onError;
3688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003690 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003691 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3692 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 Py_XDECREF(errorHandler);
3694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003696
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003697 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 Py_XDECREF(errorHandler);
3700 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 return NULL;
3702}
3703
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003705 Py_ssize_t size,
3706 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709}
3710
3711PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3712{
3713 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003714 PyErr_BadArgument();
3715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 }
3717 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003718 PyUnicode_GET_SIZE(unicode),
3719 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720}
3721
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003722#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003723
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003724/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003725
Hirokazu Yamamoto68e075e2009-03-21 13:04:41 +00003726#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003727#define NEED_RETRY
3728#endif
3729
3730/* XXX This code is limited to "true" double-byte encodings, as
3731 a) it assumes an incomplete character consists of a single byte, and
3732 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003733 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003734
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when the character before it (found with CharPrev) does not make it
   a trail byte: there is no previous character, the previous character
   does not start with a lead byte, or the previous character is two
   bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3745
3746/*
3747 * Decode MBCS string into unicode object. If 'final' is set, converts
3748 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3749 */
3750static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003751 const char *s, /* MBCS string */
3752 int size, /* sizeof MBCS string */
3753 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003754{
3755 Py_UNICODE *p;
3756 Py_ssize_t n = 0;
3757 int usize = 0;
3758
3759 assert(size >= 0);
3760
3761 /* Skip trailing lead-byte unless 'final' is set */
3762 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003763 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003764
3765 /* First get the size of the result */
3766 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003767 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3768 if (usize == 0) {
3769 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3770 return -1;
3771 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003772 }
3773
3774 if (*v == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003775 /* Create unicode object */
3776 *v = _PyUnicode_New(usize);
3777 if (*v == NULL)
3778 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003779 }
3780 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003781 /* Extend unicode object */
3782 n = PyUnicode_GET_SIZE(*v);
3783 if (_PyUnicode_Resize(v, n + usize) < 0)
3784 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003785 }
3786
3787 /* Do the conversion */
3788 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003789 p = PyUnicode_AS_UNICODE(*v) + n;
3790 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3791 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3792 return -1;
3793 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003794 }
3795
3796 return size;
3797}
3798
3799PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003800 Py_ssize_t size,
3801 const char *errors,
3802 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003803{
3804 PyUnicodeObject *v = NULL;
3805 int done;
3806
3807 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003808 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003809
3810#ifdef NEED_RETRY
3811 retry:
3812 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003813 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003814 else
3815#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003816 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003817
3818 if (done < 0) {
3819 Py_XDECREF(v);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003820 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003821 }
3822
3823 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003824 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003825
3826#ifdef NEED_RETRY
3827 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003828 s += done;
3829 size -= done;
3830 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003831 }
3832#endif
3833
3834 return (PyObject *)v;
3835}
3836
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003837PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003838 Py_ssize_t size,
3839 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003840{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003841 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3842}
3843
3844/*
3845 * Convert unicode into string object (MBCS).
3846 * Returns 0 if succeed, -1 otherwise.
3847 */
3848static int encode_mbcs(PyObject **repr,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003849 const Py_UNICODE *p, /* unicode */
3850 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851{
3852 int mbcssize = 0;
3853 Py_ssize_t n = 0;
3854
3855 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003856
3857 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003858 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003859 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3860 if (mbcssize == 0) {
3861 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3862 return -1;
3863 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003864 }
3865
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866 if (*repr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003867 /* Create string object */
3868 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3869 if (*repr == NULL)
3870 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003871 }
3872 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003873 /* Extend string object */
3874 n = PyString_Size(*repr);
3875 if (_PyString_Resize(repr, n + mbcssize) < 0)
3876 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877 }
3878
3879 /* Do the conversion */
3880 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003881 char *s = PyString_AS_STRING(*repr) + n;
3882 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3883 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3884 return -1;
3885 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 }
3887
3888 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003889}
3890
3891PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003892 Py_ssize_t size,
3893 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003894{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895 PyObject *repr = NULL;
3896 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003897
Martin v. Löwisd8251432006-06-14 05:21:04 +00003898#ifdef NEED_RETRY
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003899 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003900 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003901 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003902 else
3903#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003904 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003905
Martin v. Löwisd8251432006-06-14 05:21:04 +00003906 if (ret < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003907 Py_XDECREF(repr);
3908 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003909 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003910
3911#ifdef NEED_RETRY
3912 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003913 p += INT_MAX;
3914 size -= INT_MAX;
3915 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003916 }
3917#endif
3918
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003919 return repr;
3920}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003921
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003922PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3923{
3924 if (!PyUnicode_Check(unicode)) {
3925 PyErr_BadArgument();
3926 return NULL;
3927 }
3928 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003929 PyUnicode_GET_SIZE(unicode),
3930 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003931}
3932
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933#undef NEED_RETRY
3934
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003935#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003936
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937/* --- Character Mapping Codec -------------------------------------------- */
3938
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003940 Py_ssize_t size,
3941 PyObject *mapping,
3942 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003945 Py_ssize_t startinpos;
3946 Py_ssize_t endinpos;
3947 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 PyUnicodeObject *v;
3950 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003951 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 PyObject *errorHandler = NULL;
3953 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003954 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003955 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003956
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 /* Default to Latin-1 */
3958 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003959 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960
3961 v = _PyUnicode_New(size);
3962 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003965 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003968 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003969 mapstring = PyUnicode_AS_UNICODE(mapping);
3970 maplen = PyUnicode_GET_SIZE(mapping);
3971 while (s < e) {
3972 unsigned char ch = *s;
3973 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003975 if (ch < maplen)
3976 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003978 if (x == 0xfffe) {
3979 /* undefined mapping */
3980 outpos = p-PyUnicode_AS_UNICODE(v);
3981 startinpos = s-starts;
3982 endinpos = startinpos+1;
3983 if (unicode_decode_call_errorhandler(
3984 errors, &errorHandler,
3985 "charmap", "character maps to <undefined>",
3986 starts, size, &startinpos, &endinpos, &exc, &s,
3987 &v, &outpos, &p)) {
3988 goto onError;
3989 }
3990 continue;
3991 }
3992 *p++ = x;
3993 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003994 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003995 }
3996 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003997 while (s < e) {
3998 unsigned char ch = *s;
3999 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004000
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004001 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4002 w = PyInt_FromLong((long)ch);
4003 if (w == NULL)
4004 goto onError;
4005 x = PyObject_GetItem(mapping, w);
4006 Py_DECREF(w);
4007 if (x == NULL) {
4008 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4009 /* No mapping found means: mapping is undefined. */
4010 PyErr_Clear();
4011 x = Py_None;
4012 Py_INCREF(x);
4013 } else
4014 goto onError;
4015 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004016
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004017 /* Apply mapping */
4018 if (PyInt_Check(x)) {
4019 long value = PyInt_AS_LONG(x);
4020 if (value < 0 || value > 65535) {
4021 PyErr_SetString(PyExc_TypeError,
4022 "character mapping must be in range(65536)");
4023 Py_DECREF(x);
4024 goto onError;
4025 }
4026 *p++ = (Py_UNICODE)value;
4027 }
4028 else if (x == Py_None) {
4029 /* undefined mapping */
4030 outpos = p-PyUnicode_AS_UNICODE(v);
4031 startinpos = s-starts;
4032 endinpos = startinpos+1;
4033 if (unicode_decode_call_errorhandler(
4034 errors, &errorHandler,
4035 "charmap", "character maps to <undefined>",
4036 starts, size, &startinpos, &endinpos, &exc, &s,
4037 &v, &outpos, &p)) {
4038 Py_DECREF(x);
4039 goto onError;
4040 }
4041 Py_DECREF(x);
4042 continue;
4043 }
4044 else if (PyUnicode_Check(x)) {
4045 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004046
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004047 if (targetsize == 1)
4048 /* 1-1 mapping */
4049 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004050
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004051 else if (targetsize > 1) {
4052 /* 1-n mapping */
4053 if (targetsize > extrachars) {
4054 /* resize first */
4055 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4056 Py_ssize_t needed = (targetsize - extrachars) + \
4057 (targetsize << 2);
4058 extrachars += needed;
4059 /* XXX overflow detection missing */
4060 if (_PyUnicode_Resize(&v,
4061 PyUnicode_GET_SIZE(v) + needed) < 0) {
4062 Py_DECREF(x);
4063 goto onError;
4064 }
4065 p = PyUnicode_AS_UNICODE(v) + oldpos;
4066 }
4067 Py_UNICODE_COPY(p,
4068 PyUnicode_AS_UNICODE(x),
4069 targetsize);
4070 p += targetsize;
4071 extrachars -= targetsize;
4072 }
4073 /* 1-0 mapping: skip the character */
4074 }
4075 else {
4076 /* wrong return value */
4077 PyErr_SetString(PyExc_TypeError,
4078 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004079 Py_DECREF(x);
4080 goto onError;
4081 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004082 Py_DECREF(x);
4083 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 }
4086 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004087 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 Py_XDECREF(errorHandler);
4090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004092
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004093 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 Py_XDECREF(errorHandler);
4095 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 Py_XDECREF(v);
4097 return NULL;
4098}
4099
Martin v. Löwis3f767792006-06-04 19:36:28 +00004100/* Charmap encoding: the lookup table */
4101
4102struct encoding_map{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004103 PyObject_HEAD
4104 unsigned char level1[32];
4105 int count2, count3;
4106 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004107};
4108
4109static PyObject*
4110encoding_map_size(PyObject *obj, PyObject* args)
4111{
4112 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004113 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004114 128*map->count3);
4115}
4116
4117static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004118 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004119 PyDoc_STR("Return the size (in bytes) of this object") },
4120 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004121};
4122
4123static void
4124encoding_map_dealloc(PyObject* o)
4125{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004126 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004127}
4128
4129static PyTypeObject EncodingMapType = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004130 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004131 "EncodingMap", /*tp_name*/
4132 sizeof(struct encoding_map), /*tp_basicsize*/
4133 0, /*tp_itemsize*/
4134 /* methods */
4135 encoding_map_dealloc, /*tp_dealloc*/
4136 0, /*tp_print*/
4137 0, /*tp_getattr*/
4138 0, /*tp_setattr*/
4139 0, /*tp_compare*/
4140 0, /*tp_repr*/
4141 0, /*tp_as_number*/
4142 0, /*tp_as_sequence*/
4143 0, /*tp_as_mapping*/
4144 0, /*tp_hash*/
4145 0, /*tp_call*/
4146 0, /*tp_str*/
4147 0, /*tp_getattro*/
4148 0, /*tp_setattro*/
4149 0, /*tp_as_buffer*/
4150 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4151 0, /*tp_doc*/
4152 0, /*tp_traverse*/
4153 0, /*tp_clear*/
4154 0, /*tp_richcompare*/
4155 0, /*tp_weaklistoffset*/
4156 0, /*tp_iter*/
4157 0, /*tp_iternext*/
4158 encoding_map_methods, /*tp_methods*/
4159 0, /*tp_members*/
4160 0, /*tp_getset*/
4161 0, /*tp_base*/
4162 0, /*tp_dict*/
4163 0, /*tp_descr_get*/
4164 0, /*tp_descr_set*/
4165 0, /*tp_dictoffset*/
4166 0, /*tp_init*/
4167 0, /*tp_alloc*/
4168 0, /*tp_new*/
4169 0, /*tp_free*/
4170 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004171};
4172
4173PyObject*
4174PyUnicode_BuildEncodingMap(PyObject* string)
4175{
4176 Py_UNICODE *decode;
4177 PyObject *result;
4178 struct encoding_map *mresult;
4179 int i;
4180 int need_dict = 0;
4181 unsigned char level1[32];
4182 unsigned char level2[512];
4183 unsigned char *mlevel1, *mlevel2, *mlevel3;
4184 int count2 = 0, count3 = 0;
4185
4186 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4187 PyErr_BadArgument();
4188 return NULL;
4189 }
4190 decode = PyUnicode_AS_UNICODE(string);
4191 memset(level1, 0xFF, sizeof level1);
4192 memset(level2, 0xFF, sizeof level2);
4193
4194 /* If there isn't a one-to-one mapping of NULL to \0,
4195 or if there are non-BMP characters, we need to use
4196 a mapping dictionary. */
4197 if (decode[0] != 0)
4198 need_dict = 1;
4199 for (i = 1; i < 256; i++) {
4200 int l1, l2;
4201 if (decode[i] == 0
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004202#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004203 || decode[i] > 0xFFFF
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004204#endif
4205 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004206 need_dict = 1;
4207 break;
4208 }
4209 if (decode[i] == 0xFFFE)
4210 /* unmapped character */
4211 continue;
4212 l1 = decode[i] >> 11;
4213 l2 = decode[i] >> 7;
4214 if (level1[l1] == 0xFF)
4215 level1[l1] = count2++;
4216 if (level2[l2] == 0xFF)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004217 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004218 }
4219
4220 if (count2 >= 0xFF || count3 >= 0xFF)
4221 need_dict = 1;
4222
4223 if (need_dict) {
4224 PyObject *result = PyDict_New();
4225 PyObject *key, *value;
4226 if (!result)
4227 return NULL;
4228 for (i = 0; i < 256; i++) {
4229 key = value = NULL;
4230 key = PyInt_FromLong(decode[i]);
4231 value = PyInt_FromLong(i);
4232 if (!key || !value)
4233 goto failed1;
4234 if (PyDict_SetItem(result, key, value) == -1)
4235 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004236 Py_DECREF(key);
4237 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004238 }
4239 return result;
4240 failed1:
4241 Py_XDECREF(key);
4242 Py_XDECREF(value);
4243 Py_DECREF(result);
4244 return NULL;
4245 }
4246
4247 /* Create a three-level trie */
4248 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4249 16*count2 + 128*count3 - 1);
4250 if (!result)
4251 return PyErr_NoMemory();
4252 PyObject_Init(result, &EncodingMapType);
4253 mresult = (struct encoding_map*)result;
4254 mresult->count2 = count2;
4255 mresult->count3 = count3;
4256 mlevel1 = mresult->level1;
4257 mlevel2 = mresult->level23;
4258 mlevel3 = mresult->level23 + 16*count2;
4259 memcpy(mlevel1, level1, 32);
4260 memset(mlevel2, 0xFF, 16*count2);
4261 memset(mlevel3, 0, 128*count3);
4262 count3 = 0;
4263 for (i = 1; i < 256; i++) {
4264 int o1, o2, o3, i2, i3;
4265 if (decode[i] == 0xFFFE)
4266 /* unmapped character */
4267 continue;
4268 o1 = decode[i]>>11;
4269 o2 = (decode[i]>>7) & 0xF;
4270 i2 = 16*mlevel1[o1] + o2;
4271 if (mlevel2[i2] == 0xFF)
4272 mlevel2[i2] = count3++;
4273 o3 = decode[i] & 0x7F;
4274 i3 = 128*mlevel2[i2] + o3;
4275 mlevel3[i3] = i;
4276 }
4277 return result;
4278}
4279
4280static int
4281encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4282{
4283 struct encoding_map *map = (struct encoding_map*)mapping;
4284 int l1 = c>>11;
4285 int l2 = (c>>7) & 0xF;
4286 int l3 = c & 0x7F;
4287 int i;
4288
4289#ifdef Py_UNICODE_WIDE
4290 if (c > 0xFFFF) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004291 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004292 }
4293#endif
4294 if (c == 0)
4295 return 0;
4296 /* level 1*/
4297 i = map->level1[l1];
4298 if (i == 0xFF) {
4299 return -1;
4300 }
4301 /* level 2*/
4302 i = map->level23[16*i+l2];
4303 if (i == 0xFF) {
4304 return -1;
4305 }
4306 /* level 3 */
4307 i = map->level23[16*map->count2 + 128*i + l3];
4308 if (i == 0) {
4309 return -1;
4310 }
4311 return i;
4312}
4313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314/* Lookup the character ch in the mapping. If the character
4315 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004316 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 PyObject *w = PyInt_FromLong((long)c);
4320 PyObject *x;
4321
4322 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004323 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004324 x = PyObject_GetItem(mapping, w);
4325 Py_DECREF(w);
4326 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004327 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4328 /* No mapping found means: mapping is undefined. */
4329 PyErr_Clear();
4330 x = Py_None;
4331 Py_INCREF(x);
4332 return x;
4333 } else
4334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004336 else if (x == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004337 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004339 long value = PyInt_AS_LONG(x);
4340 if (value < 0 || value > 255) {
4341 PyErr_SetString(PyExc_TypeError,
4342 "character mapping must be in range(256)");
4343 Py_DECREF(x);
4344 return NULL;
4345 }
4346 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004348 else if (PyString_Check(x))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004349 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004351 /* wrong return value */
4352 PyErr_SetString(PyExc_TypeError,
4353 "character mapping must return integer, None or str");
4354 Py_DECREF(x);
4355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356 }
4357}
4358
Martin v. Löwis3f767792006-06-04 19:36:28 +00004359static int
4360charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4361{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004362 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4363 /* exponentially overallocate to minimize reallocations */
4364 if (requiredsize < 2*outsize)
4365 requiredsize = 2*outsize;
4366 if (_PyString_Resize(outobj, requiredsize)) {
4367 return 0;
4368 }
4369 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004370}
4371
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375/* lookup the character, put the result in the output string and adjust
4376 various state variables. Reallocate the output string if not enough
4377 space is available. Return a new reference to the object that
4378 was put in the output buffer, or Py_None, if the mapping was undefined
4379 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004380 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004382charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004383 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004385 PyObject *rep;
4386 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004387 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388
Christian Heimese93237d2007-12-19 02:37:44 +00004389 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004390 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004391 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004392 if (res == -1)
4393 return enc_FAILED;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004394 if (outsize<requiredsize)
4395 if (!charmapencode_resize(outobj, outpos, requiredsize))
4396 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004397 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004398 outstart[(*outpos)++] = (char)res;
4399 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004400 }
4401
4402 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 if (rep==NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004404 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 else if (rep==Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004406 Py_DECREF(rep);
4407 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004409 if (PyInt_Check(rep)) {
4410 Py_ssize_t requiredsize = *outpos+1;
4411 if (outsize<requiredsize)
4412 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4413 Py_DECREF(rep);
4414 return enc_EXCEPTION;
4415 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004416 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004417 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004418 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004419 else {
4420 const char *repchars = PyString_AS_STRING(rep);
4421 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4422 Py_ssize_t requiredsize = *outpos+repsize;
4423 if (outsize<requiredsize)
4424 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4425 Py_DECREF(rep);
4426 return enc_EXCEPTION;
4427 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004428 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004429 memcpy(outstart + *outpos, repchars, repsize);
4430 *outpos += repsize;
4431 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 }
Georg Brandl9f167602006-06-04 21:46:16 +00004433 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004434 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435}
4436
4437/* handle an error in PyUnicode_EncodeCharmap
4438 Return 0 on success, -1 on error */
4439static
4440int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004443 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445{
4446 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447 Py_ssize_t repsize;
4448 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 Py_UNICODE *uni2;
4450 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004451 Py_ssize_t collstartpos = *inpos;
4452 Py_ssize_t collendpos = *inpos+1;
4453 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 char *encoding = "charmap";
4455 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004456 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 /* find all unencodable characters */
4459 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004460 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004461 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004462 int res = encoding_map_lookup(p[collendpos], mapping);
4463 if (res != -1)
4464 break;
4465 ++collendpos;
4466 continue;
4467 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004468
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004469 rep = charmapencode_lookup(p[collendpos], mapping);
4470 if (rep==NULL)
4471 return -1;
4472 else if (rep!=Py_None) {
4473 Py_DECREF(rep);
4474 break;
4475 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004476 Py_DECREF(rep);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004477 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 }
4479 /* cache callback name lookup
4480 * (if not done yet, i.e. it's the first error) */
4481 if (*known_errorHandler==-1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004482 if ((errors==NULL) || (!strcmp(errors, "strict")))
4483 *known_errorHandler = 1;
4484 else if (!strcmp(errors, "replace"))
4485 *known_errorHandler = 2;
4486 else if (!strcmp(errors, "ignore"))
4487 *known_errorHandler = 3;
4488 else if (!strcmp(errors, "xmlcharrefreplace"))
4489 *known_errorHandler = 4;
4490 else
4491 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 }
4493 switch (*known_errorHandler) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004494 case 1: /* strict */
4495 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4496 return -1;
4497 case 2: /* replace */
4498 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004499 x = charmapencode_output('?', mapping, res, respos);
4500 if (x==enc_EXCEPTION) {
4501 return -1;
4502 }
4503 else if (x==enc_FAILED) {
4504 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4505 return -1;
4506 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004507 }
4508 /* fall through */
4509 case 3: /* ignore */
4510 *inpos = collendpos;
4511 break;
4512 case 4: /* xmlcharrefreplace */
4513 /* generate replacement (temporarily (mis)uses p) */
4514 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004515 char buffer[2+29+1+1];
4516 char *cp;
4517 sprintf(buffer, "&#%d;", (int)p[collpos]);
4518 for (cp = buffer; *cp; ++cp) {
4519 x = charmapencode_output(*cp, mapping, res, respos);
4520 if (x==enc_EXCEPTION)
4521 return -1;
4522 else if (x==enc_FAILED) {
4523 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4524 return -1;
4525 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004526 }
4527 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004528 *inpos = collendpos;
4529 break;
4530 default:
4531 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004532 encoding, reason, p, size, exceptionObject,
4533 collstartpos, collendpos, &newpos);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004534 if (repunicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004535 return -1;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004536 /* generate replacement */
4537 repsize = PyUnicode_GET_SIZE(repunicode);
4538 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004539 x = charmapencode_output(*uni2, mapping, res, respos);
4540 if (x==enc_EXCEPTION) {
4541 return -1;
4542 }
4543 else if (x==enc_FAILED) {
4544 Py_DECREF(repunicode);
4545 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4546 return -1;
4547 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004548 }
4549 *inpos = newpos;
4550 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 }
4552 return 0;
4553}
4554
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004556 Py_ssize_t size,
4557 PyObject *mapping,
4558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 /* output object */
4561 PyObject *res = NULL;
4562 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 PyObject *errorHandler = NULL;
4567 PyObject *exc = NULL;
4568 /* the following variable is used for caching string comparisons
4569 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4570 * 3=ignore, 4=xmlcharrefreplace */
4571 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572
4573 /* Default to Latin-1 */
4574 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004575 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 /* allocate enough for a simple encoding without
4578 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004579 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 if (res == NULL)
4581 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004582 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004583 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 while (inpos<size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004586 /* try to encode it */
4587 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4588 if (x==enc_EXCEPTION) /* error */
4589 goto onError;
4590 if (x==enc_FAILED) { /* unencodable character */
4591 if (charmap_encoding_error(p, size, &inpos, mapping,
4592 &exc,
4593 &known_errorHandler, &errorHandler, errors,
4594 &res, &respos)) {
4595 goto onError;
4596 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004597 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004598 else
4599 /* done with this character => adjust input position */
4600 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004604 if (respos<PyString_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004605 if (_PyString_Resize(&res, respos))
4606 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 }
4608 Py_XDECREF(exc);
4609 Py_XDECREF(errorHandler);
4610 return res;
4611
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004612 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 Py_XDECREF(res);
4614 Py_XDECREF(exc);
4615 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 return NULL;
4617}
4618
4619PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004620 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621{
4622 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004623 PyErr_BadArgument();
4624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 }
4626 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004627 PyUnicode_GET_SIZE(unicode),
4628 mapping,
4629 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630}
4631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632/* create or adjust a UnicodeTranslateError */
4633static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004634 const Py_UNICODE *unicode, Py_ssize_t size,
4635 Py_ssize_t startpos, Py_ssize_t endpos,
4636 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004639 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004640 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641 }
4642 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004643 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4644 goto onError;
4645 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4646 goto onError;
4647 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4648 goto onError;
4649 return;
4650 onError:
4651 Py_DECREF(*exceptionObject);
4652 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 }
4654}
4655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656/* raises a UnicodeTranslateError */
4657static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004658 const Py_UNICODE *unicode, Py_ssize_t size,
4659 Py_ssize_t startpos, Py_ssize_t endpos,
4660 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661{
4662 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004663 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004665 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666}
4667
4668/* error handling callback helper:
4669 build arguments, call the callback and check the arguments,
4670 put the result into newpos and return the replacement string, which
4671 has to be freed by the caller */
4672static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004673 PyObject **errorHandler,
4674 const char *reason,
4675 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4676 Py_ssize_t startpos, Py_ssize_t endpos,
4677 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004679 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680
Martin v. Löwis412fb672006-04-13 06:34:32 +00004681 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 PyObject *restuple;
4683 PyObject *resunicode;
4684
4685 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004686 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 }
4690
4691 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004692 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695
4696 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004697 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004699 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00004701 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004702 Py_DECREF(restuple);
4703 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 }
4705 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004706 &resunicode, &i_newpos)) {
4707 Py_DECREF(restuple);
4708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004710 if (i_newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004711 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004712 else
4713 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004714 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004715 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4716 Py_DECREF(restuple);
4717 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 Py_INCREF(resunicode);
4720 Py_DECREF(restuple);
4721 return resunicode;
4722}
4723
4724/* Lookup the character ch in the mapping and put the result in result,
4725 which must be decrefed by the caller.
4726 Return 0 on success, -1 on error */
4727static
4728int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4729{
4730 PyObject *w = PyInt_FromLong((long)c);
4731 PyObject *x;
4732
4733 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004734 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 x = PyObject_GetItem(mapping, w);
4736 Py_DECREF(w);
4737 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004738 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4739 /* No mapping found means: use 1:1 mapping. */
4740 PyErr_Clear();
4741 *result = NULL;
4742 return 0;
4743 } else
4744 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 }
4746 else if (x == Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004747 *result = x;
4748 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 }
4750 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004751 long value = PyInt_AS_LONG(x);
4752 long max = PyUnicode_GetMax();
4753 if (value < 0 || value > max) {
4754 PyErr_Format(PyExc_TypeError,
4755 "character mapping must be in range(0x%lx)", max+1);
4756 Py_DECREF(x);
4757 return -1;
4758 }
4759 *result = x;
4760 return 0;
4761 }
4762 else if (PyUnicode_Check(x)) {
4763 *result = x;
4764 return 0;
4765 }
4766 else {
4767 /* wrong return value */
4768 PyErr_SetString(PyExc_TypeError,
4769 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004770 Py_DECREF(x);
4771 return -1;
4772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773}
4774/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004775 if not reallocate and adjust various state variables.
4776 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777static
Walter Dörwald4894c302003-10-24 14:25:28 +00004778int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004779 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004782 if (requiredsize > oldsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004783 /* remember old output position */
4784 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4785 /* exponentially overallocate to minimize reallocations */
4786 if (requiredsize < 2 * oldsize)
4787 requiredsize = 2 * oldsize;
4788 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4789 return -1;
4790 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 }
4792 return 0;
4793}
4794/* lookup the character, put the result in the output string and adjust
4795 various state variables. Return a new reference to the object that
4796 was put in the output buffer in *result, or Py_None, if the mapping was
4797 undefined (in which case no character was written).
4798 The called must decref result.
4799 Return 0 on success, -1 on error. */
4800static
Walter Dörwald4894c302003-10-24 14:25:28 +00004801int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004802 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4803 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804{
Walter Dörwald4894c302003-10-24 14:25:28 +00004805 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004806 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 if (*res==NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004808 /* not found => default to 1:1 mapping */
4809 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 }
4811 else if (*res==Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004812 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 else if (PyInt_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004814 /* no overflow check, because we know that the space is enough */
4815 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 }
4817 else if (PyUnicode_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004818 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4819 if (repsize==1) {
4820 /* no overflow check, because we know that the space is enough */
4821 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4822 }
4823 else if (repsize!=0) {
4824 /* more than one character */
4825 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4826 (insize - (curinp-startinp)) +
4827 repsize - 1;
4828 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4829 return -1;
4830 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4831 *outp += repsize;
4832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 }
4834 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004835 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004836 return 0;
4837}
4838
4839PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004840 Py_ssize_t size,
4841 PyObject *mapping,
4842 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 /* output object */
4845 PyObject *res = NULL;
4846 /* pointers to the beginning and end+1 of input */
4847 const Py_UNICODE *startp = p;
4848 const Py_UNICODE *endp = p + size;
4849 /* pointer into the output */
4850 Py_UNICODE *str;
4851 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 char *reason = "character maps to <undefined>";
4854 PyObject *errorHandler = NULL;
4855 PyObject *exc = NULL;
4856 /* the following variable is used for caching string comparisons
4857 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4858 * 3=ignore, 4=xmlcharrefreplace */
4859 int known_errorHandler = -1;
4860
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 if (mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004862 PyErr_BadArgument();
4863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865
4866 /* allocate enough for a simple 1:1 translation without
4867 replacements, if we need more, we'll resize */
4868 res = PyUnicode_FromUnicode(NULL, size);
4869 if (res == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004870 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004872 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004876 /* try to encode it */
4877 PyObject *x = NULL;
4878 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4879 Py_XDECREF(x);
4880 goto onError;
4881 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004882 Py_XDECREF(x);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004883 if (x!=Py_None) /* it worked => adjust input pointer */
4884 ++p;
4885 else { /* untranslatable character */
4886 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4887 Py_ssize_t repsize;
4888 Py_ssize_t newpos;
4889 Py_UNICODE *uni2;
4890 /* startpos for collecting untranslatable chars */
4891 const Py_UNICODE *collstart = p;
4892 const Py_UNICODE *collend = p+1;
4893 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004895 /* find all untranslatable characters */
4896 while (collend < endp) {
4897 if (charmaptranslate_lookup(*collend, mapping, &x))
4898 goto onError;
4899 Py_XDECREF(x);
4900 if (x!=Py_None)
4901 break;
4902 ++collend;
4903 }
4904 /* cache callback name lookup
4905 * (if not done yet, i.e. it's the first error) */
4906 if (known_errorHandler==-1) {
4907 if ((errors==NULL) || (!strcmp(errors, "strict")))
4908 known_errorHandler = 1;
4909 else if (!strcmp(errors, "replace"))
4910 known_errorHandler = 2;
4911 else if (!strcmp(errors, "ignore"))
4912 known_errorHandler = 3;
4913 else if (!strcmp(errors, "xmlcharrefreplace"))
4914 known_errorHandler = 4;
4915 else
4916 known_errorHandler = 0;
4917 }
4918 switch (known_errorHandler) {
4919 case 1: /* strict */
4920 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004921 goto onError;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004922 case 2: /* replace */
4923 /* No need to check for space, this is a 1:1 replacement */
4924 for (coll = collstart; coll<collend; ++coll)
4925 *str++ = '?';
4926 /* fall through */
4927 case 3: /* ignore */
4928 p = collend;
4929 break;
4930 case 4: /* xmlcharrefreplace */
4931 /* generate replacement (temporarily (mis)uses p) */
4932 for (p = collstart; p < collend; ++p) {
4933 char buffer[2+29+1+1];
4934 char *cp;
4935 sprintf(buffer, "&#%d;", (int)*p);
4936 if (charmaptranslate_makespace(&res, &str,
4937 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4938 goto onError;
4939 for (cp = buffer; *cp; ++cp)
4940 *str++ = *cp;
4941 }
4942 p = collend;
4943 break;
4944 default:
4945 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4946 reason, startp, size, &exc,
4947 collstart-startp, collend-startp, &newpos);
4948 if (repunicode == NULL)
4949 goto onError;
4950 /* generate replacement */
4951 repsize = PyUnicode_GET_SIZE(repunicode);
4952 if (charmaptranslate_makespace(&res, &str,
4953 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4954 Py_DECREF(repunicode);
4955 goto onError;
4956 }
4957 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4958 *str++ = *uni2;
4959 p = startp + newpos;
4960 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004961 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004962 }
4963 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 /* Resize if we allocated to much */
4965 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004966 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004967 if (PyUnicode_Resize(&res, respos) < 0)
4968 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 }
4970 Py_XDECREF(exc);
4971 Py_XDECREF(errorHandler);
4972 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004974 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 Py_XDECREF(res);
4976 Py_XDECREF(exc);
4977 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 return NULL;
4979}
4980
4981PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004982 PyObject *mapping,
4983 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984{
4985 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004986
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 str = PyUnicode_FromObject(str);
4988 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004991 PyUnicode_GET_SIZE(str),
4992 mapping,
4993 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 Py_DECREF(str);
4995 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004996
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004997 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 Py_XDECREF(str);
4999 return NULL;
5000}
Tim Petersced69f82003-09-16 20:30:58 +00005001
Guido van Rossum9e896b32000-04-05 20:11:21 +00005002/* --- Decimal Encoder ---------------------------------------------------- */
5003
5004int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005005 Py_ssize_t length,
5006 char *output,
5007 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005008{
5009 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 PyObject *errorHandler = NULL;
5011 PyObject *exc = NULL;
5012 const char *encoding = "decimal";
5013 const char *reason = "invalid decimal Unicode string";
5014 /* the following variable is used for caching string comparisons
5015 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5016 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005017
5018 if (output == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005019 PyErr_BadArgument();
5020 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005021 }
5022
5023 p = s;
5024 end = s + length;
5025 while (p < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005026 register Py_UNICODE ch = *p;
5027 int decimal;
5028 PyObject *repunicode;
5029 Py_ssize_t repsize;
5030 Py_ssize_t newpos;
5031 Py_UNICODE *uni2;
5032 Py_UNICODE *collstart;
5033 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005034
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005035 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005036 *output++ = ' ';
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005037 ++p;
5038 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005039 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005040 decimal = Py_UNICODE_TODECIMAL(ch);
5041 if (decimal >= 0) {
5042 *output++ = '0' + decimal;
5043 ++p;
5044 continue;
5045 }
5046 if (0 < ch && ch < 256) {
5047 *output++ = (char)ch;
5048 ++p;
5049 continue;
5050 }
5051 /* All other characters are considered unencodable */
5052 collstart = p;
5053 collend = p+1;
5054 while (collend < end) {
5055 if ((0 < *collend && *collend < 256) ||
5056 !Py_UNICODE_ISSPACE(*collend) ||
5057 Py_UNICODE_TODECIMAL(*collend))
5058 break;
5059 }
5060 /* cache callback name lookup
5061 * (if not done yet, i.e. it's the first error) */
5062 if (known_errorHandler==-1) {
5063 if ((errors==NULL) || (!strcmp(errors, "strict")))
5064 known_errorHandler = 1;
5065 else if (!strcmp(errors, "replace"))
5066 known_errorHandler = 2;
5067 else if (!strcmp(errors, "ignore"))
5068 known_errorHandler = 3;
5069 else if (!strcmp(errors, "xmlcharrefreplace"))
5070 known_errorHandler = 4;
5071 else
5072 known_errorHandler = 0;
5073 }
5074 switch (known_errorHandler) {
5075 case 1: /* strict */
5076 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5077 goto onError;
5078 case 2: /* replace */
5079 for (p = collstart; p < collend; ++p)
5080 *output++ = '?';
5081 /* fall through */
5082 case 3: /* ignore */
5083 p = collend;
5084 break;
5085 case 4: /* xmlcharrefreplace */
5086 /* generate replacement (temporarily (mis)uses p) */
5087 for (p = collstart; p < collend; ++p)
5088 output += sprintf(output, "&#%d;", (int)*p);
5089 p = collend;
5090 break;
5091 default:
5092 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5093 encoding, reason, s, length, &exc,
5094 collstart-s, collend-s, &newpos);
5095 if (repunicode == NULL)
5096 goto onError;
5097 /* generate replacement */
5098 repsize = PyUnicode_GET_SIZE(repunicode);
5099 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5100 Py_UNICODE ch = *uni2;
5101 if (Py_UNICODE_ISSPACE(ch))
5102 *output++ = ' ';
5103 else {
5104 decimal = Py_UNICODE_TODECIMAL(ch);
5105 if (decimal >= 0)
5106 *output++ = '0' + decimal;
5107 else if (0 < ch && ch < 256)
5108 *output++ = (char)ch;
5109 else {
5110 Py_DECREF(repunicode);
5111 raise_encode_exception(&exc, encoding,
5112 s, length, collstart-s, collend-s, reason);
5113 goto onError;
5114 }
5115 }
5116 }
5117 p = s + newpos;
5118 Py_DECREF(repunicode);
5119 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005120 }
5121 /* 0-terminate the output string */
5122 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 Py_XDECREF(exc);
5124 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005125 return 0;
5126
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005127 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 Py_XDECREF(exc);
5129 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005130 return -1;
5131}
5132
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133/* --- Helpers ------------------------------------------------------------ */
5134
Eric Smitha9f7d622008-02-17 19:46:49 +00005135#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005136
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005137#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005138
Fredrik Lundha50d2012006-05-26 17:04:58 +00005139#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005140
5141#include "stringlib/count.h"
5142#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005143#include "stringlib/partition.h"
5144
/* Helper macro that normalizes the slice bounds held in the local
   variables start and end against (obj)->length: a negative bound is
   taken relative to the length and then clamped to 0, and end is first
   clamped to the length.  Expands to several statements, so it must be
   used as a statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5157
Martin v. Löwis18e16552006-02-15 17:27:45 +00005158Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005159 PyObject *substr,
5160 Py_ssize_t start,
5161 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005163 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005164 PyUnicodeObject* str_obj;
5165 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005166
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005167 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5168 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005169 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005170 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5171 if (!sub_obj) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005172 Py_DECREF(str_obj);
5173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 }
Tim Petersced69f82003-09-16 20:30:58 +00005175
Fredrik Lundhc8162812006-05-26 19:33:03 +00005176 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005177
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005178 result = stringlib_count(
5179 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5180 );
5181
5182 Py_DECREF(sub_obj);
5183 Py_DECREF(str_obj);
5184
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 return result;
5186}
5187
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005189 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005190 Py_ssize_t start,
5191 Py_ssize_t end,
5192 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005194 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005195
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005196 str = PyUnicode_FromObject(str);
5197 if (!str)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005198 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005199 sub = PyUnicode_FromObject(sub);
5200 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005201 Py_DECREF(str);
5202 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 }
Tim Petersced69f82003-09-16 20:30:58 +00005204
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005205 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005206 result = stringlib_find_slice(
5207 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5208 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5209 start, end
5210 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005211 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005212 result = stringlib_rfind_slice(
5213 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5214 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5215 start, end
5216 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005217
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005218 Py_DECREF(str);
5219 Py_DECREF(sub);
5220
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 return result;
5222}
5223
Tim Petersced69f82003-09-16 20:30:58 +00005224static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225int tailmatch(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005226 PyUnicodeObject *substring,
5227 Py_ssize_t start,
5228 Py_ssize_t end,
5229 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 if (substring->length == 0)
5232 return 1;
5233
Fredrik Lundhc8162812006-05-26 19:33:03 +00005234 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235
5236 end -= substring->length;
5237 if (end < start)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005238 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239
5240 if (direction > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005241 if (Py_UNICODE_MATCH(self, end, substring))
5242 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 } else {
5244 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005245 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 }
5247
5248 return 0;
5249}
5250
Martin v. Löwis18e16552006-02-15 17:27:45 +00005251Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005252 PyObject *substr,
5253 Py_ssize_t start,
5254 Py_ssize_t end,
5255 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005257 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 str = PyUnicode_FromObject(str);
5260 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005261 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 substr = PyUnicode_FromObject(substr);
5263 if (substr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005264 Py_DECREF(str);
5265 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 }
Tim Petersced69f82003-09-16 20:30:58 +00005267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005269 (PyUnicodeObject *)substr,
5270 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 Py_DECREF(str);
5272 Py_DECREF(substr);
5273 return result;
5274}
5275
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276/* Apply fixfct filter to the Unicode object self and return a
5277 reference to the modified object */
5278
Tim Petersced69f82003-09-16 20:30:58 +00005279static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005281 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282{
5283
5284 PyUnicodeObject *u;
5285
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005286 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005288 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005289
5290 Py_UNICODE_COPY(u->str, self->str, self->length);
5291
Tim Peters7a29bd52001-09-12 03:03:31 +00005292 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005293 /* fixfct should return TRUE if it modified the buffer. If
5294 FALSE, return a reference to the original buffer instead
5295 (to save space, not time) */
5296 Py_INCREF(self);
5297 Py_DECREF(u);
5298 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 }
5300 return (PyObject*) u;
5301}
5302
Tim Petersced69f82003-09-16 20:30:58 +00005303static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304int fixupper(PyUnicodeObject *self)
5305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 Py_UNICODE *s = self->str;
5308 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005309
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005311 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005312
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005313 ch = Py_UNICODE_TOUPPER(*s);
5314 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005316 *s = ch;
5317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 s++;
5319 }
5320
5321 return status;
5322}
5323
Tim Petersced69f82003-09-16 20:30:58 +00005324static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325int fixlower(PyUnicodeObject *self)
5326{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005327 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 Py_UNICODE *s = self->str;
5329 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005330
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005332 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005333
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005334 ch = Py_UNICODE_TOLOWER(*s);
5335 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005337 *s = ch;
5338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 s++;
5340 }
5341
5342 return status;
5343}
5344
Tim Petersced69f82003-09-16 20:30:58 +00005345static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346int fixswapcase(PyUnicodeObject *self)
5347{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_UNICODE *s = self->str;
5350 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005351
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 while (len-- > 0) {
5353 if (Py_UNICODE_ISUPPER(*s)) {
5354 *s = Py_UNICODE_TOLOWER(*s);
5355 status = 1;
5356 } else if (Py_UNICODE_ISLOWER(*s)) {
5357 *s = Py_UNICODE_TOUPPER(*s);
5358 status = 1;
5359 }
5360 s++;
5361 }
5362
5363 return status;
5364}
5365
Tim Petersced69f82003-09-16 20:30:58 +00005366static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367int fixcapitalize(PyUnicodeObject *self)
5368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005370 Py_UNICODE *s = self->str;
5371 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005372
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005373 if (len == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005374 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005375 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005376 *s = Py_UNICODE_TOUPPER(*s);
5377 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005379 s++;
5380 while (--len > 0) {
5381 if (Py_UNICODE_ISUPPER(*s)) {
5382 *s = Py_UNICODE_TOLOWER(*s);
5383 status = 1;
5384 }
5385 s++;
5386 }
5387 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388}
5389
5390static
5391int fixtitle(PyUnicodeObject *self)
5392{
5393 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5394 register Py_UNICODE *e;
5395 int previous_is_cased;
5396
5397 /* Shortcut for single character strings */
5398 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005399 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5400 if (*p != ch) {
5401 *p = ch;
5402 return 1;
5403 }
5404 else
5405 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 }
Tim Petersced69f82003-09-16 20:30:58 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 e = p + PyUnicode_GET_SIZE(self);
5409 previous_is_cased = 0;
5410 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005411 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005413 if (previous_is_cased)
5414 *p = Py_UNICODE_TOLOWER(ch);
5415 else
5416 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005417
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005418 if (Py_UNICODE_ISLOWER(ch) ||
5419 Py_UNICODE_ISUPPER(ch) ||
5420 Py_UNICODE_ISTITLE(ch))
5421 previous_is_cased = 1;
5422 else
5423 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 }
5425 return 1;
5426}
5427
Tim Peters8ce9f162004-08-27 01:49:32 +00005428PyObject *
5429PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Tim Peters8ce9f162004-08-27 01:49:32 +00005431 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005432 const Py_UNICODE blank = ' ';
5433 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005434 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005436 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5437 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005438 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5439 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005440 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005441 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005442 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
Tim Peters05eba1f2004-08-27 21:32:02 +00005444 fseq = PySequence_Fast(seq, "");
5445 if (fseq == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005446 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005447 }
5448
Tim Peters91879ab2004-08-27 22:35:44 +00005449 /* Grrrr. A codec may be invoked to convert str objects to
5450 * Unicode, and so it's possible to call back into Python code
5451 * during PyUnicode_FromObject(), and so it's possible for a sick
5452 * codec to change the size of fseq (if seq is a list). Therefore
5453 * we have to keep refetching the size -- can't assume seqlen
5454 * is invariant.
5455 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005456 seqlen = PySequence_Fast_GET_SIZE(fseq);
5457 /* If empty sequence, return u"". */
5458 if (seqlen == 0) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005459 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5460 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005461 }
5462 /* If singleton sequence with an exact Unicode, return that. */
5463 if (seqlen == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005464 item = PySequence_Fast_GET_ITEM(fseq, 0);
5465 if (PyUnicode_CheckExact(item)) {
5466 Py_INCREF(item);
5467 res = (PyUnicodeObject *)item;
5468 goto Done;
5469 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005470 }
5471
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 /* At least two items to join, or one that isn't exact Unicode. */
5473 if (seqlen > 1) {
5474 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005475 if (separator == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005476 sep = &blank;
5477 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005478 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005479 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005480 internal_separator = PyUnicode_FromObject(separator);
5481 if (internal_separator == NULL)
5482 goto onError;
5483 sep = PyUnicode_AS_UNICODE(internal_separator);
5484 seplen = PyUnicode_GET_SIZE(internal_separator);
5485 /* In case PyUnicode_FromObject() mutated seq. */
5486 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 }
5488 }
5489
5490 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005491 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005492 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005493 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 res_p = PyUnicode_AS_UNICODE(res);
5495 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005496
Tim Peters05eba1f2004-08-27 21:32:02 +00005497 for (i = 0; i < seqlen; ++i) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005498 Py_ssize_t itemlen;
5499 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005500
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005501 item = PySequence_Fast_GET_ITEM(fseq, i);
5502 /* Convert item to Unicode. */
5503 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5504 PyErr_Format(PyExc_TypeError,
5505 "sequence item %zd: expected string or Unicode,"
5506 " %.80s found",
5507 i, Py_TYPE(item)->tp_name);
5508 goto onError;
5509 }
5510 item = PyUnicode_FromObject(item);
5511 if (item == NULL)
5512 goto onError;
5513 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005514
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005515 /* In case PyUnicode_FromObject() mutated seq. */
5516 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005517
Tim Peters8ce9f162004-08-27 01:49:32 +00005518 /* Make sure we have enough space for the separator and the item. */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005519 itemlen = PyUnicode_GET_SIZE(item);
5520 new_res_used = res_used + itemlen;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005521 if (new_res_used < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005522 goto Overflow;
5523 if (i < seqlen - 1) {
5524 new_res_used += seplen;
5525 if (new_res_used < 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005526 goto Overflow;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005527 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005528 if (new_res_used > res_alloc) {
5529 /* double allocated size until it's big enough */
5530 do {
5531 res_alloc += res_alloc;
5532 if (res_alloc <= 0)
5533 goto Overflow;
5534 } while (new_res_used > res_alloc);
5535 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5536 Py_DECREF(item);
5537 goto onError;
5538 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005539 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005540 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005541
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005542 /* Copy item, and maybe the separator. */
5543 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5544 res_p += itemlen;
5545 if (i < seqlen - 1) {
5546 Py_UNICODE_COPY(res_p, sep, seplen);
5547 res_p += seplen;
5548 }
5549 Py_DECREF(item);
5550 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005551 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005552
Tim Peters05eba1f2004-08-27 21:32:02 +00005553 /* Shrink res to match the used area; this probably can't fail,
5554 * but it's cheap to check.
5555 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005556 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005557 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005558
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005559 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005560 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005561 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 return (PyObject *)res;
5563
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005564 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005565 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005566 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005567 Py_DECREF(item);
5568 /* fall through */
5569
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005570 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005571 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005572 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005573 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return NULL;
5575}
5576
Tim Petersced69f82003-09-16 20:30:58 +00005577static
5578PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005579 Py_ssize_t left,
5580 Py_ssize_t right,
5581 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582{
5583 PyUnicodeObject *u;
5584
5585 if (left < 0)
5586 left = 0;
5587 if (right < 0)
5588 right = 0;
5589
Tim Peters7a29bd52001-09-12 03:03:31 +00005590 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 Py_INCREF(self);
5592 return self;
5593 }
5594
Neal Norwitze7d8be82008-07-31 17:17:14 +00005595 if (left > PY_SSIZE_T_MAX - self->length ||
5596 right > PY_SSIZE_T_MAX - (left + self->length)) {
5597 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5598 return NULL;
5599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 u = _PyUnicode_New(left + self->length + right);
5601 if (u) {
5602 if (left)
5603 Py_UNICODE_FILL(u->str, fill, left);
5604 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5605 if (right)
5606 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5607 }
5608
5609 return u;
5610}
5611
/* Append the slice data[left:right] to the caller's `list` as a new
   Unicode object.

   Relies on three names in the invoking function: a PyObject *str
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when either the object creation or the append fails.  The
   temporary reference held in str is released in every case.

   Wrapped in do { } while (0) so that the macro is a single statement
   and is safe inside an unbraced if/else; invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622
5623static
5624PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005625 PyObject *list,
5626 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005628 register Py_ssize_t i;
5629 register Py_ssize_t j;
5630 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005632 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633
5634 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005635 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005636 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005637 i++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005638 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005639 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5640 i++;
5641 if (j < i) {
5642 if (maxcount-- <= 0)
5643 break;
5644 SPLIT_APPEND(buf, j, i);
5645 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5646 i++;
5647 j = i;
5648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 }
5650 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005651 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 }
5653 return list;
5654
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005655 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 Py_DECREF(list);
5657 return NULL;
5658}
5659
5660PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005661 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005663 register Py_ssize_t i;
5664 register Py_ssize_t j;
5665 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 PyObject *list;
5667 PyObject *str;
5668 Py_UNICODE *data;
5669
5670 string = PyUnicode_FromObject(string);
5671 if (string == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 data = PyUnicode_AS_UNICODE(string);
5674 len = PyUnicode_GET_SIZE(string);
5675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 list = PyList_New(0);
5677 if (!list)
5678 goto onError;
5679
5680 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005681 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005682
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005683 /* Find a line and append it */
5684 while (i < len && !BLOOM_LINEBREAK(data[i]))
5685 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005687 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005688 eol = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005689 if (i < len) {
5690 if (data[i] == '\r' && i + 1 < len &&
5691 data[i+1] == '\n')
5692 i += 2;
5693 else
5694 i++;
5695 if (keepends)
5696 eol = i;
5697 }
5698 SPLIT_APPEND(data, j, eol);
5699 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
5701 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005702 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
5704
5705 Py_DECREF(string);
5706 return list;
5707
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005708 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005709 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 Py_DECREF(string);
5711 return NULL;
5712}
5713
Tim Petersced69f82003-09-16 20:30:58 +00005714static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005716 PyObject *list,
5717 Py_UNICODE ch,
5718 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 register Py_ssize_t i;
5721 register Py_ssize_t j;
5722 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005724 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
5726 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005727 if (buf[i] == ch) {
5728 if (maxcount-- <= 0)
5729 break;
5730 SPLIT_APPEND(buf, j, i);
5731 i = j = i + 1;
5732 } else
5733 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 }
5735 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005736 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 }
5738 return list;
5739
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005740 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 Py_DECREF(list);
5742 return NULL;
5743}
5744
Tim Petersced69f82003-09-16 20:30:58 +00005745static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005747 PyObject *list,
5748 PyUnicodeObject *substring,
5749 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 register Py_ssize_t i;
5752 register Py_ssize_t j;
5753 Py_ssize_t len = self->length;
5754 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 PyObject *str;
5756
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005757 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005758 if (Py_UNICODE_MATCH(self, i, substring)) {
5759 if (maxcount-- <= 0)
5760 break;
5761 SPLIT_APPEND(self->str, j, i);
5762 i = j = i + sublen;
5763 } else
5764 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 }
5766 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005767 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 }
5769 return list;
5770
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005771 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 Py_DECREF(list);
5773 return NULL;
5774}
5775
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005776static
5777PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005778 PyObject *list,
5779 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005781 register Py_ssize_t i;
5782 register Py_ssize_t j;
5783 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005785 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786
5787 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005788 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005789 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005790 i--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005791 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005792 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5793 i--;
5794 if (j > i) {
5795 if (maxcount-- <= 0)
5796 break;
5797 SPLIT_APPEND(buf, i + 1, j + 1);
5798 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5799 i--;
5800 j = i;
5801 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005802 }
5803 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005804 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005806 if (PyList_Reverse(list) < 0)
5807 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005808 return list;
5809
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005810 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811 Py_DECREF(list);
5812 return NULL;
5813}
5814
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005815static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005816PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005817 PyObject *list,
5818 Py_UNICODE ch,
5819 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005821 register Py_ssize_t i;
5822 register Py_ssize_t j;
5823 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005824 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005825 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005826
5827 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005828 if (buf[i] == ch) {
5829 if (maxcount-- <= 0)
5830 break;
5831 SPLIT_APPEND(buf, i + 1, j + 1);
5832 j = i = i - 1;
5833 } else
5834 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005836 if (j >= -1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005837 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005839 if (PyList_Reverse(list) < 0)
5840 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841 return list;
5842
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005843 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844 Py_DECREF(list);
5845 return NULL;
5846}
5847
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005848static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005849PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005850 PyObject *list,
5851 PyUnicodeObject *substring,
5852 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854 register Py_ssize_t i;
5855 register Py_ssize_t j;
5856 Py_ssize_t len = self->length;
5857 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858 PyObject *str;
5859
5860 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005861 if (Py_UNICODE_MATCH(self, i, substring)) {
5862 if (maxcount-- <= 0)
5863 break;
5864 SPLIT_APPEND(self->str, i + sublen, j);
5865 j = i;
5866 i -= sublen;
5867 } else
5868 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869 }
5870 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005871 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005873 if (PyList_Reverse(list) < 0)
5874 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005875 return list;
5876
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005877 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005878 Py_DECREF(list);
5879 return NULL;
5880}
5881
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882#undef SPLIT_APPEND
5883
5884static
5885PyObject *split(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005886 PyUnicodeObject *substring,
5887 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
5889 PyObject *list;
5890
5891 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005892 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
5894 list = PyList_New(0);
5895 if (!list)
5896 return NULL;
5897
5898 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005899 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900
5901 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005902 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
5904 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005905 Py_DECREF(list);
5906 PyErr_SetString(PyExc_ValueError, "empty separator");
5907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 }
5909 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005910 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911}
5912
Tim Petersced69f82003-09-16 20:30:58 +00005913static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005914PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005915 PyUnicodeObject *substring,
5916 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005917{
5918 PyObject *list;
5919
5920 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005921 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005922
5923 list = PyList_New(0);
5924 if (!list)
5925 return NULL;
5926
5927 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005928 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005929
5930 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005931 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005932
5933 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005934 Py_DECREF(list);
5935 PyErr_SetString(PyExc_ValueError, "empty separator");
5936 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005937 }
5938 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005939 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005940}
5941
5942static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005944 PyUnicodeObject *str1,
5945 PyUnicodeObject *str2,
5946 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
5948 PyUnicodeObject *u;
5949
5950 if (maxcount < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005951 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
Fredrik Lundh347ee272006-05-24 16:35:18 +00005953 if (str1->length == str2->length) {
5954 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005955 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005956 if (str1->length == 1) {
5957 /* replace characters */
5958 Py_UNICODE u1, u2;
5959 if (!findchar(self->str, self->length, str1->str[0]))
5960 goto nothing;
5961 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5962 if (!u)
5963 return NULL;
5964 Py_UNICODE_COPY(u->str, self->str, self->length);
5965 u1 = str1->str[0];
5966 u2 = str2->str[0];
5967 for (i = 0; i < u->length; i++)
5968 if (u->str[i] == u1) {
5969 if (--maxcount < 0)
5970 break;
5971 u->str[i] = u2;
5972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005974 i = fastsearch(
5975 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005977 if (i < 0)
5978 goto nothing;
5979 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5980 if (!u)
5981 return NULL;
5982 Py_UNICODE_COPY(u->str, self->str, self->length);
5983 while (i <= self->length - str1->length)
5984 if (Py_UNICODE_MATCH(self, i, str1)) {
5985 if (--maxcount < 0)
5986 break;
5987 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5988 i += str1->length;
5989 } else
5990 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005993
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005994 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005995 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 Py_UNICODE *p;
5997
5998 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005999 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 if (n > maxcount)
6001 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006002 if (n == 0)
6003 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006004 /* new_size = self->length + n * (str2->length - str1->length)); */
6005 delta = (str2->length - str1->length);
6006 if (delta == 0) {
6007 new_size = self->length;
6008 } else {
6009 product = n * (str2->length - str1->length);
6010 if ((product / (str2->length - str1->length)) != n) {
6011 PyErr_SetString(PyExc_OverflowError,
6012 "replace string is too long");
6013 return NULL;
6014 }
6015 new_size = self->length + product;
6016 if (new_size < 0) {
6017 PyErr_SetString(PyExc_OverflowError,
6018 "replace string is too long");
6019 return NULL;
6020 }
6021 }
6022 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006023 if (!u)
6024 return NULL;
6025 i = 0;
6026 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006027 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006028 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006029 while (n-- > 0) {
6030 /* look for next match */
6031 j = i;
6032 while (j <= e) {
6033 if (Py_UNICODE_MATCH(self, j, str1))
6034 break;
6035 j++;
6036 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006037 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006038 if (j > e)
6039 break;
6040 /* copy unchanged part [i:j] */
6041 Py_UNICODE_COPY(p, self->str+i, j-i);
6042 p += j - i;
6043 }
6044 /* copy substitution string */
6045 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006046 Py_UNICODE_COPY(p, str2->str, str2->length);
6047 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006048 }
6049 i = j + str1->length;
6050 }
6051 if (i < self->length)
6052 /* copy tail [i:] */
6053 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006054 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006055 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006056 while (n > 0) {
6057 Py_UNICODE_COPY(p, str2->str, str2->length);
6058 p += str2->length;
6059 if (--n <= 0)
6060 break;
6061 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006063 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 }
6065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006067
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006068 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006069 /* nothing to replace; return original string (when possible) */
6070 if (PyUnicode_CheckExact(self)) {
6071 Py_INCREF(self);
6072 return (PyObject *) self;
6073 }
6074 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075}
6076
6077/* --- Unicode Object Methods --------------------------------------------- */
6078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006079PyDoc_STRVAR(title__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006080 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081\n\
6082Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006083characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
6085static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006086unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 return fixup(self, fixtitle);
6089}
6090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006092 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093\n\
6094Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096
6097static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006098unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 return fixup(self, fixcapitalize);
6101}
6102
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self with split(self, NULL, -1), runs every piece through
   fixup(..., fixcapitalize) and joins the pieces with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;       /* item is NULL, so NULL is returned below */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6140
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006141/* Argument converter. Coerces to a single unicode character */
6142
6143static int
6144convert_uc(PyObject *obj, void *addr)
6145{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006146 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6147 PyObject *uniobj;
6148 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006149
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006150 uniobj = PyUnicode_FromObject(obj);
6151 if (uniobj == NULL) {
6152 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006153 "The fill character cannot be converted to Unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006154 return 0;
6155 }
6156 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6157 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006158 "The fill character must be exactly one character long");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006159 Py_DECREF(uniobj);
6160 return 0;
6161 }
6162 unistr = PyUnicode_AS_UNICODE(uniobj);
6163 *fillcharloc = unistr[0];
6164 Py_DECREF(uniobj);
6165 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006166}
6167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006168PyDoc_STRVAR(center__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006169 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006171Return S centered in a Unicode string of length width. Padding is\n\
6172done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
6174static PyObject *
6175unicode_center(PyUnicodeObject *self, PyObject *args)
6176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006177 Py_ssize_t marg, left;
6178 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006179 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Thomas Woutersde017742006-02-16 19:34:37 +00006181 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return NULL;
6183
Tim Peters7a29bd52001-09-12 03:03:31 +00006184 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 Py_INCREF(self);
6186 return (PyObject*) self;
6187 }
6188
6189 marg = width - self->length;
6190 left = marg / 2 + (marg & width & 1);
6191
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006192 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193}
6194
Marc-André Lemburge5034372000-08-08 08:04:29 +00006195#if 0
6196
6197/* This code should go into some future Unicode collation support
6198 module. The basic comparison should compare ordinals on a naive
Georg Brandla3c242c2009-10-27 14:19:50 +00006199 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006200
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006201/* speedy UTF-16 code point order comparison */
6202/* gleaned from: */
6203/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6204
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006205static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006206{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006207 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006208 0, 0, 0, 0, 0, 0, 0, 0,
6209 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006210 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006211};
6212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213static int
6214unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006216 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006217
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 Py_UNICODE *s1 = str1->str;
6219 Py_UNICODE *s2 = str2->str;
6220
6221 len1 = str1->length;
6222 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006223
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006225 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006226
6227 c1 = *s1++;
6228 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006229
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006230 if (c1 > (1<<11) * 26)
6231 c1 += utf16Fixup[c1>>11];
6232 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006233 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006234 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006235
6236 if (c1 != c2)
6237 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006238
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006239 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 }
6241
6242 return (len1 < len2) ? -1 : (len1 != len2);
6243}
6244
Marc-André Lemburge5034372000-08-08 08:04:29 +00006245#else
6246
6247static int
6248unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006250 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006251
6252 Py_UNICODE *s1 = str1->str;
6253 Py_UNICODE *s2 = str2->str;
6254
6255 len1 = str1->length;
6256 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006257
Marc-André Lemburge5034372000-08-08 08:04:29 +00006258 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006259 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006260
Fredrik Lundh45714e92001-06-26 16:39:36 +00006261 c1 = *s1++;
6262 c2 = *s2++;
6263
6264 if (c1 != c2)
6265 return (c1 < c2) ? -1 : 1;
6266
Marc-André Lemburge5034372000-08-08 08:04:29 +00006267 len1--; len2--;
6268 }
6269
6270 return (len1 < len2) ? -1 : (len1 != len2);
6271}
6272
6273#endif
6274
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275int PyUnicode_Compare(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006276 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277{
6278 PyUnicodeObject *u = NULL, *v = NULL;
6279 int result;
6280
6281 /* Coerce the two arguments */
6282 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6283 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006284 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6286 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006287 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
Thomas Wouters7e474022000-07-16 12:04:32 +00006289 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 if (v == u) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006291 Py_DECREF(u);
6292 Py_DECREF(v);
6293 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 }
6295
6296 result = unicode_compare(u, v);
6297
6298 Py_DECREF(u);
6299 Py_DECREF(v);
6300 return result;
6301
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006302 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_XDECREF(u);
6304 Py_XDECREF(v);
6305 return -1;
6306}
6307
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006308PyObject *PyUnicode_RichCompare(PyObject *left,
6309 PyObject *right,
6310 int op)
6311{
6312 int result;
6313
6314 result = PyUnicode_Compare(left, right);
6315 if (result == -1 && PyErr_Occurred())
6316 goto onError;
6317
6318 /* Convert the return value to a Boolean */
6319 switch (op) {
6320 case Py_EQ:
6321 result = (result == 0);
6322 break;
6323 case Py_NE:
6324 result = (result != 0);
6325 break;
6326 case Py_LE:
6327 result = (result <= 0);
6328 break;
6329 case Py_GE:
6330 result = (result >= 0);
6331 break;
6332 case Py_LT:
6333 result = (result == -1);
6334 break;
6335 case Py_GT:
6336 result = (result == 1);
6337 break;
6338 }
6339 return PyBool_FromLong(result);
6340
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006341 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006342
6343 /* Standard case
6344
6345 Type errors mean that PyUnicode_FromObject() could not convert
6346 one of the arguments (usually the right hand side) to Unicode,
6347 ie. we can't handle the comparison request. However, it is
6348 possible that the other object knows a comparison method, which
6349 is why we return Py_NotImplemented to give the other object a
6350 chance.
6351
6352 */
6353 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6354 PyErr_Clear();
6355 Py_INCREF(Py_NotImplemented);
6356 return Py_NotImplemented;
6357 }
6358 if (op != Py_EQ && op != Py_NE)
6359 return NULL;
6360
6361 /* Equality comparison.
6362
6363 This is a special case: we silence any PyExc_UnicodeDecodeError
6364 and instead turn it into a PyErr_UnicodeWarning.
6365
6366 */
6367 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6368 return NULL;
6369 PyErr_Clear();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006370 if (PyErr_Warn(PyExc_UnicodeWarning,
6371 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006372 "Unicode equal comparison "
6373 "failed to convert both arguments to Unicode - "
6374 "interpreting them as being unequal" :
6375 "Unicode unequal comparison "
6376 "failed to convert both arguments to Unicode - "
6377 "interpreting them as being unequal"
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006378 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006379 return NULL;
6380 result = (op == Py_NE);
6381 return PyBool_FromLong(result);
6382}
6383
Guido van Rossum403d68b2000-03-13 15:55:09 +00006384int PyUnicode_Contains(PyObject *container,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006385 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006386{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006387 PyObject *str, *sub;
6388 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006389
6390 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006391 sub = PyUnicode_FromObject(element);
6392 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006393 PyErr_SetString(PyExc_TypeError,
6394 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006395 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006396 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006397
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006398 str = PyUnicode_FromObject(container);
6399 if (!str) {
6400 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006401 return -1;
6402 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006403
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006404 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006405
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006406 Py_DECREF(str);
6407 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006408
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006409 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006410}
6411
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412/* Concat to string or Unicode object giving a new Unicode object. */
6413
6414PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006415 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
6417 PyUnicodeObject *u = NULL, *v = NULL, *w;
6418
6419 /* Coerce the two arguments */
6420 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6421 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6424 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
6427 /* Shortcuts */
6428 if (v == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006429 Py_DECREF(v);
6430 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 }
6432 if (u == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006433 Py_DECREF(u);
6434 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 }
6436
6437 /* Concat the two Unicode strings */
6438 w = _PyUnicode_New(u->length + v->length);
6439 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 Py_UNICODE_COPY(w->str, u->str, u->length);
6442 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6443
6444 Py_DECREF(u);
6445 Py_DECREF(v);
6446 return (PyObject *)w;
6447
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006448 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 Py_XDECREF(u);
6450 Py_XDECREF(v);
6451 return NULL;
6452}
6453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006454PyDoc_STRVAR(count__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006455 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006457Return the number of non-overlapping occurrences of substring sub in\n\
6458Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006459interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
6461static PyObject *
6462unicode_count(PyUnicodeObject *self, PyObject *args)
6463{
6464 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006465 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006466 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 PyObject *result;
6468
Guido van Rossumb8872e62000-05-09 14:14:27 +00006469 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006470 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 return NULL;
6472
6473 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006474 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006476 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006477
Fredrik Lundhc8162812006-05-26 19:33:03 +00006478 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006480 result = PyInt_FromSsize_t(
6481 stringlib_count(self->str + start, end - start,
6482 substring->str, substring->length)
6483 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484
6485 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006486
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 return result;
6488}
6489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006490PyDoc_STRVAR(encode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006491 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006493Encodes S using the codec registered for encoding. encoding defaults\n\
6494to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006495handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6497'xmlcharrefreplace' as well as any other name registered with\n\
6498codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499
6500static PyObject *
6501unicode_encode(PyUnicodeObject *self, PyObject *args)
6502{
6503 char *encoding = NULL;
6504 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006505 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006506
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6508 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006509 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006510 if (v == NULL)
6511 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006512 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006513 PyErr_Format(PyExc_TypeError,
6514 "encoder did not return a string/unicode object "
6515 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006516 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006517 Py_DECREF(v);
6518 return NULL;
6519 }
6520 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006521
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006522 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006523 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006524}
6525
6526PyDoc_STRVAR(decode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006527 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006528\n\
6529Decodes S using the codec registered for encoding. encoding defaults\n\
6530to the default encoding. errors may be given to set a different error\n\
6531handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6532a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6533as well as any other name registerd with codecs.register_error that is\n\
6534able to handle UnicodeDecodeErrors.");
6535
6536static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006537unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006538{
6539 char *encoding = NULL;
6540 char *errors = NULL;
6541 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006542
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006543 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6544 return NULL;
6545 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006546 if (v == NULL)
6547 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006548 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006549 PyErr_Format(PyExc_TypeError,
6550 "decoder did not return a string/unicode object "
6551 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006552 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006553 Py_DECREF(v);
6554 return NULL;
6555 }
6556 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006557
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006558 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560}
6561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006562PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006563 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564\n\
6565Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
6568static PyObject*
6569unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6570{
6571 Py_UNICODE *e;
6572 Py_UNICODE *p;
6573 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006574 Py_UNICODE *qe;
6575 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 PyUnicodeObject *u;
6577 int tabsize = 8;
6578
6579 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
Thomas Wouters7e474022000-07-16 12:04:32 +00006582 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006583 i = 0; /* chars up to and including most recent \n or \r */
6584 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6585 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 for (p = self->str; p < e; p++)
6587 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006588 if (tabsize > 0) {
6589 incr = tabsize - (j % tabsize); /* cannot overflow */
6590 if (j > PY_SSIZE_T_MAX - incr)
6591 goto overflow1;
6592 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006593 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006596 if (j > PY_SSIZE_T_MAX - 1)
6597 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 j++;
6599 if (*p == '\n' || *p == '\r') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006600 if (i > PY_SSIZE_T_MAX - j)
6601 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006603 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 }
6605 }
6606
Guido van Rossum5bdff602008-03-11 21:18:06 +00006607 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006608 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Second pass: create output string and fill it */
6611 u = _PyUnicode_New(i + j);
6612 if (!u)
6613 return NULL;
6614
Guido van Rossum5bdff602008-03-11 21:18:06 +00006615 j = 0; /* same as in first pass */
6616 q = u->str; /* next output char */
6617 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
6619 for (p = self->str; p < e; p++)
6620 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006621 if (tabsize > 0) {
6622 i = tabsize - (j % tabsize);
6623 j += i;
6624 while (i--) {
6625 if (q >= qe)
6626 goto overflow2;
6627 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006628 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006629 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006630 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006631 else {
6632 if (q >= qe)
6633 goto overflow2;
6634 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006635 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 if (*p == '\n' || *p == '\r')
6637 j = 0;
6638 }
6639
6640 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006641
6642 overflow2:
6643 Py_DECREF(u);
6644 overflow1:
6645 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649PyDoc_STRVAR(find__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006650 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651\n\
6652Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006653such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654arguments start and end are interpreted as in slice notation.\n\
6655\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006656Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
6658static PyObject *
6659unicode_find(PyUnicodeObject *self, PyObject *args)
6660{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006661 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006662 Py_ssize_t start;
6663 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006664 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
Facundo Batista57d56692007-11-16 18:04:14 +00006666 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006669 result = stringlib_find_slice(
6670 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6671 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6672 start, end
6673 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006676
6677 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
6680static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006681unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682{
6683 if (index < 0 || index >= self->length) {
6684 PyErr_SetString(PyExc_IndexError, "string index out of range");
6685 return NULL;
6686 }
6687
6688 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6689}
6690
6691static long
6692unicode_hash(PyUnicodeObject *self)
6693{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006694 /* Since Unicode objects compare equal to their ASCII string
6695 counterparts, they should use the individual character values
6696 as basis for their hash value. This is needed to assure that
6697 strings and Unicode objects behave in the same way as
6698 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
Martin v. Löwis18e16552006-02-15 17:27:45 +00006700 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006701 register Py_UNICODE *p;
6702 register long x;
6703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 if (self->hash != -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006705 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006706 len = PyUnicode_GET_SIZE(self);
6707 p = PyUnicode_AS_UNICODE(self);
6708 x = *p << 7;
6709 while (--len >= 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006710 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006711 x ^= PyUnicode_GET_SIZE(self);
6712 if (x == -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006713 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006714 self->hash = x;
6715 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716}
6717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006718PyDoc_STRVAR(index__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006719 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722
6723static PyObject *
6724unicode_index(PyUnicodeObject *self, PyObject *args)
6725{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006726 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006727 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006728 Py_ssize_t start;
6729 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
Facundo Batista57d56692007-11-16 18:04:14 +00006731 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006734 result = stringlib_find_slice(
6735 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6736 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6737 start, end
6738 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
6740 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 if (result < 0) {
6743 PyErr_SetString(PyExc_ValueError, "substring not found");
6744 return NULL;
6745 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006746
Martin v. Löwis18e16552006-02-15 17:27:45 +00006747 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748}
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(islower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006751 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006753Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006757unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
6759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6760 register const Py_UNICODE *e;
6761 int cased;
6762
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006765 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006767 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006768 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006769 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 e = p + PyUnicode_GET_SIZE(self);
6772 cased = 0;
6773 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006774 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006776 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6777 return PyBool_FromLong(0);
6778 else if (!cased && Py_UNICODE_ISLOWER(ch))
6779 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782}
6783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006785 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006787Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
6790static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006791unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792{
6793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6794 register const Py_UNICODE *e;
6795 int cased;
6796
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 /* Shortcut for single character strings */
6798 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006799 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006801 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006802 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006803 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006804
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 e = p + PyUnicode_GET_SIZE(self);
6806 cased = 0;
6807 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006808 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006809
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006810 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6811 return PyBool_FromLong(0);
6812 else if (!cased && Py_UNICODE_ISUPPER(ch))
6813 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006815 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816}
6817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006818PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006819 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006821Return True if S is a titlecased string and there is at least one\n\
6822character in S, i.e. upper- and titlecase characters may only\n\
6823follow uncased characters and lowercase characters only cased ones.\n\
6824Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
6826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006827unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828{
6829 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6830 register const Py_UNICODE *e;
6831 int cased, previous_is_cased;
6832
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 /* Shortcut for single character strings */
6834 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006835 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6836 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006838 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006839 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006840 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006841
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 e = p + PyUnicode_GET_SIZE(self);
6843 cased = 0;
6844 previous_is_cased = 0;
6845 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006846 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006847
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6849 if (previous_is_cased)
6850 return PyBool_FromLong(0);
6851 previous_is_cased = 1;
6852 cased = 1;
6853 }
6854 else if (Py_UNICODE_ISLOWER(ch)) {
6855 if (!previous_is_cased)
6856 return PyBool_FromLong(0);
6857 previous_is_cased = 1;
6858 cased = 1;
6859 }
6860 else
6861 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006866PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006867 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006869Return True if all characters in S are whitespace\n\
6870and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871
6872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006873unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
6875 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6876 register const Py_UNICODE *e;
6877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 /* Shortcut for single character strings */
6879 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006880 Py_UNICODE_ISSPACE(*p))
6881 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006883 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006884 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006885 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 e = p + PyUnicode_GET_SIZE(self);
6888 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006889 if (!Py_UNICODE_ISSPACE(*p))
6890 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893}
6894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006895PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006896 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006897\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006898Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006899and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006900
6901static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006902unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006903{
6904 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6905 register const Py_UNICODE *e;
6906
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006907 /* Shortcut for single character strings */
6908 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006909 Py_UNICODE_ISALPHA(*p))
6910 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006911
6912 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006913 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006914 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915
6916 e = p + PyUnicode_GET_SIZE(self);
6917 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006918 if (!Py_UNICODE_ISALPHA(*p))
6919 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006922}
6923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006925 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006926\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006927Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006928and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006929
6930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006931unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006932{
6933 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6934 register const Py_UNICODE *e;
6935
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006936 /* Shortcut for single character strings */
6937 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006938 Py_UNICODE_ISALNUM(*p))
6939 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006940
6941 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006942 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006943 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006944
6945 e = p + PyUnicode_GET_SIZE(self);
6946 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006947 if (!Py_UNICODE_ISALNUM(*p))
6948 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006951}
6952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006953PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006954 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
6959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006960unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
6962 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6963 register const Py_UNICODE *e;
6964
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 /* Shortcut for single character strings */
6966 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006967 Py_UNICODE_ISDECIMAL(*p))
6968 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006970 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006971 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 e = p + PyUnicode_GET_SIZE(self);
6975 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006976 if (!Py_UNICODE_ISDECIMAL(*p))
6977 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006979 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980}
6981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006983 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006985Return True if all characters in S are digits\n\
6986and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987
6988static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006989unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990{
6991 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6992 register const Py_UNICODE *e;
6993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 /* Shortcut for single character strings */
6995 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006996 Py_UNICODE_ISDIGIT(*p))
6997 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006999 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007000 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007001 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 e = p + PyUnicode_GET_SIZE(self);
7004 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007005 if (!Py_UNICODE_ISDIGIT(*p))
7006 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007008 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009}
7010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007011PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007012 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007014Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007015False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016
7017static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007018unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019{
7020 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7021 register const Py_UNICODE *e;
7022
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 /* Shortcut for single character strings */
7024 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007025 Py_UNICODE_ISNUMERIC(*p))
7026 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007028 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007029 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007030 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007031
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 e = p + PyUnicode_GET_SIZE(self);
7033 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007034 if (!Py_UNICODE_ISNUMERIC(*p))
7035 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007037 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040PyDoc_STRVAR(join__doc__,
Georg Brandl5d2eb342009-10-27 15:08:27 +00007041 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042\n\
7043Return a string which is the concatenation of the strings in the\n\
Georg Brandl5d2eb342009-10-27 15:08:27 +00007044iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
7046static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007049 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050}
7051
Martin v. Löwis18e16552006-02-15 17:27:45 +00007052static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053unicode_length(PyUnicodeObject *self)
7054{
7055 return self->length;
7056}
7057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007059 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007061Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007062done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064static PyObject *
7065unicode_ljust(PyUnicodeObject *self, PyObject *args)
7066{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007067 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007068 Py_UNICODE fillchar = ' ';
7069
Martin v. Löwis412fb672006-04-13 06:34:32 +00007070 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 return NULL;
7072
Tim Peters7a29bd52001-09-12 03:03:31 +00007073 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 Py_INCREF(self);
7075 return (PyObject*) self;
7076 }
7077
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007078 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079}
7080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081PyDoc_STRVAR(lower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007082 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
7086static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007087unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 return fixup(self, fixlower);
7090}
7091
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its three leading
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)]+3)
7100
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101/* externally visible for str.strip(unicode) */
7102PyObject *
7103_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7104{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007105 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7106 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7107 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7108 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7109 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007111 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007112
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007113 i = 0;
7114 if (striptype != RIGHTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007115 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7116 i++;
7117 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007118 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007120 j = len;
7121 if (striptype != LEFTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007122 do {
7123 j--;
7124 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7125 j++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007126 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007127
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007128 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007129 Py_INCREF(self);
7130 return (PyObject*)self;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007131 }
7132 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007133 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134}
7135
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136
7137static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007138do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007140 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7141 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007142
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007143 i = 0;
7144 if (striptype != RIGHTSTRIP) {
7145 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7146 i++;
7147 }
7148 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007149
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007150 j = len;
7151 if (striptype != LEFTSTRIP) {
7152 do {
7153 j--;
7154 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7155 j++;
7156 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007157
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007158 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7159 Py_INCREF(self);
7160 return (PyObject*)self;
7161 }
7162 else
7163 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164}
7165
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007166
7167static PyObject *
7168do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7169{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007170 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007171
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007172 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7173 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007174
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007175 if (sep != NULL && sep != Py_None) {
7176 if (PyUnicode_Check(sep))
7177 return _PyUnicode_XStrip(self, striptype, sep);
7178 else if (PyString_Check(sep)) {
7179 PyObject *res;
7180 sep = PyUnicode_FromObject(sep);
7181 if (sep==NULL)
7182 return NULL;
7183 res = _PyUnicode_XStrip(self, striptype, sep);
7184 Py_DECREF(sep);
7185 return res;
7186 }
7187 else {
7188 PyErr_Format(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007189 "%s arg must be None, unicode or str",
7190 STRIPNAME(striptype));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007191 return NULL;
7192 }
7193 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007194
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007195 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196}
7197
7198
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007199PyDoc_STRVAR(strip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007200 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007201\n\
7202Return a copy of the string S with leading and trailing\n\
7203whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007204If chars is given and not None, remove characters in chars instead.\n\
7205If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007206
7207static PyObject *
7208unicode_strip(PyUnicodeObject *self, PyObject *args)
7209{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007210 if (PyTuple_GET_SIZE(args) == 0)
7211 return do_strip(self, BOTHSTRIP); /* Common case */
7212 else
7213 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007214}
7215
7216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007217PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007218 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007219\n\
7220Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007221If chars is given and not None, remove characters in chars instead.\n\
7222If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007223
7224static PyObject *
7225unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7226{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007227 if (PyTuple_GET_SIZE(args) == 0)
7228 return do_strip(self, LEFTSTRIP); /* Common case */
7229 else
7230 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007231}
7232
7233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007235 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007236\n\
7237Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007238If chars is given and not None, remove characters in chars instead.\n\
7239If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007240
7241static PyObject *
7242unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7243{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007244 if (PyTuple_GET_SIZE(args) == 0)
7245 return do_strip(self, RIGHTSTRIP); /* Common case */
7246 else
7247 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007248}
7249
7250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
7254 PyUnicodeObject *u;
7255 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007257 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258
7259 if (len < 0)
7260 len = 0;
7261
Tim Peters7a29bd52001-09-12 03:03:31 +00007262 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 /* no repeat, return original string */
7264 Py_INCREF(str);
7265 return (PyObject*) str;
7266 }
Tim Peters8f422462000-09-09 06:13:41 +00007267
7268 /* ensure # of chars needed doesn't overflow int and # of bytes
7269 * needed doesn't overflow size_t
7270 */
7271 nchars = len * str->length;
7272 if (len && nchars / len != str->length) {
7273 PyErr_SetString(PyExc_OverflowError,
7274 "repeated string is too long");
7275 return NULL;
7276 }
7277 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7278 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7279 PyErr_SetString(PyExc_OverflowError,
7280 "repeated string is too long");
7281 return NULL;
7282 }
7283 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 if (!u)
7285 return NULL;
7286
7287 p = u->str;
7288
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007289 if (str->length == 1 && len > 0) {
7290 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007291 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007292 Py_ssize_t done = 0; /* number of characters copied this far */
7293 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007294 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007295 done = str->length;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007296 }
7297 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007298 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007299 Py_UNICODE_COPY(p+done, p, n);
7300 done += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007301 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303
7304 return (PyObject*) u;
7305}
7306
7307PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007308 PyObject *subobj,
7309 PyObject *replobj,
7310 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311{
7312 PyObject *self;
7313 PyObject *str1;
7314 PyObject *str2;
7315 PyObject *result;
7316
7317 self = PyUnicode_FromObject(obj);
7318 if (self == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 str1 = PyUnicode_FromObject(subobj);
7321 if (str1 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007322 Py_DECREF(self);
7323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 }
7325 str2 = PyUnicode_FromObject(replobj);
7326 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007327 Py_DECREF(self);
7328 Py_DECREF(str1);
7329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 }
Tim Petersced69f82003-09-16 20:30:58 +00007331 result = replace((PyUnicodeObject *)self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007332 (PyUnicodeObject *)str1,
7333 (PyUnicodeObject *)str2,
7334 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 Py_DECREF(self);
7336 Py_DECREF(str1);
7337 Py_DECREF(str2);
7338 return result;
7339}
7340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007341PyDoc_STRVAR(replace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007342 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343\n\
7344Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007345old replaced by new. If the optional argument count is\n\
7346given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347
7348static PyObject*
7349unicode_replace(PyUnicodeObject *self, PyObject *args)
7350{
7351 PyUnicodeObject *str1;
7352 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007353 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 PyObject *result;
7355
Martin v. Löwis18e16552006-02-15 17:27:45 +00007356 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 return NULL;
7358 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7359 if (str1 == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007362 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007363 Py_DECREF(str1);
7364 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366
7367 result = replace(self, str1, str2, maxcount);
7368
7369 Py_DECREF(str1);
7370 Py_DECREF(str2);
7371 return result;
7372}
7373
7374static
7375PyObject *unicode_repr(PyObject *unicode)
7376{
7377 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007378 PyUnicode_GET_SIZE(unicode),
7379 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380}
7381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007382PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007383 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384\n\
7385Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007386such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387arguments start and end are interpreted as in slice notation.\n\
7388\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007389Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390
7391static PyObject *
7392unicode_rfind(PyUnicodeObject *self, PyObject *args)
7393{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007394 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007395 Py_ssize_t start;
7396 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007397 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
Facundo Batista57d56692007-11-16 18:04:14 +00007399 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007402 result = stringlib_rfind_slice(
7403 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7404 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7405 start, end
7406 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
7408 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007409
7410 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411}
7412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007413PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007414 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
7418static PyObject *
7419unicode_rindex(PyUnicodeObject *self, PyObject *args)
7420{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007421 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007422 Py_ssize_t start;
7423 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007424 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425
Facundo Batista57d56692007-11-16 18:04:14 +00007426 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007429 result = stringlib_rfind_slice(
7430 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7431 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7432 start, end
7433 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434
7435 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007436
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 if (result < 0) {
7438 PyErr_SetString(PyExc_ValueError, "substring not found");
7439 return NULL;
7440 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442}
7443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007444PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007445 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007447Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007448done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
7450static PyObject *
7451unicode_rjust(PyUnicodeObject *self, PyObject *args)
7452{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007453 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007454 Py_UNICODE fillchar = ' ';
7455
Martin v. Löwis412fb672006-04-13 06:34:32 +00007456 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 return NULL;
7458
Tim Peters7a29bd52001-09-12 03:03:31 +00007459 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 Py_INCREF(self);
7461 return (PyObject*) self;
7462 }
7463
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007464 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465}
7466
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007468unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469{
7470 /* standard clamping */
7471 if (start < 0)
7472 start = 0;
7473 if (end < 0)
7474 end = 0;
7475 if (end > self->length)
7476 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007477 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 /* full slice, return original string */
7479 Py_INCREF(self);
7480 return (PyObject*) self;
7481 }
7482 if (start > end)
7483 start = end;
7484 /* copy slice */
7485 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007486 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487}
7488
7489PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007490 PyObject *sep,
7491 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492{
7493 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007494
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 s = PyUnicode_FromObject(s);
7496 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007497 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007498 if (sep != NULL) {
7499 sep = PyUnicode_FromObject(sep);
7500 if (sep == NULL) {
7501 Py_DECREF(s);
7502 return NULL;
7503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 }
7505
7506 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7507
7508 Py_DECREF(s);
7509 Py_XDECREF(sep);
7510 return result;
7511}
7512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007513PyDoc_STRVAR(split__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007514 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515\n\
7516Return a list of the words in S, using sep as the\n\
7517delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007518splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007519whitespace string is a separator and empty strings are\n\
7520removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521
7522static PyObject*
7523unicode_split(PyUnicodeObject *self, PyObject *args)
7524{
7525 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007526 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
Martin v. Löwis18e16552006-02-15 17:27:45 +00007528 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 return NULL;
7530
7531 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007532 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007534 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007536 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537}
7538
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007539PyObject *
7540PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7541{
7542 PyObject* str_obj;
7543 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007544 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007545
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007546 str_obj = PyUnicode_FromObject(str_in);
7547 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007548 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007549 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007550 if (!sep_obj) {
7551 Py_DECREF(str_obj);
7552 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553 }
7554
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007555 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007556 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7557 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7558 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007559
Fredrik Lundhb9479482006-05-26 17:22:38 +00007560 Py_DECREF(sep_obj);
7561 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007562
7563 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007564}
7565
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007566
7567PyObject *
7568PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7569{
7570 PyObject* str_obj;
7571 PyObject* sep_obj;
7572 PyObject* out;
7573
7574 str_obj = PyUnicode_FromObject(str_in);
7575 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007576 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007577 sep_obj = PyUnicode_FromObject(sep_in);
7578 if (!sep_obj) {
7579 Py_DECREF(str_obj);
7580 return NULL;
7581 }
7582
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007583 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007584 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7585 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7586 );
7587
7588 Py_DECREF(sep_obj);
7589 Py_DECREF(str_obj);
7590
7591 return out;
7592}
7593
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007594PyDoc_STRVAR(partition__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007595 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007596\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007597Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007598the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007599found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007600
7601static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007602unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007603{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007604 return PyUnicode_Partition((PyObject *)self, separator);
7605}
7606
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007607PyDoc_STRVAR(rpartition__doc__,
Ezio Melottidabb5f72010-01-25 11:46:11 +00007608 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007609\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007610Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007611the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007612separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007613
7614static PyObject*
7615unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7616{
7617 return PyUnicode_RPartition((PyObject *)self, separator);
7618}
7619
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007620PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007621 PyObject *sep,
7622 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007623{
7624 PyObject *result;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007625
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007626 s = PyUnicode_FromObject(s);
7627 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007628 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007629 if (sep != NULL) {
7630 sep = PyUnicode_FromObject(sep);
7631 if (sep == NULL) {
7632 Py_DECREF(s);
7633 return NULL;
7634 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007635 }
7636
7637 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7638
7639 Py_DECREF(s);
7640 Py_XDECREF(sep);
7641 return result;
7642}
7643
7644PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007645 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007646\n\
7647Return a list of the words in S, using sep as the\n\
7648delimiter string, starting at the end of the string and\n\
7649working to the front. If maxsplit is given, at most maxsplit\n\
7650splits are done. If sep is not specified, any whitespace string\n\
7651is a separator.");
7652
7653static PyObject*
7654unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7655{
7656 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007657 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007658
Martin v. Löwis18e16552006-02-15 17:27:45 +00007659 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007660 return NULL;
7661
7662 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007663 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007664 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007665 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007666 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007667 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007668}
7669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007670PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007671 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672\n\
7673Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007674Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007675is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676
7677static PyObject*
7678unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7679{
Guido van Rossum86662912000-04-11 15:38:46 +00007680 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681
Guido van Rossum86662912000-04-11 15:38:46 +00007682 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683 return NULL;
7684
Guido van Rossum86662912000-04-11 15:38:46 +00007685 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686}
7687
7688static
7689PyObject *unicode_str(PyUnicodeObject *self)
7690{
Fred Drakee4315f52000-05-09 19:53:39 +00007691 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692}
7693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007694PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007695 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696\n\
7697Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007698and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699
7700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007701unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 return fixup(self, fixswapcase);
7704}
7705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706PyDoc_STRVAR(translate__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007707 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708\n\
7709Return a copy of the string S, where all characters have been mapped\n\
7710through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007711Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7712Unmapped characters are left untouched. Characters mapped to None\n\
7713are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
7715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007716unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717{
Tim Petersced69f82003-09-16 20:30:58 +00007718 return PyUnicode_TranslateCharmap(self->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007719 self->length,
7720 table,
7721 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007724PyDoc_STRVAR(upper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007725 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007727Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
7729static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007730unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 return fixup(self, fixupper);
7733}
7734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007735PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007736 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737\n\
Georg Brandl98064072008-09-09 19:26:00 +00007738Pad a numeric string S with zeros on the left, to fill a field\n\
7739of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
7741static PyObject *
7742unicode_zfill(PyUnicodeObject *self, PyObject *args)
7743{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007744 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 PyUnicodeObject *u;
7746
Martin v. Löwis18e16552006-02-15 17:27:45 +00007747 Py_ssize_t width;
7748 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 return NULL;
7750
7751 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007752 if (PyUnicode_CheckExact(self)) {
7753 Py_INCREF(self);
7754 return (PyObject*) self;
7755 }
7756 else
7757 return PyUnicode_FromUnicode(
7758 PyUnicode_AS_UNICODE(self),
7759 PyUnicode_GET_SIZE(self)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007760 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 }
7762
7763 fill = width - self->length;
7764
7765 u = pad(self, fill, 0, '0');
7766
Walter Dörwald068325e2002-04-15 13:36:47 +00007767 if (u == NULL)
7768 return NULL;
7769
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 if (u->str[fill] == '+' || u->str[fill] == '-') {
7771 /* move sign to beginning of string */
7772 u->str[0] = u->str[fill];
7773 u->str[fill] = '0';
7774 }
7775
7776 return (PyObject*) u;
7777}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
#if 0
/* Compiled out: returns the current value of `numfree` as an int object. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007787PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007788 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007790Return True if S starts with the specified prefix, False otherwise.\n\
7791With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007792With optional end, stop comparing S at that position.\n\
7793prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794
7795static PyObject *
7796unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007797 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
Georg Brandl24250812006-06-09 18:45:48 +00007799 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007802 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007803 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
Georg Brandl24250812006-06-09 18:45:48 +00007805 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007806 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7807 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007808 if (PyTuple_Check(subobj)) {
7809 Py_ssize_t i;
7810 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7811 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007812 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007813 if (substring == NULL)
7814 return NULL;
7815 result = tailmatch(self, substring, start, end, -1);
7816 Py_DECREF(substring);
7817 if (result) {
7818 Py_RETURN_TRUE;
7819 }
7820 }
7821 /* nothing matched */
7822 Py_RETURN_FALSE;
7823 }
7824 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007826 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007827 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007829 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
7832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007833PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007834 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007836Return True if S ends with the specified suffix, False otherwise.\n\
7837With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007838With optional end, stop comparing S at that position.\n\
7839suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840
7841static PyObject *
7842unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007843 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844{
Georg Brandl24250812006-06-09 18:45:48 +00007845 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007847 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007848 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007849 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850
Georg Brandl24250812006-06-09 18:45:48 +00007851 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007852 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7853 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007854 if (PyTuple_Check(subobj)) {
7855 Py_ssize_t i;
7856 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7857 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007858 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007859 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007860 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007861 result = tailmatch(self, substring, start, end, +1);
7862 Py_DECREF(substring);
7863 if (result) {
7864 Py_RETURN_TRUE;
7865 }
7866 }
7867 Py_RETURN_FALSE;
7868 }
7869 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872
Georg Brandl24250812006-06-09 18:45:48 +00007873 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007875 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876}
7877
7878
Eric Smitha9f7d622008-02-17 19:46:49 +00007879/* Implements do_string_format, which is unicode because of stringlib */
7880#include "stringlib/string_format.h"
7881
7882PyDoc_STRVAR(format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007883 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007884\n\
7885");
7886
Eric Smithdc13b792008-05-30 18:10:04 +00007887static PyObject *
7888unicode__format__(PyObject *self, PyObject *args)
7889{
7890 PyObject *format_spec;
7891 PyObject *result = NULL;
7892 PyObject *tmp = NULL;
7893
7894 /* If 2.x, convert format_spec to the same type as value */
7895 /* This is to allow things like u''.format('') */
7896 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7897 goto done;
7898 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7899 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007900 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007901 goto done;
7902 }
7903 tmp = PyObject_Unicode(format_spec);
7904 if (tmp == NULL)
7905 goto done;
7906 format_spec = tmp;
7907
7908 result = _PyUnicode_FormatAdvanced(self,
7909 PyUnicode_AS_UNICODE(format_spec),
7910 PyUnicode_GET_SIZE(format_spec));
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007911 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007912 Py_XDECREF(tmp);
7913 return result;
7914}
7915
Eric Smitha9f7d622008-02-17 19:46:49 +00007916PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007917 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007918\n\
7919");
7920
Robert Schuppenies901c9972008-06-10 10:10:31 +00007921static PyObject *
7922unicode__sizeof__(PyUnicodeObject *v)
7923{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007924 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7925 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007926}
7927
7928PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007929 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007930\n\
7931");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007932
7933static PyObject *
7934unicode_getnewargs(PyUnicodeObject *v)
7935{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007936 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007937}
7938
7939
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940static PyMethodDef unicode_methods[] = {
7941
7942 /* Order is according to common usage: often used methods should
7943 appear first, since lookup is done sequentially. */
7944
Georg Brandlecdc0a92006-03-30 12:19:07 +00007945 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007946 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7947 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007948 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007949 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7950 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7951 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7952 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7953 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7954 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7955 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007956 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007957 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7958 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7959 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007960 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007961 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007962/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7963 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7964 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7965 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007966 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007967 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007968 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007969 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007970 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7971 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7972 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7973 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7974 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7975 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7976 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7977 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7978 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7979 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7980 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7981 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7982 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7983 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007985 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7986 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7987 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7988 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007989 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007990#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007991 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992#endif
7993
7994#if 0
7995 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007996 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997#endif
7998
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007999 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 {NULL, NULL}
8001};
8002
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008003static PyObject *
8004unicode_mod(PyObject *v, PyObject *w)
8005{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008006 if (!PyUnicode_Check(v)) {
8007 Py_INCREF(Py_NotImplemented);
8008 return Py_NotImplemented;
8009 }
8010 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008011}
8012
8013static PyNumberMethods unicode_as_number = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008014 0, /*nb_add*/
8015 0, /*nb_subtract*/
8016 0, /*nb_multiply*/
8017 0, /*nb_divide*/
8018 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008019};
8020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008022 (lenfunc) unicode_length, /* sq_length */
8023 PyUnicode_Concat, /* sq_concat */
8024 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8025 (ssizeargfunc) unicode_getitem, /* sq_item */
8026 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8027 0, /* sq_ass_item */
8028 0, /* sq_ass_slice */
8029 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030};
8031
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008032static PyObject*
8033unicode_subscript(PyUnicodeObject* self, PyObject* item)
8034{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008035 if (PyIndex_Check(item)) {
8036 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008037 if (i == -1 && PyErr_Occurred())
8038 return NULL;
8039 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008040 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008041 return unicode_getitem(self, i);
8042 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008043 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008044 Py_UNICODE* source_buf;
8045 Py_UNICODE* result_buf;
8046 PyObject* result;
8047
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008048 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008049 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008050 return NULL;
8051 }
8052
8053 if (slicelength <= 0) {
8054 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008055 } else if (start == 0 && step == 1 && slicelength == self->length &&
8056 PyUnicode_CheckExact(self)) {
8057 Py_INCREF(self);
8058 return (PyObject *)self;
8059 } else if (step == 1) {
8060 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008061 } else {
8062 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008063 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8064 sizeof(Py_UNICODE));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008065
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008066 if (result_buf == NULL)
8067 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008068
8069 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8070 result_buf[i] = source_buf[cur];
8071 }
Tim Petersced69f82003-09-16 20:30:58 +00008072
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008073 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008074 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008075 return result;
8076 }
8077 } else {
8078 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8079 return NULL;
8080 }
8081}
8082
8083static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008084 (lenfunc)unicode_length, /* mp_length */
8085 (binaryfunc)unicode_subscript, /* mp_subscript */
8086 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008087};
8088
Martin v. Löwis18e16552006-02-15 17:27:45 +00008089static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008091 Py_ssize_t index,
8092 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093{
8094 if (index != 0) {
8095 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008096 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 return -1;
8098 }
8099 *ptr = (void *) self->str;
8100 return PyUnicode_GET_DATA_SIZE(self);
8101}
8102
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103static Py_ssize_t
8104unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008105 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
8107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008108 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 return -1;
8110}
8111
8112static int
8113unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008114 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115{
8116 if (lenp)
8117 *lenp = PyUnicode_GET_DATA_SIZE(self);
8118 return 1;
8119}
8120
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008121static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008123 Py_ssize_t index,
8124 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125{
8126 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (index != 0) {
8129 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008130 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 return -1;
8132 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008133 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008135 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008136 *ptr = (void *) PyString_AS_STRING(str);
8137 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138}
8139
8140/* Helpers for PyUnicode_Format() */
8141
8142static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 if (argidx < arglen) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008147 (*p_argidx)++;
8148 if (arglen < 0)
8149 return args;
8150 else
8151 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 }
8153 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008154 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 return NULL;
8156}
8157
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163
Martin v. Löwis18e16552006-02-15 17:27:45 +00008164static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008165strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 register Py_ssize_t i;
8168 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 for (i = len - 1; i >= 0; i--)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008170 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 return len;
8173}
8174
Neal Norwitzfc76d632006-01-10 06:03:13 +00008175static int
8176doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8177{
Tim Peters15231542006-02-16 01:08:01 +00008178 Py_ssize_t result;
8179
Neal Norwitzfc76d632006-01-10 06:03:13 +00008180 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008181 result = strtounicode(buffer, (char *)buffer);
8182 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008183}
8184
8185static int
8186longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8187{
Tim Peters15231542006-02-16 01:08:01 +00008188 Py_ssize_t result;
8189
Neal Norwitzfc76d632006-01-10 06:03:13 +00008190 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008191 result = strtounicode(buffer, (char *)buffer);
8192 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008193}
8194
Guido van Rossum078151d2002-08-11 04:24:12 +00008195/* XXX To save some code duplication, formatfloat/long/int could have been
8196 shared with stringobject.c, converting from 8-bit to Unicode after the
8197 formatting is done. */
8198
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199static int
8200formatfloat(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008201 size_t buflen,
8202 int flags,
8203 int prec,
8204 int type,
8205 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008207 /* fmt = '%#.' + `prec` + `type`
8208 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 char fmt[20];
8210 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008211
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 x = PyFloat_AsDouble(v);
8213 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008214 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 if (prec < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008216 prec = 6;
Mark Dickinson75be68b2009-08-28 20:57:42 +00008217#if SIZEOF_INT > 4
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008218 /* make sure that the decimal representation of precision really does
8219 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
Mark Dickinson75be68b2009-08-28 20:57:42 +00008220 if (prec > 0x7fffffff) {
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008221 PyErr_SetString(PyExc_OverflowError,
8222 "outrageously large precision "
8223 "for formatted float");
8224 return -1;
8225 }
Mark Dickinson75be68b2009-08-28 20:57:42 +00008226#endif
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008227
Mark Dickinsona30f3492009-03-29 15:06:29 +00008228 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008229 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008230 /* Worst case length calc to ensure no buffer overrun:
8231
8232 'g' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008233 fmt = %#.<prec>g
8234 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8235 for any double rep.)
8236 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008237
8238 'f' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008239 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8240 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008241
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008242 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008243 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008244
8245 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008246 if (((type == 'g' || type == 'G') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008247 buflen <= (size_t)10 + (size_t)prec) ||
8248 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8249 PyErr_SetString(PyExc_OverflowError,
8250 "formatted float is too long (precision too large?)");
8251 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008252 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008253 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008254 (flags&F_ALT) ? "#" : "",
8255 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008256 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257}
8258
Tim Peters38fd5b62000-09-21 05:43:11 +00008259static PyObject*
8260formatlong(PyObject *val, int flags, int prec, int type)
8261{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008262 char *buf;
8263 int i, len;
8264 PyObject *str; /* temporary string object. */
8265 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008266
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008267 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8268 if (!str)
8269 return NULL;
8270 result = _PyUnicode_New(len);
8271 if (!result) {
8272 Py_DECREF(str);
8273 return NULL;
8274 }
8275 for (i = 0; i < len; i++)
8276 result->str[i] = buf[i];
8277 result->str[len] = 0;
8278 Py_DECREF(str);
8279 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008280}
8281
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282static int
8283formatint(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008284 size_t buflen,
8285 int flags,
8286 int prec,
8287 int type,
8288 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008290 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008291 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8292 * + 1 + 1
8293 * = 24
8294 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008295 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008296 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 long x;
8298
8299 x = PyInt_AsLong(v);
8300 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008301 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008302 if (x < 0 && type == 'u') {
8303 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008304 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008305 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8306 sign = "-";
8307 else
8308 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008310 prec = 1;
8311
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008312 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8313 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008314 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008315 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008316 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008317 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008318 return -1;
8319 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008320
8321 if ((flags & F_ALT) &&
8322 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008323 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008324 * of issues that cause pain:
8325 * - when 0 is being converted, the C standard leaves off
8326 * the '0x' or '0X', which is inconsistent with other
8327 * %#x/%#X conversions and inconsistent with Python's
8328 * hex() function
8329 * - there are platforms that violate the standard and
8330 * convert 0 with the '0x' or '0X'
8331 * (Metrowerks, Compaq Tru64)
8332 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008333 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008334 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008335 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008336 * We can achieve the desired consistency by inserting our
8337 * own '0x' or '0X' prefix, and substituting %x/%X in place
8338 * of %#x/%#X.
8339 *
8340 * Note that this is the same approach as used in
8341 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008342 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008343 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8344 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008345 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008346 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008347 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8348 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008349 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008350 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008351 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008352 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008353 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008354 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355}
8356
8357static int
8358formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008359 size_t buflen,
8360 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361{
Ezio Melotti85ddea72010-02-25 17:51:33 +00008362 PyObject *unistr;
8363 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008364 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008365 if (PyUnicode_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008366 if (PyUnicode_GET_SIZE(v) != 1)
8367 goto onError;
8368 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008371 else if (PyString_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008372 if (PyString_GET_SIZE(v) != 1)
8373 goto onError;
Ezio Melotti85ddea72010-02-25 17:51:33 +00008374 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8375 with a UnicodeDecodeError if 'char' is not decodable with the
8376 default encoding (usually ASCII, but it might be something else) */
8377 str = PyString_AS_STRING(v);
8378 if ((unsigned char)str[0] > 0x7F) {
8379 /* the char is not ASCII; try to decode the string using the
8380 default encoding and return -1 to let the UnicodeDecodeError
8381 be raised if the string can't be decoded */
8382 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8383 if (unistr == NULL)
8384 return -1;
8385 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8386 Py_DECREF(unistr);
8387 }
8388 else
8389 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391
8392 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008393 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 long x;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008395 x = PyInt_AsLong(v);
8396 if (x == -1 && PyErr_Occurred())
8397 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008398#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008399 if (x < 0 || x > 0x10ffff) {
8400 PyErr_SetString(PyExc_OverflowError,
8401 "%c arg not in range(0x110000) "
8402 "(wide Python build)");
8403 return -1;
8404 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008405#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008406 if (x < 0 || x > 0xffff) {
8407 PyErr_SetString(PyExc_OverflowError,
8408 "%c arg not in range(0x10000) "
8409 "(narrow Python build)");
8410 return -1;
8411 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008412#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008413 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 }
8415 buf[1] = '\0';
8416 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008417
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008418 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008419 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008420 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008421 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422}
8423
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind to a
   neighbouring operator (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8433
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008435 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436{
8437 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008438 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 int args_owned = 0;
8440 PyUnicodeObject *result = NULL;
8441 PyObject *dict = NULL;
8442 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008443
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 if (format == NULL || args == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008445 PyErr_BadInternalCall();
8446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 }
8448 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008449 if (uformat == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 fmt = PyUnicode_AS_UNICODE(uformat);
8452 fmtcnt = PyUnicode_GET_SIZE(uformat);
8453
8454 reslen = rescnt = fmtcnt + 100;
8455 result = _PyUnicode_New(reslen);
8456 if (result == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 res = PyUnicode_AS_UNICODE(result);
8459
8460 if (PyTuple_Check(args)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008461 arglen = PyTuple_Size(args);
8462 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 }
8464 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008465 arglen = -1;
8466 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 }
Christian Heimese93237d2007-12-19 02:37:44 +00008468 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008469 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008470 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471
8472 while (--fmtcnt >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008473 if (*fmt != '%') {
8474 if (--rescnt < 0) {
8475 rescnt = fmtcnt + 100;
8476 reslen += rescnt;
8477 if (_PyUnicode_Resize(&result, reslen) < 0)
8478 goto onError;
8479 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8480 --rescnt;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008481 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008482 *res++ = *fmt++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008483 }
8484 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008485 /* Got a format specifier */
8486 int flags = 0;
8487 Py_ssize_t width = -1;
8488 int prec = -1;
8489 Py_UNICODE c = '\0';
8490 Py_UNICODE fill;
8491 int isnumok;
8492 PyObject *v = NULL;
8493 PyObject *temp = NULL;
8494 Py_UNICODE *pbuf;
8495 Py_UNICODE sign;
8496 Py_ssize_t len;
8497 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8498
8499 fmt++;
8500 if (*fmt == '(') {
8501 Py_UNICODE *keystart;
8502 Py_ssize_t keylen;
8503 PyObject *key;
8504 int pcount = 1;
8505
8506 if (dict == NULL) {
8507 PyErr_SetString(PyExc_TypeError,
8508 "format requires a mapping");
8509 goto onError;
8510 }
8511 ++fmt;
8512 --fmtcnt;
8513 keystart = fmt;
8514 /* Skip over balanced parentheses */
8515 while (pcount > 0 && --fmtcnt >= 0) {
8516 if (*fmt == ')')
8517 --pcount;
8518 else if (*fmt == '(')
8519 ++pcount;
8520 fmt++;
8521 }
8522 keylen = fmt - keystart - 1;
8523 if (fmtcnt < 0 || pcount > 0) {
8524 PyErr_SetString(PyExc_ValueError,
8525 "incomplete format key");
8526 goto onError;
8527 }
8528#if 0
8529 /* keys are converted to strings using UTF-8 and
8530 then looked up since Python uses strings to hold
8531 variables names etc. in its namespaces and we
8532 wouldn't want to break common idioms. */
8533 key = PyUnicode_EncodeUTF8(keystart,
8534 keylen,
8535 NULL);
8536#else
8537 key = PyUnicode_FromUnicode(keystart, keylen);
8538#endif
8539 if (key == NULL)
8540 goto onError;
8541 if (args_owned) {
8542 Py_DECREF(args);
8543 args_owned = 0;
8544 }
8545 args = PyObject_GetItem(dict, key);
8546 Py_DECREF(key);
8547 if (args == NULL) {
8548 goto onError;
8549 }
8550 args_owned = 1;
8551 arglen = -1;
8552 argidx = -2;
8553 }
8554 while (--fmtcnt >= 0) {
8555 switch (c = *fmt++) {
8556 case '-': flags |= F_LJUST; continue;
8557 case '+': flags |= F_SIGN; continue;
8558 case ' ': flags |= F_BLANK; continue;
8559 case '#': flags |= F_ALT; continue;
8560 case '0': flags |= F_ZERO; continue;
8561 }
8562 break;
8563 }
8564 if (c == '*') {
8565 v = getnextarg(args, arglen, &argidx);
8566 if (v == NULL)
8567 goto onError;
8568 if (!PyInt_Check(v)) {
8569 PyErr_SetString(PyExc_TypeError,
8570 "* wants int");
8571 goto onError;
8572 }
8573 width = PyInt_AsLong(v);
8574 if (width < 0) {
8575 flags |= F_LJUST;
8576 width = -width;
8577 }
8578 if (--fmtcnt >= 0)
8579 c = *fmt++;
8580 }
8581 else if (c >= '0' && c <= '9') {
8582 width = c - '0';
8583 while (--fmtcnt >= 0) {
8584 c = *fmt++;
8585 if (c < '0' || c > '9')
8586 break;
8587 if ((width*10) / 10 != width) {
8588 PyErr_SetString(PyExc_ValueError,
8589 "width too big");
8590 goto onError;
8591 }
8592 width = width*10 + (c - '0');
8593 }
8594 }
8595 if (c == '.') {
8596 prec = 0;
8597 if (--fmtcnt >= 0)
8598 c = *fmt++;
8599 if (c == '*') {
8600 v = getnextarg(args, arglen, &argidx);
8601 if (v == NULL)
8602 goto onError;
8603 if (!PyInt_Check(v)) {
8604 PyErr_SetString(PyExc_TypeError,
8605 "* wants int");
8606 goto onError;
8607 }
8608 prec = PyInt_AsLong(v);
8609 if (prec < 0)
8610 prec = 0;
8611 if (--fmtcnt >= 0)
8612 c = *fmt++;
8613 }
8614 else if (c >= '0' && c <= '9') {
8615 prec = c - '0';
8616 while (--fmtcnt >= 0) {
8617 c = Py_CHARMASK(*fmt++);
8618 if (c < '0' || c > '9')
8619 break;
8620 if ((prec*10) / 10 != prec) {
8621 PyErr_SetString(PyExc_ValueError,
8622 "prec too big");
8623 goto onError;
8624 }
8625 prec = prec*10 + (c - '0');
8626 }
8627 }
8628 } /* prec */
8629 if (fmtcnt >= 0) {
8630 if (c == 'h' || c == 'l' || c == 'L') {
8631 if (--fmtcnt >= 0)
8632 c = *fmt++;
8633 }
8634 }
8635 if (fmtcnt < 0) {
8636 PyErr_SetString(PyExc_ValueError,
8637 "incomplete format");
8638 goto onError;
8639 }
8640 if (c != '%') {
8641 v = getnextarg(args, arglen, &argidx);
8642 if (v == NULL)
8643 goto onError;
8644 }
8645 sign = 0;
8646 fill = ' ';
8647 switch (c) {
8648
8649 case '%':
8650 pbuf = formatbuf;
8651 /* presume that buffer length is at least 1 */
8652 pbuf[0] = '%';
8653 len = 1;
8654 break;
8655
8656 case 's':
8657 case 'r':
Victor Stinner4fd2ff92010-03-22 12:56:39 +00008658 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008659 temp = v;
8660 Py_INCREF(temp);
8661 }
8662 else {
8663 PyObject *unicode;
8664 if (c == 's')
8665 temp = PyObject_Unicode(v);
8666 else
8667 temp = PyObject_Repr(v);
8668 if (temp == NULL)
8669 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008670 if (PyUnicode_Check(temp))
8671 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008672 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008673 /* convert to string to Unicode */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008674 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8675 PyString_GET_SIZE(temp),
8676 NULL,
8677 "strict");
8678 Py_DECREF(temp);
8679 temp = unicode;
8680 if (temp == NULL)
8681 goto onError;
8682 }
8683 else {
8684 Py_DECREF(temp);
8685 PyErr_SetString(PyExc_TypeError,
8686 "%s argument has non-string str()");
8687 goto onError;
8688 }
8689 }
8690 pbuf = PyUnicode_AS_UNICODE(temp);
8691 len = PyUnicode_GET_SIZE(temp);
8692 if (prec >= 0 && len > prec)
8693 len = prec;
8694 break;
8695
8696 case 'i':
8697 case 'd':
8698 case 'u':
8699 case 'o':
8700 case 'x':
8701 case 'X':
8702 if (c == 'i')
8703 c = 'd';
8704 isnumok = 0;
8705 if (PyNumber_Check(v)) {
8706 PyObject *iobj=NULL;
8707
8708 if (PyInt_Check(v) || (PyLong_Check(v))) {
8709 iobj = v;
8710 Py_INCREF(iobj);
8711 }
8712 else {
8713 iobj = PyNumber_Int(v);
8714 if (iobj==NULL) iobj = PyNumber_Long(v);
8715 }
8716 if (iobj!=NULL) {
8717 if (PyInt_Check(iobj)) {
8718 isnumok = 1;
8719 pbuf = formatbuf;
8720 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8721 flags, prec, c, iobj);
8722 Py_DECREF(iobj);
8723 if (len < 0)
8724 goto onError;
8725 sign = 1;
8726 }
8727 else if (PyLong_Check(iobj)) {
8728 isnumok = 1;
8729 temp = formatlong(iobj, flags, prec, c);
8730 Py_DECREF(iobj);
8731 if (!temp)
8732 goto onError;
8733 pbuf = PyUnicode_AS_UNICODE(temp);
8734 len = PyUnicode_GET_SIZE(temp);
8735 sign = 1;
8736 }
8737 else {
8738 Py_DECREF(iobj);
8739 }
8740 }
8741 }
8742 if (!isnumok) {
8743 PyErr_Format(PyExc_TypeError,
8744 "%%%c format: a number is required, "
8745 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8746 goto onError;
8747 }
8748 if (flags & F_ZERO)
8749 fill = '0';
8750 break;
8751
8752 case 'e':
8753 case 'E':
8754 case 'f':
8755 case 'F':
8756 case 'g':
8757 case 'G':
8758 if (c == 'F')
8759 c = 'f';
8760 pbuf = formatbuf;
8761 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8762 flags, prec, c, v);
8763 if (len < 0)
8764 goto onError;
8765 sign = 1;
8766 if (flags & F_ZERO)
8767 fill = '0';
8768 break;
8769
8770 case 'c':
8771 pbuf = formatbuf;
8772 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8773 if (len < 0)
8774 goto onError;
8775 break;
8776
8777 default:
8778 PyErr_Format(PyExc_ValueError,
8779 "unsupported format character '%c' (0x%x) "
8780 "at index %zd",
8781 (31<=c && c<=126) ? (char)c : '?',
8782 (int)c,
8783 (Py_ssize_t)(fmt - 1 -
8784 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008785 goto onError;
8786 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008787 if (sign) {
8788 if (*pbuf == '-' || *pbuf == '+') {
8789 sign = *pbuf++;
8790 len--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008791 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008792 else if (flags & F_SIGN)
8793 sign = '+';
8794 else if (flags & F_BLANK)
8795 sign = ' ';
8796 else
8797 sign = 0;
8798 }
8799 if (width < len)
8800 width = len;
8801 if (rescnt - (sign != 0) < width) {
8802 reslen -= rescnt;
8803 rescnt = width + fmtcnt + 100;
8804 reslen += rescnt;
8805 if (reslen < 0) {
8806 Py_XDECREF(temp);
8807 PyErr_NoMemory();
8808 goto onError;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008809 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008810 if (_PyUnicode_Resize(&result, reslen) < 0) {
8811 Py_XDECREF(temp);
8812 goto onError;
8813 }
8814 res = PyUnicode_AS_UNICODE(result)
8815 + reslen - rescnt;
8816 }
8817 if (sign) {
8818 if (fill != ' ')
8819 *res++ = sign;
8820 rescnt--;
8821 if (width > len)
8822 width--;
8823 }
8824 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8825 assert(pbuf[0] == '0');
8826 assert(pbuf[1] == c);
8827 if (fill != ' ') {
8828 *res++ = *pbuf++;
8829 *res++ = *pbuf++;
8830 }
8831 rescnt -= 2;
8832 width -= 2;
8833 if (width < 0)
8834 width = 0;
8835 len -= 2;
8836 }
8837 if (width > len && !(flags & F_LJUST)) {
8838 do {
8839 --rescnt;
8840 *res++ = fill;
8841 } while (--width > len);
8842 }
8843 if (fill == ' ') {
8844 if (sign)
8845 *res++ = sign;
8846 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8847 assert(pbuf[0] == '0');
8848 assert(pbuf[1] == c);
8849 *res++ = *pbuf++;
8850 *res++ = *pbuf++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008851 }
8852 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008853 Py_UNICODE_COPY(res, pbuf, len);
8854 res += len;
8855 rescnt -= len;
8856 while (--width >= len) {
8857 --rescnt;
8858 *res++ = ' ';
8859 }
8860 if (dict && (argidx < arglen) && c != '%') {
8861 PyErr_SetString(PyExc_TypeError,
8862 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008863 Py_XDECREF(temp);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008864 goto onError;
8865 }
8866 Py_XDECREF(temp);
8867 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 } /* until end */
8869 if (argidx < arglen && !dict) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008870 PyErr_SetString(PyExc_TypeError,
8871 "not all arguments converted during string formatting");
8872 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 }
8874
Thomas Woutersa96affe2006-03-12 00:29:36 +00008875 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008876 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008878 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 }
8880 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881 return (PyObject *)result;
8882
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008883 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 Py_XDECREF(result);
8885 Py_DECREF(uformat);
8886 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008887 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 }
8889 return NULL;
8890}
8891
8892static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008893 (readbufferproc) unicode_buffer_getreadbuf,
8894 (writebufferproc) unicode_buffer_getwritebuf,
8895 (segcountproc) unicode_buffer_getsegcount,
8896 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897};
8898
Jeremy Hylton938ace62002-07-17 16:30:39 +00008899static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008900unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8901
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902static PyObject *
8903unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8904{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008905 PyObject *x = NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008906 static char *kwlist[] = {"string", "encoding", "errors", 0};
8907 char *encoding = NULL;
8908 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008909
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008910 if (type != &PyUnicode_Type)
8911 return unicode_subtype_new(type, args, kwds);
8912 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008913 kwlist, &x, &encoding, &errors))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008914 return NULL;
8915 if (x == NULL)
8916 return (PyObject *)_PyUnicode_New(0);
8917 if (encoding == NULL && errors == NULL)
8918 return PyObject_Unicode(x);
8919 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008920 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008921}
8922
Guido van Rossume023fe02001-08-30 03:12:59 +00008923static PyObject *
8924unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8925{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008926 PyUnicodeObject *tmp, *pnew;
8927 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008928
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008929 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8930 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8931 if (tmp == NULL)
8932 return NULL;
8933 assert(PyUnicode_Check(tmp));
8934 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8935 if (pnew == NULL) {
8936 Py_DECREF(tmp);
8937 return NULL;
8938 }
8939 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8940 if (pnew->str == NULL) {
8941 _Py_ForgetReference((PyObject *)pnew);
8942 PyObject_Del(pnew);
8943 Py_DECREF(tmp);
8944 return PyErr_NoMemory();
8945 }
8946 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8947 pnew->length = n;
8948 pnew->hash = tmp->hash;
8949 Py_DECREF(tmp);
8950 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008951}
8952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008953PyDoc_STRVAR(unicode_doc,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008954 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008955\n\
8956Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008957encoding defaults to the current default string encoding.\n\
8958errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008959
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008961 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008962 "unicode", /* tp_name */
8963 sizeof(PyUnicodeObject), /* tp_size */
8964 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 /* Slots */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008966 (destructor)unicode_dealloc, /* tp_dealloc */
8967 0, /* tp_print */
8968 0, /* tp_getattr */
8969 0, /* tp_setattr */
8970 0, /* tp_compare */
8971 unicode_repr, /* tp_repr */
8972 &unicode_as_number, /* tp_as_number */
8973 &unicode_as_sequence, /* tp_as_sequence */
8974 &unicode_as_mapping, /* tp_as_mapping */
8975 (hashfunc) unicode_hash, /* tp_hash*/
8976 0, /* tp_call*/
8977 (reprfunc) unicode_str, /* tp_str */
8978 PyObject_GenericGetAttr, /* tp_getattro */
8979 0, /* tp_setattro */
8980 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008981 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008982 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008983 unicode_doc, /* tp_doc */
8984 0, /* tp_traverse */
8985 0, /* tp_clear */
8986 PyUnicode_RichCompare, /* tp_richcompare */
8987 0, /* tp_weaklistoffset */
8988 0, /* tp_iter */
8989 0, /* tp_iternext */
8990 unicode_methods, /* tp_methods */
8991 0, /* tp_members */
8992 0, /* tp_getset */
8993 &PyBaseString_Type, /* tp_base */
8994 0, /* tp_dict */
8995 0, /* tp_descr_get */
8996 0, /* tp_descr_set */
8997 0, /* tp_dictoffset */
8998 0, /* tp_init */
8999 0, /* tp_alloc */
9000 unicode_new, /* tp_new */
9001 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002};
9003
9004/* Initialize the Unicode implementation */
9005
Thomas Wouters78890102000-07-22 19:25:51 +00009006void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009008 int i;
9009
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009010 /* XXX - move this array to unicodectype.c ? */
9011 Py_UNICODE linebreak[] = {
9012 0x000A, /* LINE FEED */
9013 0x000D, /* CARRIAGE RETURN */
9014 0x001C, /* FILE SEPARATOR */
9015 0x001D, /* GROUP SEPARATOR */
9016 0x001E, /* RECORD SEPARATOR */
9017 0x0085, /* NEXT LINE */
9018 0x2028, /* LINE SEPARATOR */
9019 0x2029, /* PARAGRAPH SEPARATOR */
9020 };
9021
Fred Drakee4315f52000-05-09 19:53:39 +00009022 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009023 free_list = NULL;
9024 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009026 if (!unicode_empty)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009027 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009028
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009029 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009030 for (i = 0; i < 256; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009031 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009032 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009033 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009034
9035 /* initialize the linebreak bloom filter */
9036 bloom_linebreak = make_bloom_mask(
9037 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9038 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009039
9040 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
9042
9043/* Finalize the Unicode implementation */
9044
Christian Heimes3b718a72008-02-14 12:47:33 +00009045int
9046PyUnicode_ClearFreeList(void)
9047{
9048 int freelist_size = numfree;
9049 PyUnicodeObject *u;
9050
9051 for (u = free_list; u != NULL;) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009052 PyUnicodeObject *v = u;
9053 u = *(PyUnicodeObject **)u;
9054 if (v->str)
9055 PyObject_DEL(v->str);
9056 Py_XDECREF(v->defenc);
9057 PyObject_Del(v);
9058 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009059 }
9060 free_list = NULL;
9061 assert(numfree == 0);
9062 return freelist_size;
9063}
9064
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065void
Thomas Wouters78890102000-07-22 19:25:51 +00009066_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009068 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009070 Py_XDECREF(unicode_empty);
9071 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009072
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009073 for (i = 0; i < 256; i++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009074 if (unicode_latin1[i]) {
9075 Py_DECREF(unicode_latin1[i]);
9076 unicode_latin1[i] = NULL;
9077 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009078 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009079 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009081
Anthony Baxterac6bd462006-04-13 02:06:09 +00009082#ifdef __cplusplus
9083}
9084#endif
9085
9086
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009087/*
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009088 Local variables:
9089 c-basic-offset: 4
9090 indent-tabs-mode: nil
9091 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009092*/