blob: c30d56874c599e8419594daaab98aa88418fceed [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list keep-alive optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its length is below this limit.  This
   reduces malloc() overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature !  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: one flag
   per code point in the range 0x00 - 0x7F, 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009  HORIZONTAL TABULATION
     *   0x000A  LINE FEED
     *   0x000B  VERTICAL TABULATION
     *   0x000C  FORM FEED
     *   0x000D  CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C  FILE SEPARATOR
     *   0x001D  GROUP SEPARATOR
     *   0x001E  RECORD SEPARATOR
     *   0x001F  UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020  SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Fast detection of line break characters: one flag per code point in
   the range 0x00 - 0x7F, 1 meaning "line break".  The table is only
   ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A  LINE FEED
     *   0x000D  CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C  FILE SEPARATOR
     *   0x001D  GROUP SEPARATOR
     *   0x001E  RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
187/* stuff to implement simple "bloom filters" for Unicode characters.
188 to keep things simple, we use a single bitmask, using the least 5
189 bits from each unicode characters as the bit index. */
190
191/* the linebreak mask is set up by Unicode_Init below */
192
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the least 5 bits of ch is set in mask.
   The shift operates on an unsigned long so that bit 31 can be tested
   without overflowing a signed int, and mask is parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: a table lookup for ch < 128; otherwise the bloom mask
   is consulted first and Py_UNICODE_ISLINEBREAK() only runs when the
   mask bit is set.  Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Non-zero if chr is one of the setlen characters in set.  The bloom
   mask is checked first so that unicode_member() only runs when the
   mask bit for chr is set.  The whole expansion is parenthesized so the
   macro can be combined with other operators (e.g. negated) safely.
   Note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson857ce152009-01-31 16:29:18 +0000247 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
319 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 PyErr_NoMemory();
338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000443 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson857ce152009-01-31 16:29:18 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
Mark Dickinson6b265f12009-03-18 16:07:26 +0000532#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533# define CONVERT_WCHAR_TO_SURROGATES
534#endif
535
536#ifdef CONVERT_WCHAR_TO_SURROGATES
537
538/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
540
541PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
543{
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
548
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
552 }
553
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
560 }
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
565
566 /* Copy the wchar_t data into the new object */
567 {
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
576 }
577 else
578 *u++ = *w++;
579 }
580 }
581 return (PyObject *)unicode;
582}
583
584#else
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000587 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588{
589 PyUnicodeObject *unicode;
590
591 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000592 PyErr_BadInternalCall();
593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 }
595
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
599
600 /* Copy the wchar_t data into the new object */
601#ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000603#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 }
611#endif
612
613 return (PyObject *)unicode;
614}
615
Mark Dickinson6b265f12009-03-18 16:07:26 +0000616#endif /* CONVERT_WCHAR_TO_SURROGATES */
617
618#undef CONVERT_WCHAR_TO_SURROGATES
619
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000620static void
621makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
622{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
628 }
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
637 }
638 *fmt++ = c;
639 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640}
641
/* Append the NUL-terminated string `string` at the output position `s`,
   one character at a time; the terminating NUL is not copied.  The
   macro relies on `copy` and `s` being declared in the enclosing scope
   and advances both. */
#define appendstring(string)                    \
    {                                           \
        for (copy = string; *copy;)             \
            *s++ = *copy++;                     \
    }
643
644PyObject *
645PyUnicode_FromFormatV(const char *format, va_list vargs)
646{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000667
668#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000669 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000670#else
671#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000672 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000673#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000675#endif
676#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000681 if (*f == '%') {
682 if (*(f+1)=='%')
683 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000684 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000685 ++callcount;
686 while (isdigit((unsigned)*f))
687 width = (width*10) + *f++ - '0';
688 while (*++f && *f != '%' && !isalpha((unsigned)*f))
689 ;
690 if (*f == 's')
691 ++callcount;
692 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 }
694 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000696 if (callcount) {
697 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
698 if (!callresults) {
699 PyErr_NoMemory();
700 return NULL;
701 }
702 callresult = callresults;
703 }
704 /* step 3: figure out how large a buffer we need */
705 for (f = format; *f; f++) {
706 if (*f == '%') {
707 const char* p = f;
708 width = 0;
709 while (isdigit((unsigned)*f))
710 width = (width*10) + *f++ - '0';
711 while (*++f && *f != '%' && !isalpha((unsigned)*f))
712 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000713
Benjamin Peterson857ce152009-01-31 16:29:18 +0000714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
716 */
717 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000718 (f[1] == 'd' || f[1] == 'u'))
719 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000720
Benjamin Peterson857ce152009-01-31 16:29:18 +0000721 switch (*f) {
722 case 'c':
723 (void)va_arg(count, int);
724 /* fall through... */
725 case '%':
726 n++;
727 break;
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
735 if (width < 20)
736 width = 20;
737 n += width;
738 if (abuffersize < width)
739 abuffersize = width;
740 break;
741 case 's':
742 {
743 /* UTF-8 */
Walter Dörwalded960ac2009-05-03 22:36:33 +0000744 unsigned char *s = va_arg(count, unsigned char*);
745 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000751 break;
752 }
753 case 'U':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 assert(obj && PyUnicode_Check(obj));
757 n += PyUnicode_GET_SIZE(obj);
758 break;
759 }
760 case 'V':
761 {
762 PyObject *obj = va_arg(count, PyObject *);
763 const char *str = va_arg(count, const char *);
764 assert(obj || str);
765 assert(!obj || PyUnicode_Check(obj));
766 if (obj)
767 n += PyUnicode_GET_SIZE(obj);
768 else
769 n += strlen(str);
770 break;
771 }
772 case 'S':
773 {
774 PyObject *obj = va_arg(count, PyObject *);
775 PyObject *str;
776 assert(obj);
777 str = PyObject_Str(obj);
778 if (!str)
779 goto fail;
780 n += PyUnicode_GET_SIZE(str);
781 /* Remember the str and switch to the next slot */
782 *callresult++ = str;
783 break;
784 }
785 case 'R':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *repr;
789 assert(obj);
790 repr = PyObject_Repr(obj);
791 if (!repr)
792 goto fail;
793 n += PyUnicode_GET_SIZE(repr);
794 /* Remember the repr and switch to the next slot */
795 *callresult++ = repr;
796 break;
797 }
798 case 'p':
799 (void) va_arg(count, int);
800 /* maximum 64-bit pointer representation:
801 * 0xffffffffffffffff
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
804 */
805 n += 19;
806 break;
807 default:
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
814 n += strlen(p);
815 goto expand;
816 }
817 } else
818 n++;
819 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000820 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000821 if (abuffersize > 20) {
822 abuffer = PyObject_Malloc(abuffersize);
823 if (!abuffer) {
824 PyErr_NoMemory();
825 goto fail;
826 }
827 realbuffer = abuffer;
828 }
829 else
830 realbuffer = buffer;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string = PyUnicode_FromUnicode(NULL, n);
836 if (!string)
837 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000838
Benjamin Peterson857ce152009-01-31 16:29:18 +0000839 s = PyUnicode_AS_UNICODE(string);
840 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000841
Benjamin Peterson857ce152009-01-31 16:29:18 +0000842 for (f = format; *f; f++) {
843 if (*f == '%') {
844 const char* p = f++;
845 int longflag = 0;
846 int size_tflag = 0;
847 zeropad = (*f == '0');
848 /* parse the width.precision part */
849 width = 0;
850 while (isdigit((unsigned)*f))
851 width = (width*10) + *f++ - '0';
852 precision = 0;
853 if (*f == '.') {
854 f++;
855 while (isdigit((unsigned)*f))
856 precision = (precision*10) + *f++ - '0';
857 }
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
861 longflag = 1;
862 ++f;
863 }
864 /* handle the size_t flag. */
865 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
866 size_tflag = 1;
867 ++f;
868 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000869
Benjamin Peterson857ce152009-01-31 16:29:18 +0000870 switch (*f) {
871 case 'c':
872 *s++ = va_arg(vargs, int);
873 break;
874 case 'd':
875 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
876 if (longflag)
877 sprintf(realbuffer, fmt, va_arg(vargs, long));
878 else if (size_tflag)
879 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
880 else
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
883 break;
884 case 'u':
885 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
886 if (longflag)
887 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
888 else if (size_tflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
890 else
891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
892 appendstring(realbuffer);
893 break;
894 case 'i':
895 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
896 sprintf(realbuffer, fmt, va_arg(vargs, int));
897 appendstring(realbuffer);
898 break;
899 case 'x':
900 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 's':
905 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000906 /* unused, since we already have the result */
907 (void) va_arg(vargs, char *);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
909 PyUnicode_GET_SIZE(*callresult));
910 s += PyUnicode_GET_SIZE(*callresult);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult);
913 /* switch to next unicode()/repr() result */
914 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000915 break;
916 }
917 case 'U':
918 {
919 PyObject *obj = va_arg(vargs, PyObject *);
920 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
922 s += size;
923 break;
924 }
925 case 'V':
926 {
927 PyObject *obj = va_arg(vargs, PyObject *);
928 const char *str = va_arg(vargs, const char *);
929 if (obj) {
930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
933 } else {
934 appendstring(str);
935 }
936 break;
937 }
938 case 'S':
939 case 'R':
940 {
941 Py_UNICODE *ucopy;
942 Py_ssize_t usize;
943 Py_ssize_t upos;
944 /* unused, since we already have the result */
945 (void) va_arg(vargs, PyObject *);
946 ucopy = PyUnicode_AS_UNICODE(*callresult);
947 usize = PyUnicode_GET_SIZE(*callresult);
948 for (upos = 0; upos<usize;)
949 *s++ = ucopy[upos++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult);
952 /* switch to next unicode()/repr() result */
953 ++callresult;
954 break;
955 }
956 case 'p':
957 sprintf(buffer, "%p", va_arg(vargs, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer[1] == 'X')
960 buffer[1] = 'x';
961 else if (buffer[1] != 'x') {
962 memmove(buffer+2, buffer, strlen(buffer)+1);
963 buffer[0] = '0';
964 buffer[1] = 'x';
965 }
966 appendstring(buffer);
967 break;
968 case '%':
969 *s++ = '%';
970 break;
971 default:
972 appendstring(p);
973 goto end;
974 }
975 } else
976 *s++ = *f;
977 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000978
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000979 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000980 if (callresults)
981 PyObject_Free(callresults);
982 if (abuffer)
983 PyObject_Free(abuffer);
984 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
985 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000986 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000987 if (callresults) {
988 PyObject **callresult2 = callresults;
989 while (callresult2 < callresult) {
990 Py_DECREF(*callresult2);
991 ++callresult2;
992 }
993 PyObject_Free(callresults);
994 }
995 if (abuffer)
996 PyObject_Free(abuffer);
997 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000998}
999
1000#undef appendstring
1001
1002PyObject *
1003PyUnicode_FromFormat(const char *format, ...)
1004{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001005 PyObject* ret;
1006 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001007
1008#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001009 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001010#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001011 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001012#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001013 ret = PyUnicode_FromFormatV(format, vargs);
1014 va_end(vargs);
1015 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001016}
1017
Martin v. Löwis18e16552006-02-15 17:27:45 +00001018Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001019 wchar_t *w,
1020 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021{
1022 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001023 PyErr_BadInternalCall();
1024 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001026
1027 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001029 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031#ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w, unicode->str, size * sizeof(wchar_t));
1033#else
1034 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001035 register Py_UNICODE *u;
1036 register Py_ssize_t i;
1037 u = PyUnicode_AS_UNICODE(unicode);
1038 for (i = size; i > 0; i--)
1039 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 }
1041#endif
1042
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043 if (size > PyUnicode_GET_SIZE(unicode))
1044 return PyUnicode_GET_SIZE(unicode);
1045 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047}
1048
1049#endif
1050
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001051PyObject *PyUnicode_FromOrdinal(int ordinal)
1052{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001053 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054
1055#ifdef Py_UNICODE_WIDE
1056 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 PyErr_SetString(PyExc_ValueError,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1060 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061 }
1062#else
1063 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001064 PyErr_SetString(PyExc_ValueError,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1067 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 }
1069#endif
1070
Hye-Shik Chang40574832004-04-06 07:24:51 +00001071 s[0] = (Py_UNICODE)ordinal;
1072 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001073}
1074
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075PyObject *PyUnicode_FromObject(register PyObject *obj)
1076{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001077 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001078 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001080 Py_INCREF(obj);
1081 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001082 }
1083 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001089 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1090}
1091
1092PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 const char *encoding,
1094 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001096 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001097 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001098 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 PyErr_BadInternalCall();
1102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001105#if 0
1106 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001107 that no encodings is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1109 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001110
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001111 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001113
1114 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001115 if (PyUnicode_Check(obj)) {
1116 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001117 PyErr_SetString(PyExc_TypeError,
1118 "decoding Unicode is not supported");
1119 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001120 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 return PyObject_Unicode(obj);
1122 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001123#else
1124 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 PyErr_SetString(PyExc_TypeError,
1126 "decoding Unicode is not supported");
1127 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001129#endif
1130
1131 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001132 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 s = PyString_AS_STRING(obj);
1134 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001135 }
Christian Heimes3497f942008-05-26 12:29:14 +00001136 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError,
1139 "decoding bytearray is not supported");
1140 return NULL;
1141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError))
1146 PyErr_Format(PyExc_TypeError,
1147 "coercing to Unicode: need string or buffer, "
1148 "%.80s found",
1149 Py_TYPE(obj)->tp_name);
1150 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001151 }
Tim Petersced69f82003-09-16 20:30:58 +00001152
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001153 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001155 Py_INCREF(unicode_empty);
1156 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 }
Tim Petersced69f82003-09-16 20:30:58 +00001158 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001159 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001160
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161 return v;
1162
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001163 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165}
1166
1167PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_ssize_t size,
1169 const char *encoding,
1170 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171{
1172 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001173
1174 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001175 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001176
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001180 else if (strcmp(encoding, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001182#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s, size, errors);
1185#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001186 else if (strcmp(encoding, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 /* Decode via the codec registry */
1190 buffer = PyBuffer_FromMemory((void *)s, size);
1191 if (buffer == NULL)
1192 goto onError;
1193 unicode = PyCodec_Decode(buffer, encoding, errors);
1194 if (unicode == NULL)
1195 goto onError;
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001198 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001199 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 Py_DECREF(unicode);
1201 goto onError;
1202 }
1203 Py_DECREF(buffer);
1204 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001205
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207 Py_XDECREF(buffer);
1208 return NULL;
1209}
1210
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001211PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212 const char *encoding,
1213 const char *errors)
1214{
1215 PyObject *v;
1216
1217 if (!PyUnicode_Check(unicode)) {
1218 PyErr_BadArgument();
1219 goto onError;
1220 }
1221
1222 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001223 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224
1225 /* Decode via the codec registry */
1226 v = PyCodec_Decode(unicode, encoding, errors);
1227 if (v == NULL)
1228 goto onError;
1229 return v;
1230
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001231 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001232 return NULL;
1233}
1234
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239{
1240 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 unicode = PyUnicode_FromUnicode(s, size);
1243 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246 Py_DECREF(unicode);
1247 return v;
1248}
1249
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001250PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251 const char *encoding,
1252 const char *errors)
1253{
1254 PyObject *v;
1255
1256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 goto onError;
1259 }
1260
1261 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001262 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263
1264 /* Encode via the codec registry */
1265 v = PyCodec_Encode(unicode, encoding, errors);
1266 if (v == NULL)
1267 goto onError;
1268 return v;
1269
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001270 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001271 return NULL;
1272}
1273
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275 const char *encoding,
1276 const char *errors)
1277{
1278 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001279
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1283 }
Fred Drakee4315f52000-05-09 19:53:39 +00001284
Tim Petersced69f82003-09-16 20:30:58 +00001285 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001286 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001287
1288 /* Shortcuts for common default encodings */
1289 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001290 if (strcmp(encoding, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode);
1292 else if (strcmp(encoding, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001294#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001295 else if (strcmp(encoding, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001297#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001298 else if (strcmp(encoding, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
1302 /* Encode via the codec registry */
1303 v = PyCodec_Encode(unicode, encoding, errors);
1304 if (v == NULL)
1305 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001306 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001308 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001309 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 Py_DECREF(v);
1311 goto onError;
1312 }
1313 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001315 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 return NULL;
1317}
1318
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001319PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001320 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001321{
1322 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1323
1324 if (v)
1325 return v;
1326 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327 if (v && errors == NULL)
1328 ((PyUnicodeObject *)unicode)->defenc = v;
1329 return v;
1330}
1331
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1333{
1334 if (!PyUnicode_Check(unicode)) {
1335 PyErr_BadArgument();
1336 goto onError;
1337 }
1338 return PyUnicode_AS_UNICODE(unicode);
1339
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001340 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 return NULL;
1342}
1343
Martin v. Löwis18e16552006-02-15 17:27:45 +00001344Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345{
1346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 goto onError;
1349 }
1350 return PyUnicode_GET_SIZE(unicode);
1351
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001352 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 return -1;
1354}
1355
Thomas Wouters78890102000-07-22 19:25:51 +00001356const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001357{
1358 return unicode_default_encoding;
1359}
1360
1361int PyUnicode_SetDefaultEncoding(const char *encoding)
1362{
1363 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Fred Drakee4315f52000-05-09 19:53:39 +00001365 /* Make sure the encoding is valid. As side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v = _PyCodec_Lookup(encoding);
1368 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001369 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001370 Py_DECREF(v);
1371 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001372 encoding,
1373 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001374 return 0;
1375
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001376 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001377 return -1;
1378}
1379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001380/* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001382 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 and adjust various state variables.
1384 return 0 on success, -1 on error
1385*/
1386
1387static
1388int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 const char *encoding, const char *reason,
1390 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001394 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395
1396 PyObject *restuple = NULL;
1397 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001398 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399 Py_ssize_t requiredsize;
1400 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001401 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001402 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 int res = -1;
1404
1405 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 *errorHandler = PyCodec_LookupError(errors);
1407 if (*errorHandler == NULL)
1408 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 }
1410
1411 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001412 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001413 encoding, input, insize, *startinpos, *endinpos, reason);
1414 if (*exceptionObject == NULL)
1415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 }
1417 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001418 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419 goto onError;
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421 goto onError;
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001424 }
1425
1426 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001430 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 }
1433 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001436 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001437 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001441
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr = PyUnicode_AS_UNICODE(repunicode);
1447 repsize = PyUnicode_GET_SIZE(repunicode);
1448 requiredsize = *outpos + repsize + insize-newpos;
1449 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001450 if (requiredsize<2*outsize)
1451 requiredsize = 2*outsize;
1452 if (_PyUnicode_Resize(output, requiredsize) < 0)
1453 goto onError;
1454 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001455 }
1456 *endinpos = newpos;
1457 *inptr = input + newpos;
1458 Py_UNICODE_COPY(*outptr, repptr, repsize);
1459 *outptr += repsize;
1460 *outpos += repsize;
1461 /* we made it! */
1462 res = 0;
1463
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001464 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001465 Py_XDECREF(restuple);
1466 return res;
1467}
1468
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469/* --- UTF-7 Codec -------------------------------------------------------- */
1470
1471/* see RFC2152 for details */
1472
/* Classification of the 128 ASCII code points for the UTF-7 encoder.
   Each entry says whether the character can be emitted directly:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is indexed by character code, 16 entries per row. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS): true if `c` must not be written
   directly.  Characters outside 1..127 are always special; table classes
   2 and 3 are special only when encodeWS / encodeO are set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */
#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     (encodeWS && (utf7_special[(c)] == 2)) ||              \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of `n`. */
#define B64(n)                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if `c` is alphanumeric (per isalnum()), '+' or '/'. */
#define B64CHAR(c)                                          \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): inverse of B64() -- value (0..63) of a base64 character.
   The result is only meaningful for characters accepted by B64CHAR(). */
#define UB64(c)                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509
/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   write the base64 character for the topmost six pending bits through
   `out` (advancing it) and reduce `bits` by six.  Expands to a bare
   while-statement that modifies `out` and `bits` in place. */
#define ENCODE(out, ch, bits)                                           \
    while (bits >= 6) {                                                 \
        *out++ = B64(ch >> (bits-6));                                   \
        bits -= 6;                                                      \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in `ch`, extract the topmost sixteen as one Py_UNICODE and
   store it through `out`.  A code unit in 0xDC00..0xDFFF is rejected:
   `surrogate` is set, `errmsg` is assigned and control jumps to the
   `utf7Error` label, both of which must exist at the expansion site.
   The code unit following such an error is dropped silently. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001536 Py_ssize_t size,
1537 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001539 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1540}
1541
1542PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001543 Py_ssize_t size,
1544 const char *errors,
1545 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001546{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001548 Py_ssize_t startinpos;
1549 Py_ssize_t endinpos;
1550 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 const char *e;
1552 PyUnicodeObject *unicode;
1553 Py_UNICODE *p;
1554 const char *errmsg = "";
1555 int inShift = 0;
1556 unsigned int bitsleft = 0;
1557 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001558 int surrogate = 0;
1559 PyObject *errorHandler = NULL;
1560 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561
1562 unicode = _PyUnicode_New(size);
1563 if (!unicode)
1564 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001565 if (size == 0) {
1566 if (consumed)
1567 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001569 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570
1571 p = unicode->str;
1572 e = s + size;
1573
1574 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001575 Py_UNICODE ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001576 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001577 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578
1579 if (inShift) {
1580 if ((ch == '-') || !B64CHAR(ch)) {
1581 inShift = 0;
1582 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001583
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1585 if (bitsleft >= 6) {
1586 /* The shift sequence has a partial character in it. If
1587 bitsleft < 6 then we could just classify it as padding
1588 but that is not the case here */
1589
1590 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001591 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 }
1593 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001594 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 here so indicate the potential of a misencoded character. */
1596
1597 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1598 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1599 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001600 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 }
1602
1603 if (ch == '-') {
1604 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001605 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 inShift = 1;
1607 }
1608 } else if (SPECIAL(ch,0,0)) {
1609 errmsg = "unexpected special character";
Benjamin Peterson857ce152009-01-31 16:29:18 +00001610 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 } else {
1612 *p++ = ch;
1613 }
1614 } else {
1615 charsleft = (charsleft << 6) | UB64(ch);
1616 bitsleft += 6;
1617 s++;
1618 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1619 }
1620 }
1621 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 s++;
1624 if (s < e && *s == '-') {
1625 s++;
1626 *p++ = '+';
1627 } else
1628 {
1629 inShift = 1;
1630 bitsleft = 0;
1631 }
1632 }
1633 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001634 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 errmsg = "unexpected special character";
1636 s++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001637 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 }
1639 else {
1640 *p++ = ch;
1641 s++;
1642 }
1643 continue;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001644 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001645 outpos = p-PyUnicode_AS_UNICODE(unicode);
1646 endinpos = s-starts;
1647 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001648 errors, &errorHandler,
1649 "utf7", errmsg,
1650 starts, size, &startinpos, &endinpos, &exc, &s,
1651 &unicode, &outpos, &p))
1652 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
1654
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001655 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 outpos = p-PyUnicode_AS_UNICODE(unicode);
1657 endinpos = size;
1658 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001659 errors, &errorHandler,
1660 "utf7", "unterminated shift sequence",
1661 starts, size, &startinpos, &endinpos, &exc, &s,
1662 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 if (s < e)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001665 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001667 if (consumed) {
1668 if(inShift)
1669 *consumed = startinpos;
1670 else
1671 *consumed = s-starts;
1672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001674 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 goto onError;
1676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 Py_XDECREF(errorHandler);
1678 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 return (PyObject *)unicode;
1680
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001681 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 Py_XDECREF(errorHandler);
1683 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001684 Py_DECREF(unicode);
1685 return NULL;
1686}
1687
1688
1689PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001690 Py_ssize_t size,
1691 int encodeSetO,
1692 int encodeWhiteSpace,
1693 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001694{
1695 PyObject *v;
1696 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001697 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001698 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700 unsigned int bitsleft = 0;
1701 unsigned long charsleft = 0;
1702 char * out;
1703 char * start;
1704
Neal Norwitze7d8be82008-07-31 17:17:14 +00001705 if (cbAllocated / 5 != size)
1706 return PyErr_NoMemory();
1707
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001709 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001710
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001711 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 if (v == NULL)
1713 return NULL;
1714
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001715 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 for (;i < size; ++i) {
1717 Py_UNICODE ch = s[i];
1718
1719 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001720 if (ch == '+') {
1721 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 *out++ = '-';
1723 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1724 charsleft = ch;
1725 bitsleft = 16;
1726 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001727 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001729 } else {
1730 *out++ = (char) ch;
1731 }
1732 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1734 *out++ = B64(charsleft << (6-bitsleft));
1735 charsleft = 0;
1736 bitsleft = 0;
1737 /* Characters not in the BASE64 set implicitly unshift the sequence
1738 so no '-' is required, except if the character is itself a '-' */
1739 if (B64CHAR(ch) || ch == '-') {
1740 *out++ = '-';
1741 }
1742 inShift = 0;
1743 *out++ = (char) ch;
1744 } else {
1745 bitsleft += 16;
1746 charsleft = (charsleft << 16) | ch;
1747 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1748
Mark Dickinson3e4caeb2009-02-21 20:27:01 +00001749 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001750 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751 or '-' then the shift sequence will be terminated implicitly and we
1752 don't have to insert a '-'. */
1753
1754 if (bitsleft == 0) {
1755 if (i + 1 < size) {
1756 Py_UNICODE ch2 = s[i+1];
1757
1758 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001759
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001760 } else if (B64CHAR(ch2) || ch2 == '-') {
1761 *out++ = '-';
1762 inShift = 0;
1763 } else {
1764 inShift = 0;
1765 }
1766
1767 }
1768 else {
1769 *out++ = '-';
1770 inShift = 0;
1771 }
1772 }
Tim Petersced69f82003-09-16 20:30:58 +00001773 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001774 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001775 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 if (bitsleft) {
1777 *out++= B64(charsleft << (6-bitsleft) );
1778 *out++ = '-';
1779 }
1780
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001781 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 return v;
1783}
1784
/* The helper macros defined for the UTF-7 codec are not needed past this
   point; remove them so the short names do not leak into later code. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1791
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792/* --- UTF-8 Codec -------------------------------------------------------- */
1793
/* Read-only lookup table: maps a UTF-8 lead (prefix) byte to the total
   length in bytes of the sequence it introduces.  Zero marks a byte that
   is not a legal prefix.  See RFC 2279 for details. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four, five, six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1815
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001817 Py_ssize_t size,
1818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819{
Walter Dörwald69652032004-09-07 20:24:22 +00001820 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1821}
1822
1823PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001824 Py_ssize_t size,
1825 const char *errors,
1826 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001830 Py_ssize_t startinpos;
1831 Py_ssize_t endinpos;
1832 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 const char *e;
1834 PyUnicodeObject *unicode;
1835 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001836 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 PyObject *errorHandler = NULL;
1838 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839
1840 /* Note: size will always be longer than the resulting Unicode
1841 character count */
1842 unicode = _PyUnicode_New(size);
1843 if (!unicode)
1844 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001845 if (size == 0) {
1846 if (consumed)
1847 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850
1851 /* Unpack UTF-8 encoded data */
1852 p = unicode->str;
1853 e = s + size;
1854
1855 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001856 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857
1858 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001859 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 s++;
1861 continue;
1862 }
1863
1864 n = utf8_code_length[ch];
1865
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001866 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001867 if (consumed)
1868 break;
1869 else {
1870 errmsg = "unexpected end of data";
1871 startinpos = s-starts;
1872 endinpos = size;
1873 goto utf8Error;
1874 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876
1877 switch (n) {
1878
1879 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001881 startinpos = s-starts;
1882 endinpos = startinpos+1;
1883 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884
1885 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001886 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001887 startinpos = s-starts;
1888 endinpos = startinpos+1;
1889 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890
1891 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001892 if ((s[1] & 0xc0) != 0x80) {
1893 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001894 startinpos = s-starts;
1895 endinpos = startinpos+2;
1896 goto utf8Error;
1897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001899 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001900 startinpos = s-starts;
1901 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001902 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001903 goto utf8Error;
1904 }
1905 else
1906 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 break;
1908
1909 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001910 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001911 (s[2] & 0xc0) != 0x80) {
1912 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001913 startinpos = s-starts;
1914 endinpos = startinpos+3;
1915 goto utf8Error;
1916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001918 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001919 /* Note: UTF-8 encodings of surrogates are considered
1920 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001922 XXX For wide builds (UCS-4) we should probably try
1923 to recombine the surrogates into a single code
1924 unit.
1925 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001926 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001927 startinpos = s-starts;
1928 endinpos = startinpos+3;
1929 goto utf8Error;
1930 }
1931 else
1932 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001933 break;
1934
1935 case 4:
1936 if ((s[1] & 0xc0) != 0x80 ||
1937 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001938 (s[3] & 0xc0) != 0x80) {
1939 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001940 startinpos = s-starts;
1941 endinpos = startinpos+4;
1942 goto utf8Error;
1943 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001945 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001947 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001948 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001949 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001950 UTF-16 */
1951 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001952 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001953 startinpos = s-starts;
1954 endinpos = startinpos+4;
1955 goto utf8Error;
1956 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001957#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001958 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001959#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001960 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001961
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001962 /* translate from 10000..10FFFF to 0..FFFF */
1963 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001964
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001965 /* high surrogate = top 10 bits added to D800 */
1966 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001967
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001968 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001969 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001970#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 break;
1972
1973 default:
1974 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001975 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001976 startinpos = s-starts;
1977 endinpos = startinpos+n;
1978 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 }
1980 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001981 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001982
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 utf8Error:
1984 outpos = p-PyUnicode_AS_UNICODE(unicode);
1985 if (unicode_decode_call_errorhandler(
1986 errors, &errorHandler,
1987 "utf8", errmsg,
1988 starts, size, &startinpos, &endinpos, &exc, &s,
1989 &unicode, &outpos, &p))
1990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 }
Walter Dörwald69652032004-09-07 20:24:22 +00001992 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001993 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001996 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 goto onError;
1998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001999 Py_XDECREF(errorHandler);
2000 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 return (PyObject *)unicode;
2002
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002003 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 Py_XDECREF(errorHandler);
2005 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 Py_DECREF(unicode);
2007 return NULL;
2008}
2009
Tim Peters602f7402002-04-27 18:03:26 +00002010/* Allocation strategy: if the string is short, convert into a stack buffer
2011 and allocate exactly as much space needed at the end. Else allocate the
2012 maximum possible needed (4 result bytes per Unicode character), and return
2013 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002014*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002015PyObject *
2016PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002017 Py_ssize_t size,
2018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019{
Tim Peters602f7402002-04-27 18:03:26 +00002020#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002021
Martin v. Löwis18e16552006-02-15 17:27:45 +00002022 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002023 PyObject *v; /* result string object */
2024 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002025 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002026 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002027 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002028
Tim Peters602f7402002-04-27 18:03:26 +00002029 assert(s != NULL);
2030 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031
Tim Peters602f7402002-04-27 18:03:26 +00002032 if (size <= MAX_SHORT_UNICHARS) {
2033 /* Write into the stack buffer; nallocated can't overflow.
2034 * At the end, we'll allocate exactly as much heap space as it
2035 * turns out we need.
2036 */
2037 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2038 v = NULL; /* will allocate after we're done */
2039 p = stackbuf;
2040 }
2041 else {
2042 /* Overallocate on the heap, and give the excess back at the end. */
2043 nallocated = size * 4;
2044 if (nallocated / 4 != size) /* overflow! */
2045 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002046 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002047 if (v == NULL)
2048 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002049 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002050 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002051
Tim Peters602f7402002-04-27 18:03:26 +00002052 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002053 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002054
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002055 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002056 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002058
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002060 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002061 *p++ = (char)(0xc0 | (ch >> 6));
2062 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002063 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002064 else {
Tim Peters602f7402002-04-27 18:03:26 +00002065 /* Encode UCS2 Unicode ordinals */
2066 if (ch < 0x10000) {
2067 /* Special case: check for high surrogate */
2068 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2069 Py_UCS4 ch2 = s[i];
2070 /* Check for low surrogate and combine the two to
2071 form a UCS4 value */
2072 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002073 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002074 i++;
2075 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002076 }
Tim Peters602f7402002-04-27 18:03:26 +00002077 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002078 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002079 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002080 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2081 *p++ = (char)(0x80 | (ch & 0x3f));
2082 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002083 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002084 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002085 /* Encode UCS4 Unicode ordinals */
2086 *p++ = (char)(0xf0 | (ch >> 18));
2087 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2088 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2089 *p++ = (char)(0x80 | (ch & 0x3f));
2090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002092
Tim Peters602f7402002-04-27 18:03:26 +00002093 if (v == NULL) {
2094 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002095 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002096 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002097 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002098 }
2099 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002100 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002101 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002102 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002103 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002106
Tim Peters602f7402002-04-27 18:03:26 +00002107#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108}
2109
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2111{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 if (!PyUnicode_Check(unicode)) {
2113 PyErr_BadArgument();
2114 return NULL;
2115 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002116 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002117 PyUnicode_GET_SIZE(unicode),
2118 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119}
2120
Walter Dörwald6e390802007-08-17 16:41:28 +00002121/* --- UTF-32 Codec ------------------------------------------------------- */
2122
2123PyObject *
2124PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002125 Py_ssize_t size,
2126 const char *errors,
2127 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002128{
2129 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2130}
2131
2132PyObject *
2133PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002134 Py_ssize_t size,
2135 const char *errors,
2136 int *byteorder,
2137 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002138{
2139 const char *starts = s;
2140 Py_ssize_t startinpos;
2141 Py_ssize_t endinpos;
2142 Py_ssize_t outpos;
2143 PyUnicodeObject *unicode;
2144 Py_UNICODE *p;
2145#ifndef Py_UNICODE_WIDE
2146 int i, pairs;
2147#else
2148 const int pairs = 0;
2149#endif
2150 const unsigned char *q, *e;
2151 int bo = 0; /* assume native ordering by default */
2152 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002153 /* Offsets from q for retrieving bytes in the right order. */
2154#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2155 int iorder[] = {0, 1, 2, 3};
2156#else
2157 int iorder[] = {3, 2, 1, 0};
2158#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002159 PyObject *errorHandler = NULL;
2160 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002161 /* On narrow builds we split characters outside the BMP into two
2162 codepoints => count how much extra space we need. */
2163#ifndef Py_UNICODE_WIDE
2164 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002165 if (((Py_UCS4 *)s)[i] >= 0x10000)
2166 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002167#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002168
2169 /* This might be one to much, because of a BOM */
2170 unicode = _PyUnicode_New((size+3)/4+pairs);
2171 if (!unicode)
2172 return NULL;
2173 if (size == 0)
2174 return (PyObject *)unicode;
2175
2176 /* Unpack UTF-32 encoded data */
2177 p = unicode->str;
2178 q = (unsigned char *)s;
2179 e = q + size;
2180
2181 if (byteorder)
2182 bo = *byteorder;
2183
2184 /* Check for BOM marks (U+FEFF) in the input and adjust current
2185 byte order setting accordingly. In native mode, the leading BOM
2186 mark is skipped, in all other modes, it is copied to the output
2187 stream as-is (giving a ZWNBSP character). */
2188 if (bo == 0) {
2189 if (size >= 4) {
2190 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002191 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002192#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002193 if (bom == 0x0000FEFF) {
2194 q += 4;
2195 bo = -1;
2196 }
2197 else if (bom == 0xFFFE0000) {
2198 q += 4;
2199 bo = 1;
2200 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002201#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002202 if (bom == 0x0000FEFF) {
2203 q += 4;
2204 bo = 1;
2205 }
2206 else if (bom == 0xFFFE0000) {
2207 q += 4;
2208 bo = -1;
2209 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002210#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002211 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002212 }
2213
2214 if (bo == -1) {
2215 /* force LE */
2216 iorder[0] = 0;
2217 iorder[1] = 1;
2218 iorder[2] = 2;
2219 iorder[3] = 3;
2220 }
2221 else if (bo == 1) {
2222 /* force BE */
2223 iorder[0] = 3;
2224 iorder[1] = 2;
2225 iorder[2] = 1;
2226 iorder[3] = 0;
2227 }
2228
2229 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002230 Py_UCS4 ch;
2231 /* remaining bytes at the end? (size should be divisible by 4) */
2232 if (e-q<4) {
2233 if (consumed)
2234 break;
2235 errmsg = "truncated data";
2236 startinpos = ((const char *)q)-starts;
2237 endinpos = ((const char *)e)-starts;
2238 goto utf32Error;
2239 /* The remaining input chars are ignored if the callback
2240 chooses to skip the input */
2241 }
2242 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2243 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002244
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002245 if (ch >= 0x110000)
2246 {
2247 errmsg = "codepoint not in range(0x110000)";
2248 startinpos = ((const char *)q)-starts;
2249 endinpos = startinpos+4;
2250 goto utf32Error;
2251 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002252#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002253 if (ch >= 0x10000)
2254 {
2255 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2256 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2257 }
2258 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 *p++ = ch;
2261 q += 4;
2262 continue;
2263 utf32Error:
2264 outpos = p-PyUnicode_AS_UNICODE(unicode);
2265 if (unicode_decode_call_errorhandler(
2266 errors, &errorHandler,
2267 "utf32", errmsg,
2268 starts, size, &startinpos, &endinpos, &exc, &s,
2269 &unicode, &outpos, &p))
2270 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002271 }
2272
2273 if (byteorder)
2274 *byteorder = bo;
2275
2276 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002277 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002278
2279 /* Adjust length */
2280 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2281 goto onError;
2282
2283 Py_XDECREF(errorHandler);
2284 Py_XDECREF(exc);
2285 return (PyObject *)unicode;
2286
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002287 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002288 Py_DECREF(unicode);
2289 Py_XDECREF(errorHandler);
2290 Py_XDECREF(exc);
2291 return NULL;
2292}
2293
2294PyObject *
2295PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002296 Py_ssize_t size,
2297 const char *errors,
2298 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002299{
2300 PyObject *v;
2301 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002302 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002303#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002304 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002305#else
2306 const int pairs = 0;
2307#endif
2308 /* Offsets from p for storing byte pairs in the right order. */
2309#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2310 int iorder[] = {0, 1, 2, 3};
2311#else
2312 int iorder[] = {3, 2, 1, 0};
2313#endif
2314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002315#define STORECHAR(CH) \
2316 do { \
2317 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2318 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2319 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2320 p[iorder[0]] = (CH) & 0xff; \
2321 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002322 } while(0)
2323
2324 /* In narrow builds we can output surrogate pairs as one codepoint,
2325 so we need less space. */
2326#ifndef Py_UNICODE_WIDE
2327 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002328 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2329 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2330 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002331#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002332 nsize = (size - pairs + (byteorder == 0));
2333 bytesize = nsize * 4;
2334 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002335 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002336 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002337 if (v == NULL)
2338 return NULL;
2339
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002340 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002341 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002342 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002343 if (size == 0)
2344 return v;
2345
2346 if (byteorder == -1) {
2347 /* force LE */
2348 iorder[0] = 0;
2349 iorder[1] = 1;
2350 iorder[2] = 2;
2351 iorder[3] = 3;
2352 }
2353 else if (byteorder == 1) {
2354 /* force BE */
2355 iorder[0] = 3;
2356 iorder[1] = 2;
2357 iorder[2] = 1;
2358 iorder[3] = 0;
2359 }
2360
2361 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002362 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002363#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002364 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2365 Py_UCS4 ch2 = *s;
2366 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2367 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2368 s++;
2369 size--;
2370 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002371 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#endif
2373 STORECHAR(ch);
2374 }
2375 return v;
2376#undef STORECHAR
2377}
2378
2379PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2380{
2381 if (!PyUnicode_Check(unicode)) {
2382 PyErr_BadArgument();
2383 return NULL;
2384 }
2385 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002386 PyUnicode_GET_SIZE(unicode),
2387 NULL,
2388 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002389}
2390
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391/* --- UTF-16 Codec ------------------------------------------------------- */
2392
Tim Peters772747b2001-08-09 22:21:55 +00002393PyObject *
2394PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002395 Py_ssize_t size,
2396 const char *errors,
2397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398{
Walter Dörwald69652032004-09-07 20:24:22 +00002399 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2400}
2401
2402PyObject *
2403PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002404 Py_ssize_t size,
2405 const char *errors,
2406 int *byteorder,
2407 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002409 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002410 Py_ssize_t startinpos;
2411 Py_ssize_t endinpos;
2412 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 PyUnicodeObject *unicode;
2414 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002415 const unsigned char *q, *e;
2416 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002417 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002418 /* Offsets from q for retrieving byte pairs in the right order. */
2419#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2420 int ihi = 1, ilo = 0;
2421#else
2422 int ihi = 0, ilo = 1;
2423#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002424 PyObject *errorHandler = NULL;
2425 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426
2427 /* Note: size will always be longer than the resulting Unicode
2428 character count */
2429 unicode = _PyUnicode_New(size);
2430 if (!unicode)
2431 return NULL;
2432 if (size == 0)
2433 return (PyObject *)unicode;
2434
2435 /* Unpack UTF-16 encoded data */
2436 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002437 q = (unsigned char *)s;
2438 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439
2440 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002441 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002443 /* Check for BOM marks (U+FEFF) in the input and adjust current
2444 byte order setting accordingly. In native mode, the leading BOM
2445 mark is skipped, in all other modes, it is copied to the output
2446 stream as-is (giving a ZWNBSP character). */
2447 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002448 if (size >= 2) {
2449 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002450#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002451 if (bom == 0xFEFF) {
2452 q += 2;
2453 bo = -1;
2454 }
2455 else if (bom == 0xFFFE) {
2456 q += 2;
2457 bo = 1;
2458 }
Tim Petersced69f82003-09-16 20:30:58 +00002459#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002460 if (bom == 0xFEFF) {
2461 q += 2;
2462 bo = 1;
2463 }
2464 else if (bom == 0xFFFE) {
2465 q += 2;
2466 bo = -1;
2467 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002468#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002469 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471
Tim Peters772747b2001-08-09 22:21:55 +00002472 if (bo == -1) {
2473 /* force LE */
2474 ihi = 1;
2475 ilo = 0;
2476 }
2477 else if (bo == 1) {
2478 /* force BE */
2479 ihi = 0;
2480 ilo = 1;
2481 }
2482
2483 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002484 Py_UNICODE ch;
2485 /* remaining bytes at the end? (size should be even) */
2486 if (e-q<2) {
2487 if (consumed)
2488 break;
2489 errmsg = "truncated data";
2490 startinpos = ((const char *)q)-starts;
2491 endinpos = ((const char *)e)-starts;
2492 goto utf16Error;
2493 /* The remaining input chars are ignored if the callback
2494 chooses to skip the input */
2495 }
2496 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002497
Benjamin Peterson857ce152009-01-31 16:29:18 +00002498 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002499
2500 if (ch < 0xD800 || ch > 0xDFFF) {
2501 *p++ = ch;
2502 continue;
2503 }
2504
2505 /* UTF-16 code pair: */
2506 if (q >= e) {
2507 errmsg = "unexpected end of data";
2508 startinpos = (((const char *)q)-2)-starts;
2509 endinpos = ((const char *)e)-starts;
2510 goto utf16Error;
2511 }
2512 if (0xD800 <= ch && ch <= 0xDBFF) {
2513 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2514 q += 2;
2515 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002516#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002517 *p++ = ch;
2518 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002519#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002520 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002521#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002522 continue;
2523 }
2524 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002525 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002526 startinpos = (((const char *)q)-4)-starts;
2527 endinpos = startinpos+2;
2528 goto utf16Error;
2529 }
2530
Benjamin Peterson857ce152009-01-31 16:29:18 +00002531 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002532 errmsg = "illegal encoding";
2533 startinpos = (((const char *)q)-2)-starts;
2534 endinpos = startinpos+2;
2535 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002536
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002537 utf16Error:
2538 outpos = p-PyUnicode_AS_UNICODE(unicode);
2539 if (unicode_decode_call_errorhandler(
2540 errors, &errorHandler,
2541 "utf16", errmsg,
2542 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2543 &unicode, &outpos, &p))
2544 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 }
2546
2547 if (byteorder)
2548 *byteorder = bo;
2549
Walter Dörwald69652032004-09-07 20:24:22 +00002550 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002551 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002552
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002554 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 goto onError;
2556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002557 Py_XDECREF(errorHandler);
2558 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 return (PyObject *)unicode;
2560
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002561 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 Py_XDECREF(errorHandler);
2564 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565 return NULL;
2566}
2567
Tim Peters772747b2001-08-09 22:21:55 +00002568PyObject *
2569PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002570 Py_ssize_t size,
2571 const char *errors,
2572 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573{
2574 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002575 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002576 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002577#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002578 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002579#else
2580 const int pairs = 0;
2581#endif
Tim Peters772747b2001-08-09 22:21:55 +00002582 /* Offsets from p for storing byte pairs in the right order. */
2583#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2584 int ihi = 1, ilo = 0;
2585#else
2586 int ihi = 0, ilo = 1;
2587#endif
2588
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589#define STORECHAR(CH) \
2590 do { \
2591 p[ihi] = ((CH) >> 8) & 0xff; \
2592 p[ilo] = (CH) & 0xff; \
2593 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002594 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002596#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002597 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002598 if (s[i] >= 0x10000)
2599 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002600#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002601 /* 2 * (size + pairs + (byteorder == 0)) */
2602 if (size > PY_SSIZE_T_MAX ||
2603 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002604 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002605 nsize = size + pairs + (byteorder == 0);
2606 bytesize = nsize * 2;
2607 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002608 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002609 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 if (v == NULL)
2611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002613 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002615 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002616 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002617 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002618
2619 if (byteorder == -1) {
2620 /* force LE */
2621 ihi = 1;
2622 ilo = 0;
2623 }
2624 else if (byteorder == 1) {
2625 /* force BE */
2626 ihi = 0;
2627 ilo = 1;
2628 }
2629
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002630 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631 Py_UNICODE ch = *s++;
2632 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002633#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002634 if (ch >= 0x10000) {
2635 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2636 ch = 0xD800 | ((ch-0x10000) >> 10);
2637 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002638#endif
Tim Peters772747b2001-08-09 22:21:55 +00002639 STORECHAR(ch);
2640 if (ch2)
2641 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002644#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645}
2646
2647PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2648{
2649 if (!PyUnicode_Check(unicode)) {
2650 PyErr_BadArgument();
2651 return NULL;
2652 }
2653 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002654 PyUnicode_GET_SIZE(unicode),
2655 NULL,
2656 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657}
2658
2659/* --- Unicode Escape Codec ----------------------------------------------- */
2660
Fredrik Lundh06d12682001-01-24 07:59:11 +00002661static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002662
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002664 Py_ssize_t size,
2665 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002668 Py_ssize_t startinpos;
2669 Py_ssize_t endinpos;
2670 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002675 char* message;
2676 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 PyObject *errorHandler = NULL;
2678 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002679
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 /* Escaped strings will always be longer than the resulting
2681 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 length after conversion to the true value.
2683 (but if the error callback returns a long replacement string
2684 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 v = _PyUnicode_New(size);
2686 if (v == NULL)
2687 goto onError;
2688 if (size == 0)
2689 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002693
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 while (s < end) {
2695 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002696 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 /* Non-escape characters are interpreted as Unicode ordinals */
2700 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 continue;
2703 }
2704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 /* \ - Escapes */
2707 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002708 c = *s++;
2709 if (s > end)
2710 c = '\0'; /* Invalid after \ */
2711 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002713 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 case '\n': break;
2715 case '\\': *p++ = '\\'; break;
2716 case '\'': *p++ = '\''; break;
2717 case '\"': *p++ = '\"'; break;
2718 case 'b': *p++ = '\b'; break;
2719 case 'f': *p++ = '\014'; break; /* FF */
2720 case 't': *p++ = '\t'; break;
2721 case 'n': *p++ = '\n'; break;
2722 case 'r': *p++ = '\r'; break;
2723 case 'v': *p++ = '\013'; break; /* VT */
2724 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2725
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002726 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 case '0': case '1': case '2': case '3':
2728 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002729 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002730 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002731 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002732 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002733 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002735 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 break;
2737
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002738 /* hex escapes */
2739 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741 digits = 2;
2742 message = "truncated \\xXX escape";
2743 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002745 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747 digits = 4;
2748 message = "truncated \\uXXXX escape";
2749 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002751 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 digits = 8;
2754 message = "truncated \\UXXXXXXXX escape";
2755 hexescape:
2756 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 outpos = p-PyUnicode_AS_UNICODE(v);
2758 if (s+digits>end) {
2759 endinpos = size;
2760 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002761 errors, &errorHandler,
2762 "unicodeescape", "end of string in escape sequence",
2763 starts, size, &startinpos, &endinpos, &exc, &s,
2764 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 goto onError;
2766 goto nextByte;
2767 }
2768 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002769 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002770 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002771 endinpos = (s+i+1)-starts;
2772 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002773 errors, &errorHandler,
2774 "unicodeescape", message,
2775 starts, size, &startinpos, &endinpos, &exc, &s,
2776 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002777 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002778 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002779 }
2780 chr = (chr<<4) & ~0xF;
2781 if (c >= '0' && c <= '9')
2782 chr += c - '0';
2783 else if (c >= 'a' && c <= 'f')
2784 chr += 10 + c - 'a';
2785 else
2786 chr += 10 + c - 'A';
2787 }
2788 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002789 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 /* _decoding_error will have already written into the
2791 target buffer. */
2792 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002794 /* when we get here, chr is a 32-bit unicode character */
2795 if (chr <= 0xffff)
2796 /* UCS-2 character */
2797 *p++ = (Py_UNICODE) chr;
2798 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002799 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002800 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002801#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002802 *p++ = chr;
2803#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002804 chr -= 0x10000L;
2805 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002806 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002807#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002808 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 endinpos = s-starts;
2810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002812 errors, &errorHandler,
2813 "unicodeescape", "illegal Unicode character",
2814 starts, size, &startinpos, &endinpos, &exc, &s,
2815 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002816 goto onError;
2817 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 break;
2819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002820 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002821 case 'N':
2822 message = "malformed \\N character escape";
2823 if (ucnhash_CAPI == NULL) {
2824 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002825 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002826 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002827 if (m == NULL)
2828 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002829 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002830 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002831 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002832 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002833 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002834 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835 if (ucnhash_CAPI == NULL)
2836 goto ucnhashError;
2837 }
2838 if (*s == '{') {
2839 const char *start = s+1;
2840 /* look for the closing brace */
2841 while (*s != '}' && s < end)
2842 s++;
2843 if (s > start && s < end && *s == '}') {
2844 /* found a name. look it up in the unicode database */
2845 message = "unknown Unicode character name";
2846 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002847 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002848 goto store;
2849 }
2850 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 endinpos = s-starts;
2852 outpos = p-PyUnicode_AS_UNICODE(v);
2853 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002854 errors, &errorHandler,
2855 "unicodeescape", message,
2856 starts, size, &startinpos, &endinpos, &exc, &s,
2857 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002858 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002859 break;
2860
2861 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002862 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 message = "\\ at end of string";
2864 s--;
2865 endinpos = s-starts;
2866 outpos = p-PyUnicode_AS_UNICODE(v);
2867 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002868 errors, &errorHandler,
2869 "unicodeescape", message,
2870 starts, size, &startinpos, &endinpos, &exc, &s,
2871 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002872 goto onError;
2873 }
2874 else {
2875 *p++ = '\\';
2876 *p++ = (unsigned char)s[-1];
2877 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002880 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002883 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002884 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002885 Py_XDECREF(errorHandler);
2886 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002888
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002889 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002890 PyErr_SetString(
2891 PyExc_UnicodeError,
2892 "\\N escapes not supported (can't load unicodedata module)"
2893 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002894 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002895 Py_XDECREF(errorHandler);
2896 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002897 return NULL;
2898
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002899 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002901 Py_XDECREF(errorHandler);
2902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 return NULL;
2904}
2905
2906/* Return a Unicode-Escape string version of the Unicode object.
2907
2908 If quotes is true, the string is enclosed in u"" or u'' quotes as
2909 appropriate.
2910
2911*/
2912
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002913Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002914 Py_ssize_t size,
2915 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002916{
2917 /* like wcschr, but doesn't stop at NULL characters */
2918
2919 while (size-- > 0) {
2920 if (*s == ch)
2921 return s;
2922 s++;
2923 }
2924
2925 return NULL;
2926}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002927
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928static
2929PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002930 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 int quotes)
2932{
2933 PyObject *repr;
2934 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002936 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002937#ifdef Py_UNICODE_WIDE
2938 const Py_ssize_t expandsize = 10;
2939#else
2940 const Py_ssize_t expandsize = 6;
2941#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942
Neal Norwitz17753ec2006-08-21 22:21:19 +00002943 /* XXX(nnorwitz): rather than over-allocating, it would be
2944 better to choose a different scheme. Perhaps scan the
2945 first N-chars of the string and allocate based on that size.
2946 */
2947 /* Initial allocation is based on the longest-possible unichr
2948 escape.
2949
2950 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2951 unichr, so in this case it's the longest unichr escape. In
2952 narrow (UTF-16) builds this is five chars per source unichr
2953 since there are two unichrs in the surrogate pair, so in narrow
2954 (UTF-16) builds it's not the longest unichr escape.
2955
2956 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2957 so in the narrow (UTF-16) build case it's the longest unichr
2958 escape.
2959 */
2960
Neal Norwitze7d8be82008-07-31 17:17:14 +00002961 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002962 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002963
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002964 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002965 2
2966 + expandsize*size
2967 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 if (repr == NULL)
2969 return NULL;
2970
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002971 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972
2973 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002975 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 !findchar(s, size, '"')) ? '"' : '\'';
2977 }
2978 while (size-- > 0) {
2979 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002980
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002981 /* Escape quotes and backslashes */
2982 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002983 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 *p++ = '\\';
2985 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002986 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002989#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002990 /* Map 21-bit characters to '\U00xxxxxx' */
2991 else if (ch >= 0x10000) {
2992 *p++ = '\\';
2993 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002994 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2995 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2996 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2997 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2998 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2999 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3000 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003001 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003002 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003003 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003004#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003005 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3006 else if (ch >= 0xD800 && ch < 0xDC00) {
3007 Py_UNICODE ch2;
3008 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003009
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003010 ch2 = *s++;
3011 size--;
3012 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3013 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3014 *p++ = '\\';
3015 *p++ = 'U';
3016 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3017 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3018 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3019 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3020 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3021 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3022 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3023 *p++ = hexdigit[ucs & 0x0000000F];
3024 continue;
3025 }
3026 /* Fall through: isolated surrogates are copied as-is */
3027 s--;
3028 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003029 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003030#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003031
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 *p++ = '\\';
3035 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003036 *p++ = hexdigit[(ch >> 12) & 0x000F];
3037 *p++ = hexdigit[(ch >> 8) & 0x000F];
3038 *p++ = hexdigit[(ch >> 4) & 0x000F];
3039 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003041
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003042 /* Map special whitespace to '\t', \n', '\r' */
3043 else if (ch == '\t') {
3044 *p++ = '\\';
3045 *p++ = 't';
3046 }
3047 else if (ch == '\n') {
3048 *p++ = '\\';
3049 *p++ = 'n';
3050 }
3051 else if (ch == '\r') {
3052 *p++ = '\\';
3053 *p++ = 'r';
3054 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003055
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003056 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003057 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003059 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003060 *p++ = hexdigit[(ch >> 4) & 0x000F];
3061 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003062 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 /* Copy everything else as-is */
3065 else
3066 *p++ = (char) ch;
3067 }
3068 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003069 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070
3071 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003072 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 return repr;
3074}
3075
3076PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003077 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078{
3079 return unicodeescape_string(s, size, 0);
3080}
3081
3082PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3083{
3084 if (!PyUnicode_Check(unicode)) {
3085 PyErr_BadArgument();
3086 return NULL;
3087 }
3088 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003089 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090}
3091
3092/* --- Raw Unicode Escape Codec ------------------------------------------- */
3093
3094PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003095 Py_ssize_t size,
3096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003099 Py_ssize_t startinpos;
3100 Py_ssize_t endinpos;
3101 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 const char *end;
3105 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106 PyObject *errorHandler = NULL;
3107 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003108
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 /* Escaped strings will always be longer than the resulting
3110 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 length after conversion to the true value. (But decoding error
3112 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 v = _PyUnicode_New(size);
3114 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003115 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003117 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 end = s + size;
3120 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003121 unsigned char c;
3122 Py_UCS4 x;
3123 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003124 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003126 /* Non-escape characters are interpreted as Unicode ordinals */
3127 if (*s != '\\') {
3128 *p++ = (unsigned char)*s++;
3129 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003130 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003131 startinpos = s-starts;
3132
3133 /* \u-escapes are only interpreted iff the number of leading
3134 backslashes if odd */
3135 bs = s;
3136 for (;s < end;) {
3137 if (*s != '\\')
3138 break;
3139 *p++ = (unsigned char)*s++;
3140 }
3141 if (((s - bs) & 1) == 0 ||
3142 s >= end ||
3143 (*s != 'u' && *s != 'U')) {
3144 continue;
3145 }
3146 p--;
3147 count = *s=='u' ? 4 : 8;
3148 s++;
3149
3150 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3151 outpos = p-PyUnicode_AS_UNICODE(v);
3152 for (x = 0, i = 0; i < count; ++i, ++s) {
3153 c = (unsigned char)*s;
3154 if (!isxdigit(c)) {
3155 endinpos = s-starts;
3156 if (unicode_decode_call_errorhandler(
3157 errors, &errorHandler,
3158 "rawunicodeescape", "truncated \\uXXXX",
3159 starts, size, &startinpos, &endinpos, &exc, &s,
3160 &v, &outpos, &p))
3161 goto onError;
3162 goto nextByte;
3163 }
3164 x = (x<<4) & ~0xF;
3165 if (c >= '0' && c <= '9')
3166 x += c - '0';
3167 else if (c >= 'a' && c <= 'f')
3168 x += 10 + c - 'a';
3169 else
3170 x += 10 + c - 'A';
3171 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003172 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003173 /* UCS-2 character */
3174 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003175 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003176 /* UCS-4 character. Either store directly, or as
3177 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003178#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003179 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003180#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003181 x -= 0x10000L;
3182 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3183 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003184#endif
3185 } else {
3186 endinpos = s-starts;
3187 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003188 if (unicode_decode_call_errorhandler(
3189 errors, &errorHandler,
3190 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003191 starts, size, &startinpos, &endinpos, &exc, &s,
3192 &v, &outpos, &p))
3193 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003194 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003195 nextByte:
3196 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003198 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003199 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200 Py_XDECREF(errorHandler);
3201 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003203
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003204 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 Py_XDECREF(errorHandler);
3207 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 return NULL;
3209}
3210
3211PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003212 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213{
3214 PyObject *repr;
3215 char *p;
3216 char *q;
3217
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003218 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003219#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003220 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003221#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003222 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003223#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003224
Neal Norwitze7d8be82008-07-31 17:17:14 +00003225 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003226 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003227
Neal Norwitze7d8be82008-07-31 17:17:14 +00003228 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 if (repr == NULL)
3230 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003231 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003234 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 while (size-- > 0) {
3236 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003237#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 /* Map 32-bit characters to '\Uxxxxxxxx' */
3239 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003240 *p++ = '\\';
3241 *p++ = 'U';
3242 *p++ = hexdigit[(ch >> 28) & 0xf];
3243 *p++ = hexdigit[(ch >> 24) & 0xf];
3244 *p++ = hexdigit[(ch >> 20) & 0xf];
3245 *p++ = hexdigit[(ch >> 16) & 0xf];
3246 *p++ = hexdigit[(ch >> 12) & 0xf];
3247 *p++ = hexdigit[(ch >> 8) & 0xf];
3248 *p++ = hexdigit[(ch >> 4) & 0xf];
3249 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003250 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003252#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003253 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3254 if (ch >= 0xD800 && ch < 0xDC00) {
3255 Py_UNICODE ch2;
3256 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003257
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 ch2 = *s++;
3259 size--;
3260 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3261 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3262 *p++ = '\\';
3263 *p++ = 'U';
3264 *p++ = hexdigit[(ucs >> 28) & 0xf];
3265 *p++ = hexdigit[(ucs >> 24) & 0xf];
3266 *p++ = hexdigit[(ucs >> 20) & 0xf];
3267 *p++ = hexdigit[(ucs >> 16) & 0xf];
3268 *p++ = hexdigit[(ucs >> 12) & 0xf];
3269 *p++ = hexdigit[(ucs >> 8) & 0xf];
3270 *p++ = hexdigit[(ucs >> 4) & 0xf];
3271 *p++ = hexdigit[ucs & 0xf];
3272 continue;
3273 }
3274 /* Fall through: isolated surrogates are copied as-is */
3275 s--;
3276 size++;
3277 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003279 /* Map 16-bit characters to '\uxxxx' */
3280 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 *p++ = '\\';
3282 *p++ = 'u';
3283 *p++ = hexdigit[(ch >> 12) & 0xf];
3284 *p++ = hexdigit[(ch >> 8) & 0xf];
3285 *p++ = hexdigit[(ch >> 4) & 0xf];
3286 *p++ = hexdigit[ch & 15];
3287 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003288 /* Copy everything else as-is */
3289 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 *p++ = (char) ch;
3291 }
3292 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003293 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 return repr;
3295}
3296
3297PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3298{
3299 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003300 PyErr_BadArgument();
3301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
3303 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003304 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305}
3306
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003307/* --- Unicode Internal Codec ------------------------------------------- */
3308
3309PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003310 Py_ssize_t size,
3311 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003312{
3313 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003314 Py_ssize_t startinpos;
3315 Py_ssize_t endinpos;
3316 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003317 PyUnicodeObject *v;
3318 Py_UNICODE *p;
3319 const char *end;
3320 const char *reason;
3321 PyObject *errorHandler = NULL;
3322 PyObject *exc = NULL;
3323
Neal Norwitzd43069c2006-01-08 01:12:10 +00003324#ifdef Py_UNICODE_WIDE
3325 Py_UNICODE unimax = PyUnicode_GetMax();
3326#endif
3327
Armin Rigo7ccbca92006-10-04 12:17:45 +00003328 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003329 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3330 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003331 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003332 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003333 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003334 p = PyUnicode_AS_UNICODE(v);
3335 end = s + size;
3336
3337 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003338 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003339 /* We have to sanity check the raw data, otherwise doom looms for
3340 some malformed UCS-4 data. */
3341 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003342#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003343 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003344#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003345 end-s < Py_UNICODE_SIZE
3346 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003347 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003348 startinpos = s - starts;
3349 if (end-s < Py_UNICODE_SIZE) {
3350 endinpos = end-starts;
3351 reason = "truncated input";
3352 }
3353 else {
3354 endinpos = s - starts + Py_UNICODE_SIZE;
3355 reason = "illegal code point (> 0x10FFFF)";
3356 }
3357 outpos = p - PyUnicode_AS_UNICODE(v);
3358 if (unicode_decode_call_errorhandler(
3359 errors, &errorHandler,
3360 "unicode_internal", reason,
3361 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003362 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003363 goto onError;
3364 }
3365 }
3366 else {
3367 p++;
3368 s += Py_UNICODE_SIZE;
3369 }
3370 }
3371
Martin v. Löwis412fb672006-04-13 06:34:32 +00003372 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003373 goto onError;
3374 Py_XDECREF(errorHandler);
3375 Py_XDECREF(exc);
3376 return (PyObject *)v;
3377
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003378 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003379 Py_XDECREF(v);
3380 Py_XDECREF(errorHandler);
3381 Py_XDECREF(exc);
3382 return NULL;
3383}
3384
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385/* --- Latin-1 Codec ------------------------------------------------------ */
3386
3387PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003388 Py_ssize_t size,
3389 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390{
3391 PyUnicodeObject *v;
3392 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003393
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003395 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003396 Py_UNICODE r = *(unsigned char*)s;
3397 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003398 }
3399
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 v = _PyUnicode_New(size);
3401 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003402 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 p = PyUnicode_AS_UNICODE(v);
3406 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003407 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003409
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003410 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 Py_XDECREF(v);
3412 return NULL;
3413}
3414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415/* create or adjust a UnicodeEncodeError */
3416static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003417 const char *encoding,
3418 const Py_UNICODE *unicode, Py_ssize_t size,
3419 Py_ssize_t startpos, Py_ssize_t endpos,
3420 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003423 *exceptionObject = PyUnicodeEncodeError_Create(
3424 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 }
3426 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003427 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3428 goto onError;
3429 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3430 goto onError;
3431 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3432 goto onError;
3433 return;
3434 onError:
3435 Py_DECREF(*exceptionObject);
3436 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 }
3438}
3439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440/* raises a UnicodeEncodeError */
3441static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 const char *encoding,
3443 const Py_UNICODE *unicode, Py_ssize_t size,
3444 Py_ssize_t startpos, Py_ssize_t endpos,
3445 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446{
3447 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003448 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003450 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451}
3452
3453/* error handling callback helper:
3454 build arguments, call the callback and check the arguments,
3455 put the result into newpos and return the replacement string, which
3456 has to be freed by the caller */
3457static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003458 PyObject **errorHandler,
3459 const char *encoding, const char *reason,
3460 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3461 Py_ssize_t startpos, Py_ssize_t endpos,
3462 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003464 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465
3466 PyObject *restuple;
3467 PyObject *resunicode;
3468
3469 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003470 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003472 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 }
3474
3475 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003476 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003478 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479
3480 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003483 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003485 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003486 Py_DECREF(restuple);
3487 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 }
3489 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003490 &resunicode, newpos)) {
3491 Py_DECREF(restuple);
3492 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 }
3494 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003495 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003496 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003497 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3498 Py_DECREF(restuple);
3499 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003500 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 Py_INCREF(resunicode);
3502 Py_DECREF(restuple);
3503 return resunicode;
3504}
3505
3506static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 Py_ssize_t size,
3508 const char *errors,
3509 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510{
3511 /* output object */
3512 PyObject *res;
3513 /* pointers to the beginning and end+1 of input */
3514 const Py_UNICODE *startp = p;
3515 const Py_UNICODE *endp = p + size;
3516 /* pointer to the beginning of the unencodable characters */
3517 /* const Py_UNICODE *badp = NULL; */
3518 /* pointer into the output */
3519 char *str;
3520 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003521 Py_ssize_t respos = 0;
3522 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003523 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3524 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 PyObject *errorHandler = NULL;
3526 PyObject *exc = NULL;
3527 /* the following variable is used for caching string comparisons
3528 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3529 int known_errorHandler = -1;
3530
3531 /* allocate enough for a simple encoding without
3532 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003533 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 if (res == NULL)
3535 goto onError;
3536 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003537 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003538 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 ressize = size;
3540
3541 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003544 /* can we encode this? */
3545 if (c<limit) {
3546 /* no overflow check, because we know that the space is enough */
3547 *str++ = (char)c;
3548 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003549 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 else {
3551 Py_ssize_t unicodepos = p-startp;
3552 Py_ssize_t requiredsize;
3553 PyObject *repunicode;
3554 Py_ssize_t repsize;
3555 Py_ssize_t newpos;
3556 Py_ssize_t respos;
3557 Py_UNICODE *uni2;
3558 /* startpos for collecting unencodable chars */
3559 const Py_UNICODE *collstart = p;
3560 const Py_UNICODE *collend = p;
3561 /* find all unecodable characters */
3562 while ((collend < endp) && ((*collend)>=limit))
3563 ++collend;
3564 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3565 if (known_errorHandler==-1) {
3566 if ((errors==NULL) || (!strcmp(errors, "strict")))
3567 known_errorHandler = 1;
3568 else if (!strcmp(errors, "replace"))
3569 known_errorHandler = 2;
3570 else if (!strcmp(errors, "ignore"))
3571 known_errorHandler = 3;
3572 else if (!strcmp(errors, "xmlcharrefreplace"))
3573 known_errorHandler = 4;
3574 else
3575 known_errorHandler = 0;
3576 }
3577 switch (known_errorHandler) {
3578 case 1: /* strict */
3579 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3580 goto onError;
3581 case 2: /* replace */
3582 while (collstart++<collend)
3583 *str++ = '?'; /* fall through */
3584 case 3: /* ignore */
3585 p = collend;
3586 break;
3587 case 4: /* xmlcharrefreplace */
3588 respos = str-PyString_AS_STRING(res);
3589 /* determine replacement size (temporarily (mis)uses p) */
3590 for (p = collstart, repsize = 0; p < collend; ++p) {
3591 if (*p<10)
3592 repsize += 2+1+1;
3593 else if (*p<100)
3594 repsize += 2+2+1;
3595 else if (*p<1000)
3596 repsize += 2+3+1;
3597 else if (*p<10000)
3598 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003599#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003600 else
3601 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003602#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003603 else if (*p<100000)
3604 repsize += 2+5+1;
3605 else if (*p<1000000)
3606 repsize += 2+6+1;
3607 else
3608 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003609#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003610 }
3611 requiredsize = respos+repsize+(endp-collend);
3612 if (requiredsize > ressize) {
3613 if (requiredsize<2*ressize)
3614 requiredsize = 2*ressize;
3615 if (_PyString_Resize(&res, requiredsize))
3616 goto onError;
3617 str = PyString_AS_STRING(res) + respos;
3618 ressize = requiredsize;
3619 }
3620 /* generate replacement (temporarily (mis)uses p) */
3621 for (p = collstart; p < collend; ++p) {
3622 str += sprintf(str, "&#%d;", (int)*p);
3623 }
3624 p = collend;
3625 break;
3626 default:
3627 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3628 encoding, reason, startp, size, &exc,
3629 collstart-startp, collend-startp, &newpos);
3630 if (repunicode == NULL)
3631 goto onError;
3632 /* need more space? (at least enough for what we have+the
3633 replacement+the rest of the string, so we won't have to
3634 check space for encodable characters) */
3635 respos = str-PyString_AS_STRING(res);
3636 repsize = PyUnicode_GET_SIZE(repunicode);
3637 requiredsize = respos+repsize+(endp-collend);
3638 if (requiredsize > ressize) {
3639 if (requiredsize<2*ressize)
3640 requiredsize = 2*ressize;
3641 if (_PyString_Resize(&res, requiredsize)) {
3642 Py_DECREF(repunicode);
3643 goto onError;
3644 }
3645 str = PyString_AS_STRING(res) + respos;
3646 ressize = requiredsize;
3647 }
3648 /* check if there is anything unencodable in the replacement
3649 and copy it to the output */
3650 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3651 c = *uni2;
3652 if (c >= limit) {
3653 raise_encode_exception(&exc, encoding, startp, size,
3654 unicodepos, unicodepos+1, reason);
3655 Py_DECREF(repunicode);
3656 goto onError;
3657 }
3658 *str = (char)c;
3659 }
3660 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003661 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003662 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003663 }
3664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003666 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003668 /* If this falls res will be NULL */
3669 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 Py_XDECREF(errorHandler);
3671 Py_XDECREF(exc);
3672 return res;
3673
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003674 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(res);
3676 Py_XDECREF(errorHandler);
3677 Py_XDECREF(exc);
3678 return NULL;
3679}
3680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003682 Py_ssize_t size,
3683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686}
3687
3688PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3689{
3690 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003691 PyErr_BadArgument();
3692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 }
3694 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003695 PyUnicode_GET_SIZE(unicode),
3696 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697}
3698
3699/* --- 7-bit ASCII Codec -------------------------------------------------- */
3700
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003702 Py_ssize_t size,
3703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 PyUnicodeObject *v;
3707 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003708 Py_ssize_t startinpos;
3709 Py_ssize_t endinpos;
3710 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 const char *e;
3712 PyObject *errorHandler = NULL;
3713 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003714
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003716 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003717 Py_UNICODE r = *(unsigned char*)s;
3718 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003719 }
Tim Petersced69f82003-09-16 20:30:58 +00003720
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 v = _PyUnicode_New(size);
3722 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003723 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003725 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 e = s + size;
3728 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003729 register unsigned char c = (unsigned char)*s;
3730 if (c < 128) {
3731 *p++ = c;
3732 ++s;
3733 }
3734 else {
3735 startinpos = s-starts;
3736 endinpos = startinpos + 1;
3737 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3738 if (unicode_decode_call_errorhandler(
3739 errors, &errorHandler,
3740 "ascii", "ordinal not in range(128)",
3741 starts, size, &startinpos, &endinpos, &exc, &s,
3742 &v, &outpos, &p))
3743 goto onError;
3744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003746 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003747 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3748 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 Py_XDECREF(errorHandler);
3750 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003752
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003753 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 Py_XDECREF(errorHandler);
3756 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 return NULL;
3758}
3759
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003761 Py_ssize_t size,
3762 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765}
3766
3767PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3768{
3769 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003770 PyErr_BadArgument();
3771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 }
3773 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003774 PyUnicode_GET_SIZE(unicode),
3775 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776}
3777
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003778#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003779
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003780/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003781
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003782#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003783#define NEED_RETRY
3784#endif
3785
3786/* XXX This code is limited to "true" double-byte encodings, as
3787 a) it assumes an incomplete character consists of a single byte, and
3788 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003790
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must first satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no earlier character (it returns `curr` itself), when
   the previous character does not start with a lead byte, or when the
   previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3801
3802/*
3803 * Decode MBCS string into unicode object. If 'final' is set, converts
3804 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3805 */
3806static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003807 const char *s, /* MBCS string */
3808 int size, /* sizeof MBCS string */
3809 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003810{
3811 Py_UNICODE *p;
3812 Py_ssize_t n = 0;
3813 int usize = 0;
3814
3815 assert(size >= 0);
3816
3817 /* Skip trailing lead-byte unless 'final' is set */
3818 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003819 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003820
3821 /* First get the size of the result */
3822 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003823 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3824 if (usize == 0) {
3825 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3826 return -1;
3827 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003828 }
3829
3830 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003831 /* Create unicode object */
3832 *v = _PyUnicode_New(usize);
3833 if (*v == NULL)
3834 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003835 }
3836 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003837 /* Extend unicode object */
3838 n = PyUnicode_GET_SIZE(*v);
3839 if (_PyUnicode_Resize(v, n + usize) < 0)
3840 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003841 }
3842
3843 /* Do the conversion */
3844 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003845 p = PyUnicode_AS_UNICODE(*v) + n;
3846 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3847 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3848 return -1;
3849 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003850 }
3851
3852 return size;
3853}
3854
3855PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003856 Py_ssize_t size,
3857 const char *errors,
3858 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003859{
3860 PyUnicodeObject *v = NULL;
3861 int done;
3862
3863 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003865
3866#ifdef NEED_RETRY
3867 retry:
3868 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003869 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003870 else
3871#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003872 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003873
3874 if (done < 0) {
3875 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003876 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877 }
3878
3879 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003880 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003881
3882#ifdef NEED_RETRY
3883 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003884 s += done;
3885 size -= done;
3886 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003887 }
3888#endif
3889
3890 return (PyObject *)v;
3891}
3892
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003893PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003894 Py_ssize_t size,
3895 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003896{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003897 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3898}
3899
3900/*
3901 * Convert unicode into string object (MBCS).
3902 * Returns 0 if succeed, -1 otherwise.
3903 */
3904static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003905 const Py_UNICODE *p, /* unicode */
3906 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907{
3908 int mbcssize = 0;
3909 Py_ssize_t n = 0;
3910
3911 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003912
3913 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003915 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3916 if (mbcssize == 0) {
3917 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3918 return -1;
3919 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003920 }
3921
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003923 /* Create string object */
3924 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3925 if (*repr == NULL)
3926 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003927 }
3928 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 /* Extend string object */
3930 n = PyString_Size(*repr);
3931 if (_PyString_Resize(repr, n + mbcssize) < 0)
3932 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933 }
3934
3935 /* Do the conversion */
3936 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003937 char *s = PyString_AS_STRING(*repr) + n;
3938 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3939 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3940 return -1;
3941 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003942 }
3943
3944 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003945}
3946
3947PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 Py_ssize_t size,
3949 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003951 PyObject *repr = NULL;
3952 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003953
Martin v. Löwisd8251432006-06-14 05:21:04 +00003954#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003955 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003956 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003957 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003958 else
3959#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003960 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003961
Martin v. Löwisd8251432006-06-14 05:21:04 +00003962 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003963 Py_XDECREF(repr);
3964 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003965 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003966
3967#ifdef NEED_RETRY
3968 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 p += INT_MAX;
3970 size -= INT_MAX;
3971 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972 }
3973#endif
3974
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003975 return repr;
3976}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003977
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003978PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3979{
3980 if (!PyUnicode_Check(unicode)) {
3981 PyErr_BadArgument();
3982 return NULL;
3983 }
3984 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003985 PyUnicode_GET_SIZE(unicode),
3986 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003987}
3988
Martin v. Löwisd8251432006-06-14 05:21:04 +00003989#undef NEED_RETRY
3990
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003991#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003992
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993/* --- Character Mapping Codec -------------------------------------------- */
3994
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003996 Py_ssize_t size,
3997 PyObject *mapping,
3998 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004001 Py_ssize_t startinpos;
4002 Py_ssize_t endinpos;
4003 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 PyUnicodeObject *v;
4006 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004007 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008 PyObject *errorHandler = NULL;
4009 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004010 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004011 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004012
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 /* Default to Latin-1 */
4014 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016
4017 v = _PyUnicode_New(size);
4018 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004019 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004021 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004024 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004025 mapstring = PyUnicode_AS_UNICODE(mapping);
4026 maplen = PyUnicode_GET_SIZE(mapping);
4027 while (s < e) {
4028 unsigned char ch = *s;
4029 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004031 if (ch < maplen)
4032 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004034 if (x == 0xfffe) {
4035 /* undefined mapping */
4036 outpos = p-PyUnicode_AS_UNICODE(v);
4037 startinpos = s-starts;
4038 endinpos = startinpos+1;
4039 if (unicode_decode_call_errorhandler(
4040 errors, &errorHandler,
4041 "charmap", "character maps to <undefined>",
4042 starts, size, &startinpos, &endinpos, &exc, &s,
4043 &v, &outpos, &p)) {
4044 goto onError;
4045 }
4046 continue;
4047 }
4048 *p++ = x;
4049 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004050 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004051 }
4052 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004053 while (s < e) {
4054 unsigned char ch = *s;
4055 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004056
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004057 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4058 w = PyInt_FromLong((long)ch);
4059 if (w == NULL)
4060 goto onError;
4061 x = PyObject_GetItem(mapping, w);
4062 Py_DECREF(w);
4063 if (x == NULL) {
4064 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4065 /* No mapping found means: mapping is undefined. */
4066 PyErr_Clear();
4067 x = Py_None;
4068 Py_INCREF(x);
4069 } else
4070 goto onError;
4071 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004072
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004073 /* Apply mapping */
4074 if (PyInt_Check(x)) {
4075 long value = PyInt_AS_LONG(x);
4076 if (value < 0 || value > 65535) {
4077 PyErr_SetString(PyExc_TypeError,
4078 "character mapping must be in range(65536)");
4079 Py_DECREF(x);
4080 goto onError;
4081 }
4082 *p++ = (Py_UNICODE)value;
4083 }
4084 else if (x == Py_None) {
4085 /* undefined mapping */
4086 outpos = p-PyUnicode_AS_UNICODE(v);
4087 startinpos = s-starts;
4088 endinpos = startinpos+1;
4089 if (unicode_decode_call_errorhandler(
4090 errors, &errorHandler,
4091 "charmap", "character maps to <undefined>",
4092 starts, size, &startinpos, &endinpos, &exc, &s,
4093 &v, &outpos, &p)) {
4094 Py_DECREF(x);
4095 goto onError;
4096 }
4097 Py_DECREF(x);
4098 continue;
4099 }
4100 else if (PyUnicode_Check(x)) {
4101 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004102
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004103 if (targetsize == 1)
4104 /* 1-1 mapping */
4105 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004106
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004107 else if (targetsize > 1) {
4108 /* 1-n mapping */
4109 if (targetsize > extrachars) {
4110 /* resize first */
4111 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4112 Py_ssize_t needed = (targetsize - extrachars) + \
4113 (targetsize << 2);
4114 extrachars += needed;
4115 /* XXX overflow detection missing */
4116 if (_PyUnicode_Resize(&v,
4117 PyUnicode_GET_SIZE(v) + needed) < 0) {
4118 Py_DECREF(x);
4119 goto onError;
4120 }
4121 p = PyUnicode_AS_UNICODE(v) + oldpos;
4122 }
4123 Py_UNICODE_COPY(p,
4124 PyUnicode_AS_UNICODE(x),
4125 targetsize);
4126 p += targetsize;
4127 extrachars -= targetsize;
4128 }
4129 /* 1-0 mapping: skip the character */
4130 }
4131 else {
4132 /* wrong return value */
4133 PyErr_SetString(PyExc_TypeError,
4134 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004135 Py_DECREF(x);
4136 goto onError;
4137 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004138 Py_DECREF(x);
4139 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 }
4142 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004143 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4144 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 Py_XDECREF(errorHandler);
4146 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004148
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004149 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 Py_XDECREF(errorHandler);
4151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 Py_XDECREF(v);
4153 return NULL;
4154}
4155
Martin v. Löwis3f767792006-06-04 19:36:28 +00004156/* Charmap encoding: the lookup table */
4157
4158struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004159 PyObject_HEAD
4160 unsigned char level1[32];
4161 int count2, count3;
4162 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004163};
4164
4165static PyObject*
4166encoding_map_size(PyObject *obj, PyObject* args)
4167{
4168 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004169 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004170 128*map->count3);
4171}
4172
4173static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004174 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004175 PyDoc_STR("Return the size (in bytes) of this object") },
4176 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004177};
4178
4179static void
4180encoding_map_dealloc(PyObject* o)
4181{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004182 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004183}
4184
4185static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004186 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004187 "EncodingMap", /*tp_name*/
4188 sizeof(struct encoding_map), /*tp_basicsize*/
4189 0, /*tp_itemsize*/
4190 /* methods */
4191 encoding_map_dealloc, /*tp_dealloc*/
4192 0, /*tp_print*/
4193 0, /*tp_getattr*/
4194 0, /*tp_setattr*/
4195 0, /*tp_compare*/
4196 0, /*tp_repr*/
4197 0, /*tp_as_number*/
4198 0, /*tp_as_sequence*/
4199 0, /*tp_as_mapping*/
4200 0, /*tp_hash*/
4201 0, /*tp_call*/
4202 0, /*tp_str*/
4203 0, /*tp_getattro*/
4204 0, /*tp_setattro*/
4205 0, /*tp_as_buffer*/
4206 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4207 0, /*tp_doc*/
4208 0, /*tp_traverse*/
4209 0, /*tp_clear*/
4210 0, /*tp_richcompare*/
4211 0, /*tp_weaklistoffset*/
4212 0, /*tp_iter*/
4213 0, /*tp_iternext*/
4214 encoding_map_methods, /*tp_methods*/
4215 0, /*tp_members*/
4216 0, /*tp_getset*/
4217 0, /*tp_base*/
4218 0, /*tp_dict*/
4219 0, /*tp_descr_get*/
4220 0, /*tp_descr_set*/
4221 0, /*tp_dictoffset*/
4222 0, /*tp_init*/
4223 0, /*tp_alloc*/
4224 0, /*tp_new*/
4225 0, /*tp_free*/
4226 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004227};
4228
4229PyObject*
4230PyUnicode_BuildEncodingMap(PyObject* string)
4231{
4232 Py_UNICODE *decode;
4233 PyObject *result;
4234 struct encoding_map *mresult;
4235 int i;
4236 int need_dict = 0;
4237 unsigned char level1[32];
4238 unsigned char level2[512];
4239 unsigned char *mlevel1, *mlevel2, *mlevel3;
4240 int count2 = 0, count3 = 0;
4241
4242 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4243 PyErr_BadArgument();
4244 return NULL;
4245 }
4246 decode = PyUnicode_AS_UNICODE(string);
4247 memset(level1, 0xFF, sizeof level1);
4248 memset(level2, 0xFF, sizeof level2);
4249
4250 /* If there isn't a one-to-one mapping of NULL to \0,
4251 or if there are non-BMP characters, we need to use
4252 a mapping dictionary. */
4253 if (decode[0] != 0)
4254 need_dict = 1;
4255 for (i = 1; i < 256; i++) {
4256 int l1, l2;
4257 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004258#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004259 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004260#endif
4261 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004262 need_dict = 1;
4263 break;
4264 }
4265 if (decode[i] == 0xFFFE)
4266 /* unmapped character */
4267 continue;
4268 l1 = decode[i] >> 11;
4269 l2 = decode[i] >> 7;
4270 if (level1[l1] == 0xFF)
4271 level1[l1] = count2++;
4272 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004273 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004274 }
4275
4276 if (count2 >= 0xFF || count3 >= 0xFF)
4277 need_dict = 1;
4278
4279 if (need_dict) {
4280 PyObject *result = PyDict_New();
4281 PyObject *key, *value;
4282 if (!result)
4283 return NULL;
4284 for (i = 0; i < 256; i++) {
4285 key = value = NULL;
4286 key = PyInt_FromLong(decode[i]);
4287 value = PyInt_FromLong(i);
4288 if (!key || !value)
4289 goto failed1;
4290 if (PyDict_SetItem(result, key, value) == -1)
4291 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004292 Py_DECREF(key);
4293 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004294 }
4295 return result;
4296 failed1:
4297 Py_XDECREF(key);
4298 Py_XDECREF(value);
4299 Py_DECREF(result);
4300 return NULL;
4301 }
4302
4303 /* Create a three-level trie */
4304 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4305 16*count2 + 128*count3 - 1);
4306 if (!result)
4307 return PyErr_NoMemory();
4308 PyObject_Init(result, &EncodingMapType);
4309 mresult = (struct encoding_map*)result;
4310 mresult->count2 = count2;
4311 mresult->count3 = count3;
4312 mlevel1 = mresult->level1;
4313 mlevel2 = mresult->level23;
4314 mlevel3 = mresult->level23 + 16*count2;
4315 memcpy(mlevel1, level1, 32);
4316 memset(mlevel2, 0xFF, 16*count2);
4317 memset(mlevel3, 0, 128*count3);
4318 count3 = 0;
4319 for (i = 1; i < 256; i++) {
4320 int o1, o2, o3, i2, i3;
4321 if (decode[i] == 0xFFFE)
4322 /* unmapped character */
4323 continue;
4324 o1 = decode[i]>>11;
4325 o2 = (decode[i]>>7) & 0xF;
4326 i2 = 16*mlevel1[o1] + o2;
4327 if (mlevel2[i2] == 0xFF)
4328 mlevel2[i2] = count3++;
4329 o3 = decode[i] & 0x7F;
4330 i3 = 128*mlevel2[i2] + o3;
4331 mlevel3[i3] = i;
4332 }
4333 return result;
4334}
4335
4336static int
4337encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4338{
4339 struct encoding_map *map = (struct encoding_map*)mapping;
4340 int l1 = c>>11;
4341 int l2 = (c>>7) & 0xF;
4342 int l3 = c & 0x7F;
4343 int i;
4344
4345#ifdef Py_UNICODE_WIDE
4346 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004347 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004348 }
4349#endif
4350 if (c == 0)
4351 return 0;
4352 /* level 1*/
4353 i = map->level1[l1];
4354 if (i == 0xFF) {
4355 return -1;
4356 }
4357 /* level 2*/
4358 i = map->level23[16*i+l2];
4359 if (i == 0xFF) {
4360 return -1;
4361 }
4362 /* level 3 */
4363 i = map->level23[16*map->count2 + 128*i + l3];
4364 if (i == 0) {
4365 return -1;
4366 }
4367 return i;
4368}
4369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370/* Lookup the character ch in the mapping. If the character
4371 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004372 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 PyObject *w = PyInt_FromLong((long)c);
4376 PyObject *x;
4377
4378 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 x = PyObject_GetItem(mapping, w);
4381 Py_DECREF(w);
4382 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004383 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4384 /* No mapping found means: mapping is undefined. */
4385 PyErr_Clear();
4386 x = Py_None;
4387 Py_INCREF(x);
4388 return x;
4389 } else
4390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004392 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004393 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004395 long value = PyInt_AS_LONG(x);
4396 if (value < 0 || value > 255) {
4397 PyErr_SetString(PyExc_TypeError,
4398 "character mapping must be in range(256)");
4399 Py_DECREF(x);
4400 return NULL;
4401 }
4402 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004404 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004405 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004407 /* wrong return value */
4408 PyErr_SetString(PyExc_TypeError,
4409 "character mapping must return integer, None or str");
4410 Py_DECREF(x);
4411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 }
4413}
4414
Martin v. Löwis3f767792006-06-04 19:36:28 +00004415static int
4416charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4417{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004418 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4419 /* exponentially overallocate to minimize reallocations */
4420 if (requiredsize < 2*outsize)
4421 requiredsize = 2*outsize;
4422 if (_PyString_Resize(outobj, requiredsize)) {
4423 return 0;
4424 }
4425 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004426}
4427
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431/* lookup the character, put the result in the output string and adjust
4432 various state variables. Reallocate the output string if not enough
4433 space is available. Return a new reference to the object that
4434 was put in the output buffer, or Py_None, if the mapping was undefined
4435 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004436 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004438charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004439 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004441 PyObject *rep;
4442 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004443 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444
Christian Heimese93237d2007-12-19 02:37:44 +00004445 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004446 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004447 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004448 if (res == -1)
4449 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004450 if (outsize<requiredsize)
4451 if (!charmapencode_resize(outobj, outpos, requiredsize))
4452 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004453 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004454 outstart[(*outpos)++] = (char)res;
4455 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004456 }
4457
4458 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004460 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004461 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004462 Py_DECREF(rep);
4463 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004464 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004465 if (PyInt_Check(rep)) {
4466 Py_ssize_t requiredsize = *outpos+1;
4467 if (outsize<requiredsize)
4468 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4469 Py_DECREF(rep);
4470 return enc_EXCEPTION;
4471 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004472 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004473 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004474 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004475 else {
4476 const char *repchars = PyString_AS_STRING(rep);
4477 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4478 Py_ssize_t requiredsize = *outpos+repsize;
4479 if (outsize<requiredsize)
4480 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4481 Py_DECREF(rep);
4482 return enc_EXCEPTION;
4483 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004484 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004485 memcpy(outstart + *outpos, repchars, repsize);
4486 *outpos += repsize;
4487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 }
Georg Brandl9f167602006-06-04 21:46:16 +00004489 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004490 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491}
4492
4493/* handle an error in PyUnicode_EncodeCharmap
4494 Return 0 on success, -1 on error */
4495static
4496int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004497 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004499 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501{
4502 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t repsize;
4504 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 Py_UNICODE *uni2;
4506 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004507 Py_ssize_t collstartpos = *inpos;
4508 Py_ssize_t collendpos = *inpos+1;
4509 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 char *encoding = "charmap";
4511 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 /* find all unencodable characters */
4515 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004516 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004517 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 int res = encoding_map_lookup(p[collendpos], mapping);
4519 if (res != -1)
4520 break;
4521 ++collendpos;
4522 continue;
4523 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004524
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004525 rep = charmapencode_lookup(p[collendpos], mapping);
4526 if (rep==NULL)
4527 return -1;
4528 else if (rep!=Py_None) {
4529 Py_DECREF(rep);
4530 break;
4531 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004532 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 }
4535 /* cache callback name lookup
4536 * (if not done yet, i.e. it's the first error) */
4537 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004538 if ((errors==NULL) || (!strcmp(errors, "strict")))
4539 *known_errorHandler = 1;
4540 else if (!strcmp(errors, "replace"))
4541 *known_errorHandler = 2;
4542 else if (!strcmp(errors, "ignore"))
4543 *known_errorHandler = 3;
4544 else if (!strcmp(errors, "xmlcharrefreplace"))
4545 *known_errorHandler = 4;
4546 else
4547 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 }
4549 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004550 case 1: /* strict */
4551 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4552 return -1;
4553 case 2: /* replace */
4554 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004555 x = charmapencode_output('?', mapping, res, respos);
4556 if (x==enc_EXCEPTION) {
4557 return -1;
4558 }
4559 else if (x==enc_FAILED) {
4560 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4561 return -1;
4562 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004563 }
4564 /* fall through */
4565 case 3: /* ignore */
4566 *inpos = collendpos;
4567 break;
4568 case 4: /* xmlcharrefreplace */
4569 /* generate replacement (temporarily (mis)uses p) */
4570 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004571 char buffer[2+29+1+1];
4572 char *cp;
4573 sprintf(buffer, "&#%d;", (int)p[collpos]);
4574 for (cp = buffer; *cp; ++cp) {
4575 x = charmapencode_output(*cp, mapping, res, respos);
4576 if (x==enc_EXCEPTION)
4577 return -1;
4578 else if (x==enc_FAILED) {
4579 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4580 return -1;
4581 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004582 }
4583 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004584 *inpos = collendpos;
4585 break;
4586 default:
4587 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004588 encoding, reason, p, size, exceptionObject,
4589 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004590 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004591 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004592 /* generate replacement */
4593 repsize = PyUnicode_GET_SIZE(repunicode);
4594 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004595 x = charmapencode_output(*uni2, mapping, res, respos);
4596 if (x==enc_EXCEPTION) {
4597 return -1;
4598 }
4599 else if (x==enc_FAILED) {
4600 Py_DECREF(repunicode);
4601 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4602 return -1;
4603 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004604 }
4605 *inpos = newpos;
4606 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 }
4608 return 0;
4609}
4610
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004612 Py_ssize_t size,
4613 PyObject *mapping,
4614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 /* output object */
4617 PyObject *res = NULL;
4618 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004619 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004621 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 PyObject *errorHandler = NULL;
4623 PyObject *exc = NULL;
4624 /* the following variable is used for caching string comparisons
4625 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4626 * 3=ignore, 4=xmlcharrefreplace */
4627 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
4629 /* Default to Latin-1 */
4630 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004631 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 /* allocate enough for a simple encoding without
4634 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004635 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 if (res == NULL)
4637 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004638 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004639 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004642 /* try to encode it */
4643 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4644 if (x==enc_EXCEPTION) /* error */
4645 goto onError;
4646 if (x==enc_FAILED) { /* unencodable character */
4647 if (charmap_encoding_error(p, size, &inpos, mapping,
4648 &exc,
4649 &known_errorHandler, &errorHandler, errors,
4650 &res, &respos)) {
4651 goto onError;
4652 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004653 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004654 else
4655 /* done with this character => adjust input position */
4656 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004660 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004661 if (_PyString_Resize(&res, respos))
4662 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 }
4664 Py_XDECREF(exc);
4665 Py_XDECREF(errorHandler);
4666 return res;
4667
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004668 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 Py_XDECREF(res);
4670 Py_XDECREF(exc);
4671 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 return NULL;
4673}
4674
4675PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004676 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677{
4678 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004679 PyErr_BadArgument();
4680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 }
4682 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004683 PyUnicode_GET_SIZE(unicode),
4684 mapping,
4685 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686}
4687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688/* create or adjust a UnicodeTranslateError */
4689static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004690 const Py_UNICODE *unicode, Py_ssize_t size,
4691 Py_ssize_t startpos, Py_ssize_t endpos,
4692 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004695 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004696 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 }
4698 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004699 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4700 goto onError;
4701 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4702 goto onError;
4703 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4704 goto onError;
4705 return;
4706 onError:
4707 Py_DECREF(*exceptionObject);
4708 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 }
4710}
4711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712/* raises a UnicodeTranslateError */
4713static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004714 const Py_UNICODE *unicode, Py_ssize_t size,
4715 Py_ssize_t startpos, Py_ssize_t endpos,
4716 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717{
4718 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004719 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004721 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722}
4723
4724/* error handling callback helper:
4725 build arguments, call the callback and check the arguments,
4726 put the result into newpos and return the replacement string, which
4727 has to be freed by the caller */
4728static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004729 PyObject **errorHandler,
4730 const char *reason,
4731 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4732 Py_ssize_t startpos, Py_ssize_t endpos,
4733 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004735 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736
Martin v. Löwis412fb672006-04-13 06:34:32 +00004737 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 PyObject *restuple;
4739 PyObject *resunicode;
4740
4741 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004742 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 }
4746
4747 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004748 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751
4752 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004753 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004755 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004757 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004758 Py_DECREF(restuple);
4759 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 }
4761 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004762 &resunicode, &i_newpos)) {
4763 Py_DECREF(restuple);
4764 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004767 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004768 else
4769 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004770 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004771 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4772 Py_DECREF(restuple);
4773 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004774 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 Py_INCREF(resunicode);
4776 Py_DECREF(restuple);
4777 return resunicode;
4778}
4779
4780/* Lookup the character ch in the mapping and put the result in result,
4781 which must be decrefed by the caller.
4782 Return 0 on success, -1 on error */
4783static
4784int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4785{
4786 PyObject *w = PyInt_FromLong((long)c);
4787 PyObject *x;
4788
4789 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004790 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 x = PyObject_GetItem(mapping, w);
4792 Py_DECREF(w);
4793 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004794 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4795 /* No mapping found means: use 1:1 mapping. */
4796 PyErr_Clear();
4797 *result = NULL;
4798 return 0;
4799 } else
4800 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 }
4802 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004803 *result = x;
4804 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 }
4806 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004807 long value = PyInt_AS_LONG(x);
4808 long max = PyUnicode_GetMax();
4809 if (value < 0 || value > max) {
4810 PyErr_Format(PyExc_TypeError,
4811 "character mapping must be in range(0x%lx)", max+1);
4812 Py_DECREF(x);
4813 return -1;
4814 }
4815 *result = x;
4816 return 0;
4817 }
4818 else if (PyUnicode_Check(x)) {
4819 *result = x;
4820 return 0;
4821 }
4822 else {
4823 /* wrong return value */
4824 PyErr_SetString(PyExc_TypeError,
4825 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004826 Py_DECREF(x);
4827 return -1;
4828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829}
4830/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 if not reallocate and adjust various state variables.
4832 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833static
Walter Dörwald4894c302003-10-24 14:25:28 +00004834int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004835 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004837 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004838 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004839 /* remember old output position */
4840 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4841 /* exponentially overallocate to minimize reallocations */
4842 if (requiredsize < 2 * oldsize)
4843 requiredsize = 2 * oldsize;
4844 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4845 return -1;
4846 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 }
4848 return 0;
4849}
4850/* lookup the character, put the result in the output string and adjust
4851 various state variables. Return a new reference to the object that
4852 was put in the output buffer in *result, or Py_None, if the mapping was
4853 undefined (in which case no character was written).
4854 The called must decref result.
4855 Return 0 on success, -1 on error. */
4856static
Walter Dörwald4894c302003-10-24 14:25:28 +00004857int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004858 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4859 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860{
Walter Dörwald4894c302003-10-24 14:25:28 +00004861 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004862 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004864 /* not found => default to 1:1 mapping */
4865 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 }
4867 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004868 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 /* no overflow check, because we know that the space is enough */
4871 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 }
4873 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004874 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4875 if (repsize==1) {
4876 /* no overflow check, because we know that the space is enough */
4877 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4878 }
4879 else if (repsize!=0) {
4880 /* more than one character */
4881 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4882 (insize - (curinp-startinp)) +
4883 repsize - 1;
4884 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4885 return -1;
4886 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4887 *outp += repsize;
4888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 }
4890 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 return 0;
4893}
4894
4895PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004896 Py_ssize_t size,
4897 PyObject *mapping,
4898 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 /* output object */
4901 PyObject *res = NULL;
4902 /* pointers to the beginning and end+1 of input */
4903 const Py_UNICODE *startp = p;
4904 const Py_UNICODE *endp = p + size;
4905 /* pointer into the output */
4906 Py_UNICODE *str;
4907 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004908 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 char *reason = "character maps to <undefined>";
4910 PyObject *errorHandler = NULL;
4911 PyObject *exc = NULL;
4912 /* the following variable is used for caching string comparisons
4913 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4914 * 3=ignore, 4=xmlcharrefreplace */
4915 int known_errorHandler = -1;
4916
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 PyErr_BadArgument();
4919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921
4922 /* allocate enough for a simple 1:1 translation without
4923 replacements, if we need more, we'll resize */
4924 res = PyUnicode_FromUnicode(NULL, size);
4925 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004926 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004928 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004932 /* try to encode it */
4933 PyObject *x = NULL;
4934 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4935 Py_XDECREF(x);
4936 goto onError;
4937 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004938 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004939 if (x!=Py_None) /* it worked => adjust input pointer */
4940 ++p;
4941 else { /* untranslatable character */
4942 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4943 Py_ssize_t repsize;
4944 Py_ssize_t newpos;
4945 Py_UNICODE *uni2;
4946 /* startpos for collecting untranslatable chars */
4947 const Py_UNICODE *collstart = p;
4948 const Py_UNICODE *collend = p+1;
4949 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004951 /* find all untranslatable characters */
4952 while (collend < endp) {
4953 if (charmaptranslate_lookup(*collend, mapping, &x))
4954 goto onError;
4955 Py_XDECREF(x);
4956 if (x!=Py_None)
4957 break;
4958 ++collend;
4959 }
4960 /* cache callback name lookup
4961 * (if not done yet, i.e. it's the first error) */
4962 if (known_errorHandler==-1) {
4963 if ((errors==NULL) || (!strcmp(errors, "strict")))
4964 known_errorHandler = 1;
4965 else if (!strcmp(errors, "replace"))
4966 known_errorHandler = 2;
4967 else if (!strcmp(errors, "ignore"))
4968 known_errorHandler = 3;
4969 else if (!strcmp(errors, "xmlcharrefreplace"))
4970 known_errorHandler = 4;
4971 else
4972 known_errorHandler = 0;
4973 }
4974 switch (known_errorHandler) {
4975 case 1: /* strict */
4976 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004977 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004978 case 2: /* replace */
4979 /* No need to check for space, this is a 1:1 replacement */
4980 for (coll = collstart; coll<collend; ++coll)
4981 *str++ = '?';
4982 /* fall through */
4983 case 3: /* ignore */
4984 p = collend;
4985 break;
4986 case 4: /* xmlcharrefreplace */
4987 /* generate replacement (temporarily (mis)uses p) */
4988 for (p = collstart; p < collend; ++p) {
4989 char buffer[2+29+1+1];
4990 char *cp;
4991 sprintf(buffer, "&#%d;", (int)*p);
4992 if (charmaptranslate_makespace(&res, &str,
4993 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4994 goto onError;
4995 for (cp = buffer; *cp; ++cp)
4996 *str++ = *cp;
4997 }
4998 p = collend;
4999 break;
5000 default:
5001 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5002 reason, startp, size, &exc,
5003 collstart-startp, collend-startp, &newpos);
5004 if (repunicode == NULL)
5005 goto onError;
5006 /* generate replacement */
5007 repsize = PyUnicode_GET_SIZE(repunicode);
5008 if (charmaptranslate_makespace(&res, &str,
5009 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5010 Py_DECREF(repunicode);
5011 goto onError;
5012 }
5013 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5014 *str++ = *uni2;
5015 p = startp + newpos;
5016 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005017 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005018 }
5019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 /* Resize if we allocated to much */
5021 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005022 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005023 if (PyUnicode_Resize(&res, respos) < 0)
5024 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 }
5026 Py_XDECREF(exc);
5027 Py_XDECREF(errorHandler);
5028 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005030 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 Py_XDECREF(res);
5032 Py_XDECREF(exc);
5033 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 return NULL;
5035}
5036
5037PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005038 PyObject *mapping,
5039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040{
5041 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005042
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 str = PyUnicode_FromObject(str);
5044 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005045 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005047 PyUnicode_GET_SIZE(str),
5048 mapping,
5049 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 Py_DECREF(str);
5051 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005052
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005053 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 Py_XDECREF(str);
5055 return NULL;
5056}
Tim Petersced69f82003-09-16 20:30:58 +00005057
Guido van Rossum9e896b32000-04-05 20:11:21 +00005058/* --- Decimal Encoder ---------------------------------------------------- */
5059
5060int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005061 Py_ssize_t length,
5062 char *output,
5063 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005064{
5065 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005066 PyObject *errorHandler = NULL;
5067 PyObject *exc = NULL;
5068 const char *encoding = "decimal";
5069 const char *reason = "invalid decimal Unicode string";
5070 /* the following variable is used for caching string comparisons
5071 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5072 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005073
5074 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005075 PyErr_BadArgument();
5076 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005077 }
5078
5079 p = s;
5080 end = s + length;
5081 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005082 register Py_UNICODE ch = *p;
5083 int decimal;
5084 PyObject *repunicode;
5085 Py_ssize_t repsize;
5086 Py_ssize_t newpos;
5087 Py_UNICODE *uni2;
5088 Py_UNICODE *collstart;
5089 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005090
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005091 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005092 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005093 ++p;
5094 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005095 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005096 decimal = Py_UNICODE_TODECIMAL(ch);
5097 if (decimal >= 0) {
5098 *output++ = '0' + decimal;
5099 ++p;
5100 continue;
5101 }
5102 if (0 < ch && ch < 256) {
5103 *output++ = (char)ch;
5104 ++p;
5105 continue;
5106 }
5107 /* All other characters are considered unencodable */
5108 collstart = p;
5109 collend = p+1;
5110 while (collend < end) {
5111 if ((0 < *collend && *collend < 256) ||
5112 !Py_UNICODE_ISSPACE(*collend) ||
5113 Py_UNICODE_TODECIMAL(*collend))
5114 break;
5115 }
5116 /* cache callback name lookup
5117 * (if not done yet, i.e. it's the first error) */
5118 if (known_errorHandler==-1) {
5119 if ((errors==NULL) || (!strcmp(errors, "strict")))
5120 known_errorHandler = 1;
5121 else if (!strcmp(errors, "replace"))
5122 known_errorHandler = 2;
5123 else if (!strcmp(errors, "ignore"))
5124 known_errorHandler = 3;
5125 else if (!strcmp(errors, "xmlcharrefreplace"))
5126 known_errorHandler = 4;
5127 else
5128 known_errorHandler = 0;
5129 }
5130 switch (known_errorHandler) {
5131 case 1: /* strict */
5132 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5133 goto onError;
5134 case 2: /* replace */
5135 for (p = collstart; p < collend; ++p)
5136 *output++ = '?';
5137 /* fall through */
5138 case 3: /* ignore */
5139 p = collend;
5140 break;
5141 case 4: /* xmlcharrefreplace */
5142 /* generate replacement (temporarily (mis)uses p) */
5143 for (p = collstart; p < collend; ++p)
5144 output += sprintf(output, "&#%d;", (int)*p);
5145 p = collend;
5146 break;
5147 default:
5148 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5149 encoding, reason, s, length, &exc,
5150 collstart-s, collend-s, &newpos);
5151 if (repunicode == NULL)
5152 goto onError;
5153 /* generate replacement */
5154 repsize = PyUnicode_GET_SIZE(repunicode);
5155 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5156 Py_UNICODE ch = *uni2;
5157 if (Py_UNICODE_ISSPACE(ch))
5158 *output++ = ' ';
5159 else {
5160 decimal = Py_UNICODE_TODECIMAL(ch);
5161 if (decimal >= 0)
5162 *output++ = '0' + decimal;
5163 else if (0 < ch && ch < 256)
5164 *output++ = (char)ch;
5165 else {
5166 Py_DECREF(repunicode);
5167 raise_encode_exception(&exc, encoding,
5168 s, length, collstart-s, collend-s, reason);
5169 goto onError;
5170 }
5171 }
5172 }
5173 p = s + newpos;
5174 Py_DECREF(repunicode);
5175 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005176 }
5177 /* 0-terminate the output string */
5178 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 Py_XDECREF(exc);
5180 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005181 return 0;
5182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005183 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 Py_XDECREF(exc);
5185 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005186 return -1;
5187}
5188
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189/* --- Helpers ------------------------------------------------------------ */
5190
Eric Smitha9f7d622008-02-17 19:46:49 +00005191#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005192
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005193#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005194
Fredrik Lundha50d2012006-05-26 17:04:58 +00005195#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005196
5197#include "stringlib/count.h"
5198#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005199#include "stringlib/partition.h"
5200
/* Helper macro to fix up start/end slice values.
 *
 * Works on the caller's local variables `start` and `end`, using
 * (obj)->length as the sequence length:
 *   - a negative start has the length added once, then is clamped to 0;
 *   - an end larger than the length is clamped to the length;
 *   - a negative end has the length added once, then is clamped to 0.
 *
 * The body is wrapped in do { ... } while (0) so the expansion is one
 * statement and stays correct under an unbraced if/else.  Invoke it
 * with a trailing semicolon: FIX_START_END(obj);
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5213
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005215 PyObject *substr,
5216 Py_ssize_t start,
5217 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005219 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005220 PyUnicodeObject* str_obj;
5221 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005222
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005223 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5224 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005225 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005226 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5227 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005228 Py_DECREF(str_obj);
5229 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 }
Tim Petersced69f82003-09-16 20:30:58 +00005231
Fredrik Lundhc8162812006-05-26 19:33:03 +00005232 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005233
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005234 result = stringlib_count(
5235 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5236 );
5237
5238 Py_DECREF(sub_obj);
5239 Py_DECREF(str_obj);
5240
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 return result;
5242}
5243
Martin v. Löwis18e16552006-02-15 17:27:45 +00005244Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005245 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005246 Py_ssize_t start,
5247 Py_ssize_t end,
5248 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005251
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005252 str = PyUnicode_FromObject(str);
5253 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005254 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005255 sub = PyUnicode_FromObject(sub);
5256 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005257 Py_DECREF(str);
5258 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 }
Tim Petersced69f82003-09-16 20:30:58 +00005260
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005261 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005262 result = stringlib_find_slice(
5263 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5264 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5265 start, end
5266 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005267 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005268 result = stringlib_rfind_slice(
5269 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5270 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5271 start, end
5272 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005273
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005274 Py_DECREF(str);
5275 Py_DECREF(sub);
5276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 return result;
5278}
5279
Tim Petersced69f82003-09-16 20:30:58 +00005280static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005282 PyUnicodeObject *substring,
5283 Py_ssize_t start,
5284 Py_ssize_t end,
5285 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 if (substring->length == 0)
5288 return 1;
5289
Fredrik Lundhc8162812006-05-26 19:33:03 +00005290 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291
5292 end -= substring->length;
5293 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005294 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
5296 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005297 if (Py_UNICODE_MATCH(self, end, substring))
5298 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 } else {
5300 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005301 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 }
5303
5304 return 0;
5305}
5306
Martin v. Löwis18e16552006-02-15 17:27:45 +00005307Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005308 PyObject *substr,
5309 Py_ssize_t start,
5310 Py_ssize_t end,
5311 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005313 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005314
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 str = PyUnicode_FromObject(str);
5316 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005317 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 substr = PyUnicode_FromObject(substr);
5319 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005320 Py_DECREF(str);
5321 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 }
Tim Petersced69f82003-09-16 20:30:58 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005325 (PyUnicodeObject *)substr,
5326 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 Py_DECREF(str);
5328 Py_DECREF(substr);
5329 return result;
5330}
5331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332/* Apply fixfct filter to the Unicode object self and return a
5333 reference to the modified object */
5334
Tim Petersced69f82003-09-16 20:30:58 +00005335static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005337 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338{
5339
5340 PyUnicodeObject *u;
5341
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005342 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005344 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005345
5346 Py_UNICODE_COPY(u->str, self->str, self->length);
5347
Tim Peters7a29bd52001-09-12 03:03:31 +00005348 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005349 /* fixfct should return TRUE if it modified the buffer. If
5350 FALSE, return a reference to the original buffer instead
5351 (to save space, not time) */
5352 Py_INCREF(self);
5353 Py_DECREF(u);
5354 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 }
5356 return (PyObject*) u;
5357}
5358
Tim Petersced69f82003-09-16 20:30:58 +00005359static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360int fixupper(PyUnicodeObject *self)
5361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005362 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 Py_UNICODE *s = self->str;
5364 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005367 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005368
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005369 ch = Py_UNICODE_TOUPPER(*s);
5370 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005372 *s = ch;
5373 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 s++;
5375 }
5376
5377 return status;
5378}
5379
Tim Petersced69f82003-09-16 20:30:58 +00005380static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381int fixlower(PyUnicodeObject *self)
5382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 Py_UNICODE *s = self->str;
5385 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005388 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005389
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005390 ch = Py_UNICODE_TOLOWER(*s);
5391 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005393 *s = ch;
5394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 s++;
5396 }
5397
5398 return status;
5399}
5400
Tim Petersced69f82003-09-16 20:30:58 +00005401static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402int fixswapcase(PyUnicodeObject *self)
5403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 Py_UNICODE *s = self->str;
5406 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 while (len-- > 0) {
5409 if (Py_UNICODE_ISUPPER(*s)) {
5410 *s = Py_UNICODE_TOLOWER(*s);
5411 status = 1;
5412 } else if (Py_UNICODE_ISLOWER(*s)) {
5413 *s = Py_UNICODE_TOUPPER(*s);
5414 status = 1;
5415 }
5416 s++;
5417 }
5418
5419 return status;
5420}
5421
Tim Petersced69f82003-09-16 20:30:58 +00005422static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423int fixcapitalize(PyUnicodeObject *self)
5424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005426 Py_UNICODE *s = self->str;
5427 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005428
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005429 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005430 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005431 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005432 *s = Py_UNICODE_TOUPPER(*s);
5433 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005435 s++;
5436 while (--len > 0) {
5437 if (Py_UNICODE_ISUPPER(*s)) {
5438 *s = Py_UNICODE_TOLOWER(*s);
5439 status = 1;
5440 }
5441 s++;
5442 }
5443 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444}
5445
5446static
5447int fixtitle(PyUnicodeObject *self)
5448{
5449 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5450 register Py_UNICODE *e;
5451 int previous_is_cased;
5452
5453 /* Shortcut for single character strings */
5454 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005455 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5456 if (*p != ch) {
5457 *p = ch;
5458 return 1;
5459 }
5460 else
5461 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
Tim Petersced69f82003-09-16 20:30:58 +00005463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 e = p + PyUnicode_GET_SIZE(self);
5465 previous_is_cased = 0;
5466 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005467 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005468
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005469 if (previous_is_cased)
5470 *p = Py_UNICODE_TOLOWER(ch);
5471 else
5472 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005473
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005474 if (Py_UNICODE_ISLOWER(ch) ||
5475 Py_UNICODE_ISUPPER(ch) ||
5476 Py_UNICODE_ISTITLE(ch))
5477 previous_is_cased = 1;
5478 else
5479 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 }
5481 return 1;
5482}
5483
Tim Peters8ce9f162004-08-27 01:49:32 +00005484PyObject *
5485PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486{
Tim Peters8ce9f162004-08-27 01:49:32 +00005487 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005488 const Py_UNICODE blank = ' ';
5489 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005490 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005491 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005492 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5493 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5495 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005496 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005497 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005498 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
Tim Peters05eba1f2004-08-27 21:32:02 +00005500 fseq = PySequence_Fast(seq, "");
5501 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005502 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005503 }
5504
Tim Peters91879ab2004-08-27 22:35:44 +00005505 /* Grrrr. A codec may be invoked to convert str objects to
5506 * Unicode, and so it's possible to call back into Python code
5507 * during PyUnicode_FromObject(), and so it's possible for a sick
5508 * codec to change the size of fseq (if seq is a list). Therefore
5509 * we have to keep refetching the size -- can't assume seqlen
5510 * is invariant.
5511 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005512 seqlen = PySequence_Fast_GET_SIZE(fseq);
5513 /* If empty sequence, return u"". */
5514 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005515 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5516 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005517 }
5518 /* If singleton sequence with an exact Unicode, return that. */
5519 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005520 item = PySequence_Fast_GET_ITEM(fseq, 0);
5521 if (PyUnicode_CheckExact(item)) {
5522 Py_INCREF(item);
5523 res = (PyUnicodeObject *)item;
5524 goto Done;
5525 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005526 }
5527
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 /* At least two items to join, or one that isn't exact Unicode. */
5529 if (seqlen > 1) {
5530 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005531 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005532 sep = &blank;
5533 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005534 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005535 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005536 internal_separator = PyUnicode_FromObject(separator);
5537 if (internal_separator == NULL)
5538 goto onError;
5539 sep = PyUnicode_AS_UNICODE(internal_separator);
5540 seplen = PyUnicode_GET_SIZE(internal_separator);
5541 /* In case PyUnicode_FromObject() mutated seq. */
5542 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005543 }
5544 }
5545
5546 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005547 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005548 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 res_p = PyUnicode_AS_UNICODE(res);
5551 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005552
Tim Peters05eba1f2004-08-27 21:32:02 +00005553 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005554 Py_ssize_t itemlen;
5555 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005556
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005557 item = PySequence_Fast_GET_ITEM(fseq, i);
5558 /* Convert item to Unicode. */
5559 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5560 PyErr_Format(PyExc_TypeError,
5561 "sequence item %zd: expected string or Unicode,"
5562 " %.80s found",
5563 i, Py_TYPE(item)->tp_name);
5564 goto onError;
5565 }
5566 item = PyUnicode_FromObject(item);
5567 if (item == NULL)
5568 goto onError;
5569 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005570
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005571 /* In case PyUnicode_FromObject() mutated seq. */
5572 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005573
Tim Peters8ce9f162004-08-27 01:49:32 +00005574 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005575 itemlen = PyUnicode_GET_SIZE(item);
5576 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005577 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005578 goto Overflow;
5579 if (i < seqlen - 1) {
5580 new_res_used += seplen;
5581 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005582 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005583 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005584 if (new_res_used > res_alloc) {
5585 /* double allocated size until it's big enough */
5586 do {
5587 res_alloc += res_alloc;
5588 if (res_alloc <= 0)
5589 goto Overflow;
5590 } while (new_res_used > res_alloc);
5591 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5592 Py_DECREF(item);
5593 goto onError;
5594 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005595 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005596 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005597
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005598 /* Copy item, and maybe the separator. */
5599 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5600 res_p += itemlen;
5601 if (i < seqlen - 1) {
5602 Py_UNICODE_COPY(res_p, sep, seplen);
5603 res_p += seplen;
5604 }
5605 Py_DECREF(item);
5606 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005607 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005608
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 /* Shrink res to match the used area; this probably can't fail,
5610 * but it's cheap to check.
5611 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005612 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005614
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005615 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005616 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005617 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 return (PyObject *)res;
5619
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005620 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005621 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005622 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005623 Py_DECREF(item);
5624 /* fall through */
5625
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005626 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005627 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005628 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005629 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 return NULL;
5631}
5632
Tim Petersced69f82003-09-16 20:30:58 +00005633static
5634PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005635 Py_ssize_t left,
5636 Py_ssize_t right,
5637 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
5639 PyUnicodeObject *u;
5640
5641 if (left < 0)
5642 left = 0;
5643 if (right < 0)
5644 right = 0;
5645
Tim Peters7a29bd52001-09-12 03:03:31 +00005646 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 Py_INCREF(self);
5648 return self;
5649 }
5650
Neal Norwitze7d8be82008-07-31 17:17:14 +00005651 if (left > PY_SSIZE_T_MAX - self->length ||
5652 right > PY_SSIZE_T_MAX - (left + self->length)) {
5653 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5654 return NULL;
5655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 u = _PyUnicode_New(left + self->length + right);
5657 if (u) {
5658 if (left)
5659 Py_UNICODE_FILL(u->str, fill, left);
5660 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5661 if (right)
5662 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5663 }
5664
5665 return u;
5666}
5667
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * Relies on the surrounding function providing a `PyObject *str`
 * scratch variable, a `list` object and an `onError` label, which is
 * jumped to if creating the slice or appending it fails.  The temporary
 * reference held in `str` is released on both outcomes.
 *
 * Wrapped in do { ... } while (0) so the expansion is one statement and
 * cannot capture a following `else`.  Invoke it with a trailing
 * semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_failed_)                                       \
            goto onError;                                               \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678
5679static
5680PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005681 PyObject *list,
5682 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684 register Py_ssize_t i;
5685 register Py_ssize_t j;
5686 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005688 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005691 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005692 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005693 i++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005694 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005695 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5696 i++;
5697 if (j < i) {
5698 if (maxcount-- <= 0)
5699 break;
5700 SPLIT_APPEND(buf, j, i);
5701 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5702 i++;
5703 j = i;
5704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 }
5706 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005707 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 }
5709 return list;
5710
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005711 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 Py_DECREF(list);
5713 return NULL;
5714}
5715
5716PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005717 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005719 register Py_ssize_t i;
5720 register Py_ssize_t j;
5721 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 PyObject *list;
5723 PyObject *str;
5724 Py_UNICODE *data;
5725
5726 string = PyUnicode_FromObject(string);
5727 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 data = PyUnicode_AS_UNICODE(string);
5730 len = PyUnicode_GET_SIZE(string);
5731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 list = PyList_New(0);
5733 if (!list)
5734 goto onError;
5735
5736 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005737 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005738
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005739 /* Find a line and append it */
5740 while (i < len && !BLOOM_LINEBREAK(data[i]))
5741 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005743 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005744 eol = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005745 if (i < len) {
5746 if (data[i] == '\r' && i + 1 < len &&
5747 data[i+1] == '\n')
5748 i += 2;
5749 else
5750 i++;
5751 if (keepends)
5752 eol = i;
5753 }
5754 SPLIT_APPEND(data, j, eol);
5755 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 }
5757 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005758 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 }
5760
5761 Py_DECREF(string);
5762 return list;
5763
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005764 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005765 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 Py_DECREF(string);
5767 return NULL;
5768}
5769
Tim Petersced69f82003-09-16 20:30:58 +00005770static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771PyObject *split_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005772 PyObject *list,
5773 Py_UNICODE ch,
5774 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005776 register Py_ssize_t i;
5777 register Py_ssize_t j;
5778 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005780 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
5782 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005783 if (buf[i] == ch) {
5784 if (maxcount-- <= 0)
5785 break;
5786 SPLIT_APPEND(buf, j, i);
5787 i = j = i + 1;
5788 } else
5789 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 }
5791 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005792 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 }
5794 return list;
5795
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 Py_DECREF(list);
5798 return NULL;
5799}
5800
Tim Petersced69f82003-09-16 20:30:58 +00005801static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802PyObject *split_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005803 PyObject *list,
5804 PyUnicodeObject *substring,
5805 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005807 register Py_ssize_t i;
5808 register Py_ssize_t j;
5809 Py_ssize_t len = self->length;
5810 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 PyObject *str;
5812
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005813 for (i = j = 0; i <= len - sublen; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005814 if (Py_UNICODE_MATCH(self, i, substring)) {
5815 if (maxcount-- <= 0)
5816 break;
5817 SPLIT_APPEND(self->str, j, i);
5818 i = j = i + sublen;
5819 } else
5820 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 }
5822 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005823 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 }
5825 return list;
5826
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005827 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 Py_DECREF(list);
5829 return NULL;
5830}
5831
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832static
5833PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005834 PyObject *list,
5835 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 register Py_ssize_t i;
5838 register Py_ssize_t j;
5839 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005841 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005842
5843 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005844 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005845 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005846 i--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005847 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005848 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5849 i--;
5850 if (j > i) {
5851 if (maxcount-- <= 0)
5852 break;
5853 SPLIT_APPEND(buf, i + 1, j + 1);
5854 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5855 i--;
5856 j = i;
5857 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858 }
5859 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005860 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005861 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005862 if (PyList_Reverse(list) < 0)
5863 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864 return list;
5865
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005866 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867 Py_DECREF(list);
5868 return NULL;
5869}
5870
Benjamin Peterson857ce152009-01-31 16:29:18 +00005871static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005873 PyObject *list,
5874 Py_UNICODE ch,
5875 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005877 register Py_ssize_t i;
5878 register Py_ssize_t j;
5879 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005880 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005881 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005882
5883 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005884 if (buf[i] == ch) {
5885 if (maxcount-- <= 0)
5886 break;
5887 SPLIT_APPEND(buf, i + 1, j + 1);
5888 j = i = i - 1;
5889 } else
5890 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005891 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005892 if (j >= -1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005893 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005895 if (PyList_Reverse(list) < 0)
5896 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005897 return list;
5898
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005899 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005900 Py_DECREF(list);
5901 return NULL;
5902}
5903
Benjamin Peterson857ce152009-01-31 16:29:18 +00005904static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005905PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005906 PyObject *list,
5907 PyUnicodeObject *substring,
5908 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005909{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005910 register Py_ssize_t i;
5911 register Py_ssize_t j;
5912 Py_ssize_t len = self->length;
5913 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005914 PyObject *str;
5915
5916 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005917 if (Py_UNICODE_MATCH(self, i, substring)) {
5918 if (maxcount-- <= 0)
5919 break;
5920 SPLIT_APPEND(self->str, i + sublen, j);
5921 j = i;
5922 i -= sublen;
5923 } else
5924 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005925 }
5926 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005927 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005929 if (PyList_Reverse(list) < 0)
5930 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005931 return list;
5932
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005933 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005934 Py_DECREF(list);
5935 return NULL;
5936}
5937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938#undef SPLIT_APPEND
5939
5940static
5941PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005942 PyUnicodeObject *substring,
5943 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944{
5945 PyObject *list;
5946
5947 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005948 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
5950 list = PyList_New(0);
5951 if (!list)
5952 return NULL;
5953
5954 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005955 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956
5957 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005958 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
5960 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005961 Py_DECREF(list);
5962 PyErr_SetString(PyExc_ValueError, "empty separator");
5963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 }
5965 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005966 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967}
5968
Tim Petersced69f82003-09-16 20:30:58 +00005969static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005970PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005971 PyUnicodeObject *substring,
5972 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005973{
5974 PyObject *list;
5975
5976 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005977 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005978
5979 list = PyList_New(0);
5980 if (!list)
5981 return NULL;
5982
5983 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005984 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005985
5986 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005987 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005988
5989 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005990 Py_DECREF(list);
5991 PyErr_SetString(PyExc_ValueError, "empty separator");
5992 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005993 }
5994 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005995 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005996}
5997
5998static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006000 PyUnicodeObject *str1,
6001 PyUnicodeObject *str2,
6002 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003{
6004 PyUnicodeObject *u;
6005
6006 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006007 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Fredrik Lundh347ee272006-05-24 16:35:18 +00006009 if (str1->length == str2->length) {
6010 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00006011 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006012 if (str1->length == 1) {
6013 /* replace characters */
6014 Py_UNICODE u1, u2;
6015 if (!findchar(self->str, self->length, str1->str[0]))
6016 goto nothing;
6017 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6018 if (!u)
6019 return NULL;
6020 Py_UNICODE_COPY(u->str, self->str, self->length);
6021 u1 = str1->str[0];
6022 u2 = str2->str[0];
6023 for (i = 0; i < u->length; i++)
6024 if (u->str[i] == u1) {
6025 if (--maxcount < 0)
6026 break;
6027 u->str[i] = u2;
6028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006030 i = fastsearch(
6031 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00006033 if (i < 0)
6034 goto nothing;
6035 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6036 if (!u)
6037 return NULL;
6038 Py_UNICODE_COPY(u->str, self->str, self->length);
6039 while (i <= self->length - str1->length)
6040 if (Py_UNICODE_MATCH(self, i, str1)) {
6041 if (--maxcount < 0)
6042 break;
6043 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6044 i += str1->length;
6045 } else
6046 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006049
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006050 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006051 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 Py_UNICODE *p;
6053
6054 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006055 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 if (n > maxcount)
6057 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006058 if (n == 0)
6059 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006060 /* new_size = self->length + n * (str2->length - str1->length)); */
6061 delta = (str2->length - str1->length);
6062 if (delta == 0) {
6063 new_size = self->length;
6064 } else {
6065 product = n * (str2->length - str1->length);
6066 if ((product / (str2->length - str1->length)) != n) {
6067 PyErr_SetString(PyExc_OverflowError,
6068 "replace string is too long");
6069 return NULL;
6070 }
6071 new_size = self->length + product;
6072 if (new_size < 0) {
6073 PyErr_SetString(PyExc_OverflowError,
6074 "replace string is too long");
6075 return NULL;
6076 }
6077 }
6078 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006079 if (!u)
6080 return NULL;
6081 i = 0;
6082 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006083 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006084 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006085 while (n-- > 0) {
6086 /* look for next match */
6087 j = i;
6088 while (j <= e) {
6089 if (Py_UNICODE_MATCH(self, j, str1))
6090 break;
6091 j++;
6092 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006093 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006094 if (j > e)
6095 break;
6096 /* copy unchanged part [i:j] */
6097 Py_UNICODE_COPY(p, self->str+i, j-i);
6098 p += j - i;
6099 }
6100 /* copy substitution string */
6101 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006102 Py_UNICODE_COPY(p, str2->str, str2->length);
6103 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006104 }
6105 i = j + str1->length;
6106 }
6107 if (i < self->length)
6108 /* copy tail [i:] */
6109 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006110 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006111 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006112 while (n > 0) {
6113 Py_UNICODE_COPY(p, str2->str, str2->length);
6114 p += str2->length;
6115 if (--n <= 0)
6116 break;
6117 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006119 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
6121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006123
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006124 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006125 /* nothing to replace; return original string (when possible) */
6126 if (PyUnicode_CheckExact(self)) {
6127 Py_INCREF(self);
6128 return (PyObject *) self;
6129 }
6130 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131}
6132
6133/* --- Unicode Object Methods --------------------------------------------- */
6134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006135PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006136 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137\n\
6138Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006139characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
6141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006142unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 return fixup(self, fixtitle);
6145}
6146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006147PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006148 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149\n\
6150Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
6153static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006154unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 return fixup(self, fixcapitalize);
6157}
6158
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Currently compiled out.  Splits self on whitespace, capitalizes every
   word in place inside the list and joins the words back together. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the old word for its capitalized replacement */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6196
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006197/* Argument converter. Coerces to a single unicode character */
6198
6199static int
6200convert_uc(PyObject *obj, void *addr)
6201{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006202 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6203 PyObject *uniobj;
6204 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006205
Benjamin Peterson857ce152009-01-31 16:29:18 +00006206 uniobj = PyUnicode_FromObject(obj);
6207 if (uniobj == NULL) {
6208 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006209 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006210 return 0;
6211 }
6212 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6213 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006214 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006215 Py_DECREF(uniobj);
6216 return 0;
6217 }
6218 unistr = PyUnicode_AS_UNICODE(uniobj);
6219 *fillcharloc = unistr[0];
6220 Py_DECREF(uniobj);
6221 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006222}
6223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006224PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006225 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006227Return S centered in a Unicode string of length width. Padding is\n\
6228done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
6230static PyObject *
6231unicode_center(PyUnicodeObject *self, PyObject *args)
6232{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006233 Py_ssize_t marg, left;
6234 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006235 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
Thomas Woutersde017742006-02-16 19:34:37 +00006237 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 return NULL;
6239
Tim Peters7a29bd52001-09-12 03:03:31 +00006240 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 Py_INCREF(self);
6242 return (PyObject*) self;
6243 }
6244
6245 marg = width - self->length;
6246 left = marg / 2 + (marg & width & 1);
6247
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006248 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249}
6250
Marc-André Lemburge5034372000-08-08 08:04:29 +00006251#if 0
6252
6253/* This code should go into some future Unicode collation support
6254 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006255 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006256
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006257/* speedy UTF-16 code point order comparison */
6258/* gleaned from: */
6259/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6260
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006261static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006262{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006263 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006264 0, 0, 0, 0, 0, 0, 0, 0,
6265 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006266 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006267};
6268
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269static int
6270unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006272 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006273
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 Py_UNICODE *s1 = str1->str;
6275 Py_UNICODE *s2 = str2->str;
6276
6277 len1 = str1->length;
6278 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006281 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006282
6283 c1 = *s1++;
6284 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006285
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006286 if (c1 > (1<<11) * 26)
6287 c1 += utf16Fixup[c1>>11];
6288 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006289 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006290 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006291
6292 if (c1 != c2)
6293 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006294
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006295 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 }
6297
6298 return (len1 < len2) ? -1 : (len1 != len2);
6299}
6300
Marc-André Lemburge5034372000-08-08 08:04:29 +00006301#else
6302
6303static int
6304unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006306 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006307
6308 Py_UNICODE *s1 = str1->str;
6309 Py_UNICODE *s2 = str2->str;
6310
6311 len1 = str1->length;
6312 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006313
Marc-André Lemburge5034372000-08-08 08:04:29 +00006314 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006315 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006316
Fredrik Lundh45714e92001-06-26 16:39:36 +00006317 c1 = *s1++;
6318 c2 = *s2++;
6319
6320 if (c1 != c2)
6321 return (c1 < c2) ? -1 : 1;
6322
Marc-André Lemburge5034372000-08-08 08:04:29 +00006323 len1--; len2--;
6324 }
6325
6326 return (len1 < len2) ? -1 : (len1 != len2);
6327}
6328
6329#endif
6330
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006332 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333{
6334 PyUnicodeObject *u = NULL, *v = NULL;
6335 int result;
6336
6337 /* Coerce the two arguments */
6338 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6339 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6342 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006343 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344
Thomas Wouters7e474022000-07-16 12:04:32 +00006345 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006347 Py_DECREF(u);
6348 Py_DECREF(v);
6349 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 }
6351
6352 result = unicode_compare(u, v);
6353
6354 Py_DECREF(u);
6355 Py_DECREF(v);
6356 return result;
6357
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006358 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 Py_XDECREF(u);
6360 Py_XDECREF(v);
6361 return -1;
6362}
6363
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006364PyObject *PyUnicode_RichCompare(PyObject *left,
6365 PyObject *right,
6366 int op)
6367{
6368 int result;
6369
6370 result = PyUnicode_Compare(left, right);
6371 if (result == -1 && PyErr_Occurred())
6372 goto onError;
6373
6374 /* Convert the return value to a Boolean */
6375 switch (op) {
6376 case Py_EQ:
6377 result = (result == 0);
6378 break;
6379 case Py_NE:
6380 result = (result != 0);
6381 break;
6382 case Py_LE:
6383 result = (result <= 0);
6384 break;
6385 case Py_GE:
6386 result = (result >= 0);
6387 break;
6388 case Py_LT:
6389 result = (result == -1);
6390 break;
6391 case Py_GT:
6392 result = (result == 1);
6393 break;
6394 }
6395 return PyBool_FromLong(result);
6396
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006397 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006398
6399 /* Standard case
6400
6401 Type errors mean that PyUnicode_FromObject() could not convert
6402 one of the arguments (usually the right hand side) to Unicode,
6403 ie. we can't handle the comparison request. However, it is
6404 possible that the other object knows a comparison method, which
6405 is why we return Py_NotImplemented to give the other object a
6406 chance.
6407
6408 */
6409 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6410 PyErr_Clear();
6411 Py_INCREF(Py_NotImplemented);
6412 return Py_NotImplemented;
6413 }
6414 if (op != Py_EQ && op != Py_NE)
6415 return NULL;
6416
6417 /* Equality comparison.
6418
6419 This is a special case: we silence any PyExc_UnicodeDecodeError
6420 and instead turn it into a PyErr_UnicodeWarning.
6421
6422 */
6423 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6424 return NULL;
6425 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006426 if (PyErr_Warn(PyExc_UnicodeWarning,
6427 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006428 "Unicode equal comparison "
6429 "failed to convert both arguments to Unicode - "
6430 "interpreting them as being unequal" :
6431 "Unicode unequal comparison "
6432 "failed to convert both arguments to Unicode - "
6433 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006434 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006435 return NULL;
6436 result = (op == Py_NE);
6437 return PyBool_FromLong(result);
6438}
6439
Guido van Rossum403d68b2000-03-13 15:55:09 +00006440int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006441 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006442{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006443 PyObject *str, *sub;
6444 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006445
6446 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006447 sub = PyUnicode_FromObject(element);
6448 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006449 PyErr_SetString(PyExc_TypeError,
6450 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006451 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006452 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006453
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006454 str = PyUnicode_FromObject(container);
6455 if (!str) {
6456 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006457 return -1;
6458 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006459
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006460 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006461
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006462 Py_DECREF(str);
6463 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006464
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006465 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006466}
6467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468/* Concat to string or Unicode object giving a new Unicode object. */
6469
6470PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006471 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472{
6473 PyUnicodeObject *u = NULL, *v = NULL, *w;
6474
6475 /* Coerce the two arguments */
6476 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6477 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006478 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6480 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006481 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
6483 /* Shortcuts */
6484 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006485 Py_DECREF(v);
6486 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 }
6488 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006489 Py_DECREF(u);
6490 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 }
6492
6493 /* Concat the two Unicode strings */
6494 w = _PyUnicode_New(u->length + v->length);
6495 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 Py_UNICODE_COPY(w->str, u->str, u->length);
6498 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6499
6500 Py_DECREF(u);
6501 Py_DECREF(v);
6502 return (PyObject *)w;
6503
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006504 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 Py_XDECREF(u);
6506 Py_XDECREF(v);
6507 return NULL;
6508}
6509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006510PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006511 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006513Return the number of non-overlapping occurrences of substring sub in\n\
6514Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006515interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
6517static PyObject *
6518unicode_count(PyUnicodeObject *self, PyObject *args)
6519{
6520 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006522 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 PyObject *result;
6524
Guido van Rossumb8872e62000-05-09 14:14:27 +00006525 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006526 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 return NULL;
6528
6529 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006530 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006532 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006533
Fredrik Lundhc8162812006-05-26 19:33:03 +00006534 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006536 result = PyInt_FromSsize_t(
6537 stringlib_count(self->str + start, end - start,
6538 substring->str, substring->length)
6539 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540
6541 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006542
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 return result;
6544}
6545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006546PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006547 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006549Encodes S using the codec registered for encoding. encoding defaults\n\
6550to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006551handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6553'xmlcharrefreplace' as well as any other name registered with\n\
6554codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555
6556static PyObject *
6557unicode_encode(PyUnicodeObject *self, PyObject *args)
6558{
6559 char *encoding = NULL;
6560 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006561 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006562
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6564 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006565 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006566 if (v == NULL)
6567 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006568 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006569 PyErr_Format(PyExc_TypeError,
6570 "encoder did not return a string/unicode object "
6571 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006572 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006573 Py_DECREF(v);
6574 return NULL;
6575 }
6576 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006577
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006578 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006579 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006580}
6581
6582PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006583 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006584\n\
6585Decodes S using the codec registered for encoding. encoding defaults\n\
6586to the default encoding. errors may be given to set a different error\n\
6587handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6588a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6589as well as any other name registerd with codecs.register_error that is\n\
6590able to handle UnicodeDecodeErrors.");
6591
6592static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006593unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006594{
6595 char *encoding = NULL;
6596 char *errors = NULL;
6597 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006599 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6600 return NULL;
6601 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006602 if (v == NULL)
6603 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006604 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006605 PyErr_Format(PyExc_TypeError,
6606 "decoder did not return a string/unicode object "
6607 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006608 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006609 Py_DECREF(v);
6610 return NULL;
6611 }
6612 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006613
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006614 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616}
6617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006619 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620\n\
6621Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006622If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
6624static PyObject*
6625unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6626{
6627 Py_UNICODE *e;
6628 Py_UNICODE *p;
6629 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006630 Py_UNICODE *qe;
6631 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 PyUnicodeObject *u;
6633 int tabsize = 8;
6634
6635 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
Thomas Wouters7e474022000-07-16 12:04:32 +00006638 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006639 i = 0; /* chars up to and including most recent \n or \r */
6640 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6641 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 for (p = self->str; p < e; p++)
6643 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006644 if (tabsize > 0) {
6645 incr = tabsize - (j % tabsize); /* cannot overflow */
6646 if (j > PY_SSIZE_T_MAX - incr)
6647 goto overflow1;
6648 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006649 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006652 if (j > PY_SSIZE_T_MAX - 1)
6653 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 j++;
6655 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 if (i > PY_SSIZE_T_MAX - j)
6657 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006659 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
6661 }
6662
Guido van Rossum5bdff602008-03-11 21:18:06 +00006663 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006664 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 /* Second pass: create output string and fill it */
6667 u = _PyUnicode_New(i + j);
6668 if (!u)
6669 return NULL;
6670
Guido van Rossum5bdff602008-03-11 21:18:06 +00006671 j = 0; /* same as in first pass */
6672 q = u->str; /* next output char */
6673 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675 for (p = self->str; p < e; p++)
6676 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006677 if (tabsize > 0) {
6678 i = tabsize - (j % tabsize);
6679 j += i;
6680 while (i--) {
6681 if (q >= qe)
6682 goto overflow2;
6683 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006684 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006685 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006686 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006687 else {
6688 if (q >= qe)
6689 goto overflow2;
6690 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006691 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 if (*p == '\n' || *p == '\r')
6693 j = 0;
6694 }
6695
6696 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006697
6698 overflow2:
6699 Py_DECREF(u);
6700 overflow1:
6701 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703}
6704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006705PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006706 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707\n\
6708Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006709such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710arguments start and end are interpreted as in slice notation.\n\
6711\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject *
6715unicode_find(PyUnicodeObject *self, PyObject *args)
6716{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006717 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006718 Py_ssize_t start;
6719 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006720 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
Facundo Batista57d56692007-11-16 18:04:14 +00006722 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006725 result = stringlib_find_slice(
6726 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6727 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6728 start, end
6729 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
6731 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006732
6733 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
6736static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006737unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 if (index < 0 || index >= self->length) {
6740 PyErr_SetString(PyExc_IndexError, "string index out of range");
6741 return NULL;
6742 }
6743
6744 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6745}
6746
6747static long
6748unicode_hash(PyUnicodeObject *self)
6749{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006750 /* Since Unicode objects compare equal to their ASCII string
6751 counterparts, they should use the individual character values
6752 as basis for their hash value. This is needed to assure that
6753 strings and Unicode objects behave in the same way as
6754 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
Martin v. Löwis18e16552006-02-15 17:27:45 +00006756 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006757 register Py_UNICODE *p;
6758 register long x;
6759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006761 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006762 len = PyUnicode_GET_SIZE(self);
6763 p = PyUnicode_AS_UNICODE(self);
6764 x = *p << 7;
6765 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006766 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006767 x ^= PyUnicode_GET_SIZE(self);
6768 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006769 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006770 self->hash = x;
6771 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772}
6773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006774PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006775 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
6779static PyObject *
6780unicode_index(PyUnicodeObject *self, PyObject *args)
6781{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006782 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006783 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006784 Py_ssize_t start;
6785 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786
Facundo Batista57d56692007-11-16 18:04:14 +00006787 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006790 result = stringlib_find_slice(
6791 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6792 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6793 start, end
6794 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795
6796 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006797
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 if (result < 0) {
6799 PyErr_SetString(PyExc_ValueError, "substring not found");
6800 return NULL;
6801 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006802
Martin v. Löwis18e16552006-02-15 17:27:45 +00006803 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006813unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
6815 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6816 register const Py_UNICODE *e;
6817 int cased;
6818
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 /* Shortcut for single character strings */
6820 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006821 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006823 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006824 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006825 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006826
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 e = p + PyUnicode_GET_SIZE(self);
6828 cased = 0;
6829 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006830 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006831
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006832 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6833 return PyBool_FromLong(0);
6834 else if (!cased && Py_UNICODE_ISLOWER(ch))
6835 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006837 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838}
6839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006840PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006841 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006843Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006844at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
6846static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006847unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848{
6849 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6850 register const Py_UNICODE *e;
6851 int cased;
6852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 /* Shortcut for single character strings */
6854 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006855 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006857 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006858 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006859 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006860
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 e = p + PyUnicode_GET_SIZE(self);
6862 cased = 0;
6863 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006864 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006865
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006866 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6867 return PyBool_FromLong(0);
6868 else if (!cased && Py_UNICODE_ISUPPER(ch))
6869 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006874PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006875 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006877Return True if S is a titlecased string and there is at least one\n\
6878character in S, i.e. upper- and titlecase characters may only\n\
6879follow uncased characters and lowercase characters only cased ones.\n\
6880Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
6882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006883unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884{
6885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6886 register const Py_UNICODE *e;
6887 int cased, previous_is_cased;
6888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 /* Shortcut for single character strings */
6890 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006891 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6892 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006894 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006895 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 e = p + PyUnicode_GET_SIZE(self);
6899 cased = 0;
6900 previous_is_cased = 0;
6901 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006902 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006903
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006904 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6905 if (previous_is_cased)
6906 return PyBool_FromLong(0);
6907 previous_is_cased = 1;
6908 cased = 1;
6909 }
6910 else if (Py_UNICODE_ISLOWER(ch)) {
6911 if (!previous_is_cased)
6912 return PyBool_FromLong(0);
6913 previous_is_cased = 1;
6914 cased = 1;
6915 }
6916 else
6917 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006919 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006922PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006925Return True if all characters in S are whitespace\n\
6926and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006929unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930{
6931 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6932 register const Py_UNICODE *e;
6933
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 /* Shortcut for single character strings */
6935 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006936 Py_UNICODE_ISSPACE(*p))
6937 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006939 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006940 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006941 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 e = p + PyUnicode_GET_SIZE(self);
6944 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006945 if (!Py_UNICODE_ISSPACE(*p))
6946 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006953\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006954Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006956
6957static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006958unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006959{
6960 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6961 register const Py_UNICODE *e;
6962
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006963 /* Shortcut for single character strings */
6964 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006965 Py_UNICODE_ISALPHA(*p))
6966 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006967
6968 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006969 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006970 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006971
6972 e = p + PyUnicode_GET_SIZE(self);
6973 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006974 if (!Py_UNICODE_ISALPHA(*p))
6975 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006976 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006977 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006982\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006983Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006984and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006985
6986static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006987unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006988{
6989 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6990 register const Py_UNICODE *e;
6991
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006992 /* Shortcut for single character strings */
6993 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006994 Py_UNICODE_ISALNUM(*p))
6995 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006996
6997 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006998 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006999 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007000
7001 e = p + PyUnicode_GET_SIZE(self);
7002 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007003 if (!Py_UNICODE_ISALNUM(*p))
7004 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007005 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007006 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007007}
7008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007009PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007010 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007012Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007013False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014
7015static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007016unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017{
7018 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7019 register const Py_UNICODE *e;
7020
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 /* Shortcut for single character strings */
7022 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007023 Py_UNICODE_ISDECIMAL(*p))
7024 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007026 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007027 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007028 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007029
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 e = p + PyUnicode_GET_SIZE(self);
7031 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007032 if (!Py_UNICODE_ISDECIMAL(*p))
7033 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007035 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007039 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007041Return True if all characters in S are digits\n\
7042and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007045unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
7047 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7048 register const Py_UNICODE *e;
7049
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 /* Shortcut for single character strings */
7051 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007052 Py_UNICODE_ISDIGIT(*p))
7053 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007055 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007056 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007057 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007058
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 e = p + PyUnicode_GET_SIZE(self);
7060 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007061 if (!Py_UNICODE_ISDIGIT(*p))
7062 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007064 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065}
7066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007068 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007070Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007071False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072
7073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007074unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075{
7076 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7077 register const Py_UNICODE *e;
7078
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 /* Shortcut for single character strings */
7080 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007081 Py_UNICODE_ISNUMERIC(*p))
7082 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007084 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007085 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007086 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007087
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 e = p + PyUnicode_GET_SIZE(self);
7089 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007090 if (!Py_UNICODE_ISNUMERIC(*p))
7091 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007093 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094}
7095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096PyDoc_STRVAR(join__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007097 "S.join(sequence) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098\n\
7099Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101
7102static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007103unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007105 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106}
7107
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109unicode_length(PyUnicodeObject *self)
7110{
7111 return self->length;
7112}
7113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007114PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007115 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007117Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007118done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
7120static PyObject *
7121unicode_ljust(PyUnicodeObject *self, PyObject *args)
7122{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007123 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007124 Py_UNICODE fillchar = ' ';
7125
Martin v. Löwis412fb672006-04-13 06:34:32 +00007126 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 return NULL;
7128
Tim Peters7a29bd52001-09-12 03:03:31 +00007129 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 Py_INCREF(self);
7131 return (PyObject*) self;
7132 }
7133
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007134 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135}
7136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007138 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
7142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007143unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return fixup(self, fixlower);
7146}
7147
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7156
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007157/* externally visible for str.strip(unicode) */
7158PyObject *
7159_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7160{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007161 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7162 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7163 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7164 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7165 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007166
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007167 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007168
Benjamin Peterson857ce152009-01-31 16:29:18 +00007169 i = 0;
7170 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7172 i++;
7173 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007174 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007175
Benjamin Peterson857ce152009-01-31 16:29:18 +00007176 j = len;
7177 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007178 do {
7179 j--;
7180 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7181 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007182 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007183
Benjamin Peterson857ce152009-01-31 16:29:18 +00007184 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007185 Py_INCREF(self);
7186 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007187 }
7188 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007189 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007190}
7191
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
7193static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007194do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007196 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7197 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007198
Benjamin Peterson857ce152009-01-31 16:29:18 +00007199 i = 0;
7200 if (striptype != RIGHTSTRIP) {
7201 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7202 i++;
7203 }
7204 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007205
Benjamin Peterson857ce152009-01-31 16:29:18 +00007206 j = len;
7207 if (striptype != LEFTSTRIP) {
7208 do {
7209 j--;
7210 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7211 j++;
7212 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213
Benjamin Peterson857ce152009-01-31 16:29:18 +00007214 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7215 Py_INCREF(self);
7216 return (PyObject*)self;
7217 }
7218 else
7219 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220}
7221
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222
7223static PyObject *
7224do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7225{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007226 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007227
Benjamin Peterson857ce152009-01-31 16:29:18 +00007228 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7229 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007230
Benjamin Peterson857ce152009-01-31 16:29:18 +00007231 if (sep != NULL && sep != Py_None) {
7232 if (PyUnicode_Check(sep))
7233 return _PyUnicode_XStrip(self, striptype, sep);
7234 else if (PyString_Check(sep)) {
7235 PyObject *res;
7236 sep = PyUnicode_FromObject(sep);
7237 if (sep==NULL)
7238 return NULL;
7239 res = _PyUnicode_XStrip(self, striptype, sep);
7240 Py_DECREF(sep);
7241 return res;
7242 }
7243 else {
7244 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007245 "%s arg must be None, unicode or str",
7246 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007247 return NULL;
7248 }
7249 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007250
Benjamin Peterson857ce152009-01-31 16:29:18 +00007251 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007252}
7253
7254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007255PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007256 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007257\n\
7258Return a copy of the string S with leading and trailing\n\
7259whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007260If chars is given and not None, remove characters in chars instead.\n\
7261If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007262
7263static PyObject *
7264unicode_strip(PyUnicodeObject *self, PyObject *args)
7265{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007266 if (PyTuple_GET_SIZE(args) == 0)
7267 return do_strip(self, BOTHSTRIP); /* Common case */
7268 else
7269 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007270}
7271
7272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007273PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007274 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007275\n\
7276Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007277If chars is given and not None, remove characters in chars instead.\n\
7278If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007279
7280static PyObject *
7281unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7282{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007283 if (PyTuple_GET_SIZE(args) == 0)
7284 return do_strip(self, LEFTSTRIP); /* Common case */
7285 else
7286 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007287}
7288
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007291 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007292\n\
7293Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007294If chars is given and not None, remove characters in chars instead.\n\
7295If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007296
7297static PyObject *
7298unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7299{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007300 if (PyTuple_GET_SIZE(args) == 0)
7301 return do_strip(self, RIGHTSTRIP); /* Common case */
7302 else
7303 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007304}
7305
7306
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007308unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309{
7310 PyUnicodeObject *u;
7311 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007312 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007313 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
7315 if (len < 0)
7316 len = 0;
7317
Tim Peters7a29bd52001-09-12 03:03:31 +00007318 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 /* no repeat, return original string */
7320 Py_INCREF(str);
7321 return (PyObject*) str;
7322 }
Tim Peters8f422462000-09-09 06:13:41 +00007323
7324 /* ensure # of chars needed doesn't overflow int and # of bytes
7325 * needed doesn't overflow size_t
7326 */
7327 nchars = len * str->length;
7328 if (len && nchars / len != str->length) {
7329 PyErr_SetString(PyExc_OverflowError,
7330 "repeated string is too long");
7331 return NULL;
7332 }
7333 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7334 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7335 PyErr_SetString(PyExc_OverflowError,
7336 "repeated string is too long");
7337 return NULL;
7338 }
7339 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 if (!u)
7341 return NULL;
7342
7343 p = u->str;
7344
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007345 if (str->length == 1 && len > 0) {
7346 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007347 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007348 Py_ssize_t done = 0; /* number of characters copied this far */
7349 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007350 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007351 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007352 }
7353 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007354 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007355 Py_UNICODE_COPY(p+done, p, n);
7356 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007357 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
7360 return (PyObject*) u;
7361}
7362
7363PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007364 PyObject *subobj,
7365 PyObject *replobj,
7366 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367{
7368 PyObject *self;
7369 PyObject *str1;
7370 PyObject *str2;
7371 PyObject *result;
7372
7373 self = PyUnicode_FromObject(obj);
7374 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 str1 = PyUnicode_FromObject(subobj);
7377 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007378 Py_DECREF(self);
7379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 }
7381 str2 = PyUnicode_FromObject(replobj);
7382 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007383 Py_DECREF(self);
7384 Py_DECREF(str1);
7385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 }
Tim Petersced69f82003-09-16 20:30:58 +00007387 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007388 (PyUnicodeObject *)str1,
7389 (PyUnicodeObject *)str2,
7390 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 Py_DECREF(self);
7392 Py_DECREF(str1);
7393 Py_DECREF(str2);
7394 return result;
7395}
7396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007398 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399\n\
7400Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007401old replaced by new. If the optional argument count is\n\
7402given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403
7404static PyObject*
7405unicode_replace(PyUnicodeObject *self, PyObject *args)
7406{
7407 PyUnicodeObject *str1;
7408 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007409 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 PyObject *result;
7411
Martin v. Löwis18e16552006-02-15 17:27:45 +00007412 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 return NULL;
7414 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7415 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007418 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007419 Py_DECREF(str1);
7420 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
7423 result = replace(self, str1, str2, maxcount);
7424
7425 Py_DECREF(str1);
7426 Py_DECREF(str2);
7427 return result;
7428}
7429
7430static
7431PyObject *unicode_repr(PyObject *unicode)
7432{
7433 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007434 PyUnicode_GET_SIZE(unicode),
7435 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436}
7437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007438PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007439 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440\n\
7441Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007442such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443arguments start and end are interpreted as in slice notation.\n\
7444\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject *
7448unicode_rfind(PyUnicodeObject *self, PyObject *args)
7449{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007450 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007451 Py_ssize_t start;
7452 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007453 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
Facundo Batista57d56692007-11-16 18:04:14 +00007455 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007458 result = stringlib_rfind_slice(
7459 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7460 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7461 start, end
7462 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007465
7466 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467}
7468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007469PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007470 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473
7474static PyObject *
7475unicode_rindex(PyUnicodeObject *self, PyObject *args)
7476{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007477 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007478 Py_ssize_t start;
7479 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007480 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
Facundo Batista57d56692007-11-16 18:04:14 +00007482 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007485 result = stringlib_rfind_slice(
7486 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7487 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7488 start, end
7489 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490
7491 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007492
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 if (result < 0) {
7494 PyErr_SetString(PyExc_ValueError, "substring not found");
7495 return NULL;
7496 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007497 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007501 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007503Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007504done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
7506static PyObject *
7507unicode_rjust(PyUnicodeObject *self, PyObject *args)
7508{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007509 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007510 Py_UNICODE fillchar = ' ';
7511
Martin v. Löwis412fb672006-04-13 06:34:32 +00007512 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 return NULL;
7514
Tim Peters7a29bd52001-09-12 03:03:31 +00007515 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 Py_INCREF(self);
7517 return (PyObject*) self;
7518 }
7519
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007520 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521}
7522
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007524unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525{
7526 /* standard clamping */
7527 if (start < 0)
7528 start = 0;
7529 if (end < 0)
7530 end = 0;
7531 if (end > self->length)
7532 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007533 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 /* full slice, return original string */
7535 Py_INCREF(self);
7536 return (PyObject*) self;
7537 }
7538 if (start > end)
7539 start = end;
7540 /* copy slice */
7541 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007542 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543}
7544
7545PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007546 PyObject *sep,
7547 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548{
7549 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007550
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 s = PyUnicode_FromObject(s);
7552 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007553 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007554 if (sep != NULL) {
7555 sep = PyUnicode_FromObject(sep);
7556 if (sep == NULL) {
7557 Py_DECREF(s);
7558 return NULL;
7559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 }
7561
7562 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7563
7564 Py_DECREF(s);
7565 Py_XDECREF(sep);
7566 return result;
7567}
7568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007570 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571\n\
7572Return a list of the words in S, using sep as the\n\
7573delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007574splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007575whitespace string is a separator and empty strings are\n\
7576removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject*
7579unicode_split(PyUnicodeObject *self, PyObject *args)
7580{
7581 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007582 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 return NULL;
7586
7587 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007588 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007590 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007592 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593}
7594
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007595PyObject *
7596PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7597{
7598 PyObject* str_obj;
7599 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007600 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007601
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007602 str_obj = PyUnicode_FromObject(str_in);
7603 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007604 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007605 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007606 if (!sep_obj) {
7607 Py_DECREF(str_obj);
7608 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007609 }
7610
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007611 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007612 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7613 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7614 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007615
Fredrik Lundhb9479482006-05-26 17:22:38 +00007616 Py_DECREF(sep_obj);
7617 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007618
7619 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007620}
7621
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007622
7623PyObject *
7624PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7625{
7626 PyObject* str_obj;
7627 PyObject* sep_obj;
7628 PyObject* out;
7629
7630 str_obj = PyUnicode_FromObject(str_in);
7631 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007632 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007633 sep_obj = PyUnicode_FromObject(sep_in);
7634 if (!sep_obj) {
7635 Py_DECREF(str_obj);
7636 return NULL;
7637 }
7638
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007639 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007640 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7641 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7642 );
7643
7644 Py_DECREF(sep_obj);
7645 Py_DECREF(str_obj);
7646
7647 return out;
7648}
7649
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007650PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007651 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007652\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007653Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007654the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007655found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007656
7657static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007658unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007659{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007660 return PyUnicode_Partition((PyObject *)self, separator);
7661}
7662
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007663PyDoc_STRVAR(rpartition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007664 "S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007665\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007666Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007667the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007668separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007669
7670static PyObject*
7671unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7672{
7673 return PyUnicode_RPartition((PyObject *)self, separator);
7674}
7675
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007676PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007677 PyObject *sep,
7678 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007679{
7680 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007681
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007682 s = PyUnicode_FromObject(s);
7683 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007684 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007685 if (sep != NULL) {
7686 sep = PyUnicode_FromObject(sep);
7687 if (sep == NULL) {
7688 Py_DECREF(s);
7689 return NULL;
7690 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007691 }
7692
7693 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7694
7695 Py_DECREF(s);
7696 Py_XDECREF(sep);
7697 return result;
7698}
7699
7700PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007701 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007702\n\
7703Return a list of the words in S, using sep as the\n\
7704delimiter string, starting at the end of the string and\n\
7705working to the front. If maxsplit is given, at most maxsplit\n\
7706splits are done. If sep is not specified, any whitespace string\n\
7707is a separator.");
7708
7709static PyObject*
7710unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7711{
7712 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007713 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007714
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007716 return NULL;
7717
7718 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007719 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007720 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007721 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007722 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007723 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007724}
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007727 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728\n\
7729Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007730Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007731is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732
7733static PyObject*
7734unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7735{
Guido van Rossum86662912000-04-11 15:38:46 +00007736 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Guido van Rossum86662912000-04-11 15:38:46 +00007738 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 return NULL;
7740
Guido van Rossum86662912000-04-11 15:38:46 +00007741 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742}
7743
7744static
7745PyObject *unicode_str(PyUnicodeObject *self)
7746{
Fred Drakee4315f52000-05-09 19:53:39 +00007747 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748}
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007751 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752\n\
7753Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007757unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 return fixup(self, fixswapcase);
7760}
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007763 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764\n\
7765Return a copy of the string S, where all characters have been mapped\n\
7766through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007767Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7768Unmapped characters are left untouched. Characters mapped to None\n\
7769are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770
7771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007772unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773{
Tim Petersced69f82003-09-16 20:30:58 +00007774 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007775 self->length,
7776 table,
7777 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007781 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007783Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784
7785static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007786unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 return fixup(self, fixupper);
7789}
7790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007792 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793\n\
Georg Brandl98064072008-09-09 19:26:00 +00007794Pad a numeric string S with zeros on the left, to fill a field\n\
7795of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
7797static PyObject *
7798unicode_zfill(PyUnicodeObject *self, PyObject *args)
7799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007800 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 PyUnicodeObject *u;
7802
Martin v. Löwis18e16552006-02-15 17:27:45 +00007803 Py_ssize_t width;
7804 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 return NULL;
7806
7807 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007808 if (PyUnicode_CheckExact(self)) {
7809 Py_INCREF(self);
7810 return (PyObject*) self;
7811 }
7812 else
7813 return PyUnicode_FromUnicode(
7814 PyUnicode_AS_UNICODE(self),
7815 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007816 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 }
7818
7819 fill = width - self->length;
7820
7821 u = pad(self, fill, 0, '0');
7822
Walter Dörwald068325e2002-04-15 13:36:47 +00007823 if (u == NULL)
7824 return NULL;
7825
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 if (u->str[fill] == '+' || u->str[fill] == '-') {
7827 /* move sign to beginning of string */
7828 u->str[0] = u->str[fill];
7829 u->str[fill] = '0';
7830 }
7831
7832 return (PyObject*) u;
7833}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
#if 0
/* Disabled by the preprocessor: returns the file-level numfree value
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007843PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007844 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007846Return True if S starts with the specified prefix, False otherwise.\n\
7847With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007848With optional end, stop comparing S at that position.\n\
7849prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850
7851static PyObject *
7852unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007853 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854{
Georg Brandl24250812006-06-09 18:45:48 +00007855 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007857 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007858 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007859 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
Georg Brandl24250812006-06-09 18:45:48 +00007861 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007862 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7863 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007864 if (PyTuple_Check(subobj)) {
7865 Py_ssize_t i;
7866 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7867 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007868 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007869 if (substring == NULL)
7870 return NULL;
7871 result = tailmatch(self, substring, start, end, -1);
7872 Py_DECREF(substring);
7873 if (result) {
7874 Py_RETURN_TRUE;
7875 }
7876 }
7877 /* nothing matched */
7878 Py_RETURN_FALSE;
7879 }
7880 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007882 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007883 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007885 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886}
7887
7888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007889PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007890 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007892Return True if S ends with the specified suffix, False otherwise.\n\
7893With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007894With optional end, stop comparing S at that position.\n\
7895suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896
7897static PyObject *
7898unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007899 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900{
Georg Brandl24250812006-06-09 18:45:48 +00007901 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007903 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007904 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007905 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906
Georg Brandl24250812006-06-09 18:45:48 +00007907 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007908 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7909 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007910 if (PyTuple_Check(subobj)) {
7911 Py_ssize_t i;
7912 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7913 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007914 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007915 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007916 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007917 result = tailmatch(self, substring, start, end, +1);
7918 Py_DECREF(substring);
7919 if (result) {
7920 Py_RETURN_TRUE;
7921 }
7922 }
7923 Py_RETURN_FALSE;
7924 }
7925 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928
Georg Brandl24250812006-06-09 18:45:48 +00007929 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007931 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932}
7933
7934
Eric Smitha9f7d622008-02-17 19:46:49 +00007935/* Implements do_string_format, which is unicode because of stringlib */
7936#include "stringlib/string_format.h"
7937
7938PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007939 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007940\n\
7941");
7942
Eric Smithdc13b792008-05-30 18:10:04 +00007943static PyObject *
7944unicode__format__(PyObject *self, PyObject *args)
7945{
7946 PyObject *format_spec;
7947 PyObject *result = NULL;
7948 PyObject *tmp = NULL;
7949
7950 /* If 2.x, convert format_spec to the same type as value */
7951 /* This is to allow things like u''.format('') */
7952 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7953 goto done;
7954 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7955 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007956 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007957 goto done;
7958 }
7959 tmp = PyObject_Unicode(format_spec);
7960 if (tmp == NULL)
7961 goto done;
7962 format_spec = tmp;
7963
7964 result = _PyUnicode_FormatAdvanced(self,
7965 PyUnicode_AS_UNICODE(format_spec),
7966 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007967 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007968 Py_XDECREF(tmp);
7969 return result;
7970}
7971
Eric Smitha9f7d622008-02-17 19:46:49 +00007972PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007973 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007974\n\
7975");
7976
Robert Schuppenies901c9972008-06-10 10:10:31 +00007977static PyObject *
7978unicode__sizeof__(PyUnicodeObject *v)
7979{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007980 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7981 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007982}
7983
7984PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007985 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007986\n\
7987");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007988
7989static PyObject *
7990unicode_getnewargs(PyUnicodeObject *v)
7991{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007992 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007993}
7994
7995
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996static PyMethodDef unicode_methods[] = {
7997
7998 /* Order is according to common usage: often used methods should
7999 appear first, since lookup is done sequentially. */
8000
Georg Brandlecdc0a92006-03-30 12:19:07 +00008001 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008002 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8003 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008004 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008005 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8006 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8007 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8008 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8009 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8010 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8011 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00008012 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008013 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8014 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8015 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008016 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00008017 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008018/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8019 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8020 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8021 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008022 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00008023 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008024 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008025 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008026 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8027 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8028 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8029 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8030 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8031 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8032 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8033 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8034 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8035 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8036 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8037 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8038 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8039 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008040 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00008041 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8042 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8043 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8044 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00008045 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008046#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008047 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048#endif
8049
8050#if 0
8051 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008052 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053#endif
8054
Benjamin Peterson857ce152009-01-31 16:29:18 +00008055 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 {NULL, NULL}
8057};
8058
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008059static PyObject *
8060unicode_mod(PyObject *v, PyObject *w)
8061{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008062 if (!PyUnicode_Check(v)) {
8063 Py_INCREF(Py_NotImplemented);
8064 return Py_NotImplemented;
8065 }
8066 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008067}
8068
8069static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008070 0, /*nb_add*/
8071 0, /*nb_subtract*/
8072 0, /*nb_multiply*/
8073 0, /*nb_divide*/
8074 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008075};
8076
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008078 (lenfunc) unicode_length, /* sq_length */
8079 PyUnicode_Concat, /* sq_concat */
8080 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8081 (ssizeargfunc) unicode_getitem, /* sq_item */
8082 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8083 0, /* sq_ass_item */
8084 0, /* sq_ass_slice */
8085 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086};
8087
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008088static PyObject*
8089unicode_subscript(PyUnicodeObject* self, PyObject* item)
8090{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008091 if (PyIndex_Check(item)) {
8092 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008093 if (i == -1 && PyErr_Occurred())
8094 return NULL;
8095 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008096 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008097 return unicode_getitem(self, i);
8098 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008099 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008100 Py_UNICODE* source_buf;
8101 Py_UNICODE* result_buf;
8102 PyObject* result;
8103
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008104 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008105 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008106 return NULL;
8107 }
8108
8109 if (slicelength <= 0) {
8110 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008111 } else if (start == 0 && step == 1 && slicelength == self->length &&
8112 PyUnicode_CheckExact(self)) {
8113 Py_INCREF(self);
8114 return (PyObject *)self;
8115 } else if (step == 1) {
8116 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008117 } else {
8118 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008119 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8120 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008121
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008122 if (result_buf == NULL)
8123 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008124
8125 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8126 result_buf[i] = source_buf[cur];
8127 }
Tim Petersced69f82003-09-16 20:30:58 +00008128
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008129 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008130 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008131 return result;
8132 }
8133 } else {
8134 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8135 return NULL;
8136 }
8137}
8138
8139static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008140 (lenfunc)unicode_length, /* mp_length */
8141 (binaryfunc)unicode_subscript, /* mp_subscript */
8142 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008143};
8144
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008147 Py_ssize_t index,
8148 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149{
8150 if (index != 0) {
8151 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008152 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return -1;
8154 }
8155 *ptr = (void *) self->str;
8156 return PyUnicode_GET_DATA_SIZE(self);
8157}
8158
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159static Py_ssize_t
8160unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008161 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162{
8163 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008164 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 return -1;
8166}
8167
8168static int
8169unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008170 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171{
8172 if (lenp)
8173 *lenp = PyUnicode_GET_DATA_SIZE(self);
8174 return 1;
8175}
8176
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008177static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008179 Py_ssize_t index,
8180 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181{
8182 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008183
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 if (index != 0) {
8185 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008186 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 return -1;
8188 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008189 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008191 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008192 *ptr = (void *) PyString_AS_STRING(str);
8193 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194}
8195
8196/* Helpers for PyUnicode_Format() */
8197
8198static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008199getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008201 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008203 (*p_argidx)++;
8204 if (arglen < 0)
8205 return args;
8206 else
8207 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 }
8209 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008210 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 return NULL;
8212}
8213
/* Bit flags describing a format specifier.  F_ALT selects the alternate
   ("#") form.  The others presumably correspond to the '-', '+', ' ' and
   '0' conversion flags -- confirm against the format-string parser. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
Martin v. Löwis18e16552006-02-15 17:27:45 +00008220static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008221strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008223 register Py_ssize_t i;
8224 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008226 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 return len;
8229}
8230
Neal Norwitzfc76d632006-01-10 06:03:13 +00008231static int
Eric Smith068f0652009-04-25 21:40:15 +00008232doubletounicode(Py_UNICODE *buffer, size_t len, int format_code,
8233 int precision, int flags, double x)
Neal Norwitzfc76d632006-01-10 06:03:13 +00008234{
Tim Peters15231542006-02-16 01:08:01 +00008235 Py_ssize_t result;
8236
Eric Smith068f0652009-04-25 21:40:15 +00008237 _PyOS_double_to_string((char *)buffer, len, x, format_code, precision,
8238 flags, NULL);
Tim Peters15231542006-02-16 01:08:01 +00008239 result = strtounicode(buffer, (char *)buffer);
8240 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008241}
8242
8243static int
8244longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8245{
Tim Peters15231542006-02-16 01:08:01 +00008246 Py_ssize_t result;
8247
Neal Norwitzfc76d632006-01-10 06:03:13 +00008248 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008249 result = strtounicode(buffer, (char *)buffer);
8250 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008251}
8252
Guido van Rossum078151d2002-08-11 04:24:12 +00008253/* XXX To save some code duplication, formatfloat/long/int could have been
8254 shared with stringobject.c, converting from 8-bit to Unicode after the
8255 formatting is done. */
8256
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257static int
8258formatfloat(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 size_t buflen,
8260 int flags,
8261 int prec,
8262 int type,
8263 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008266
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 x = PyFloat_AsDouble(v);
8268 if (x == -1.0 && PyErr_Occurred())
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008269 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008271 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008272 /* make sure that the decimal representation of precision really does
8273 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8274 if (prec > 0x7fffffffL) {
8275 PyErr_SetString(PyExc_OverflowError,
8276 "outrageously large precision "
8277 "for formatted float");
8278 return -1;
8279 }
8280
Mark Dickinson2e648ec2009-03-29 14:37:51 +00008281 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008282 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008283 /* Worst case length calc to ensure no buffer overrun:
8284
8285 'g' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008286 fmt = %#.<prec>g
8287 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8288 for any double rep.)
8289 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008290
8291 'f' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008292 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8293 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008294
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008295 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008296 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008297
8298 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008299 if (((type == 'g' || type == 'G') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008300 buflen <= (size_t)10 + (size_t)prec) ||
8301 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8302 PyErr_SetString(PyExc_OverflowError,
8303 "formatted float is too long (precision too large?)");
8304 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008305 }
Eric Smith068f0652009-04-25 21:40:15 +00008306 return doubletounicode(buf, buflen, type, prec,
8307 (flags&F_ALT)?Py_DTSF_ALT:0, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308}
8309
Tim Peters38fd5b62000-09-21 05:43:11 +00008310static PyObject*
8311formatlong(PyObject *val, int flags, int prec, int type)
8312{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008313 char *buf;
8314 int i, len;
8315 PyObject *str; /* temporary string object. */
8316 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008317
Benjamin Peterson857ce152009-01-31 16:29:18 +00008318 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8319 if (!str)
8320 return NULL;
8321 result = _PyUnicode_New(len);
8322 if (!result) {
8323 Py_DECREF(str);
8324 return NULL;
8325 }
8326 for (i = 0; i < len; i++)
8327 result->str[i] = buf[i];
8328 result->str[len] = 0;
8329 Py_DECREF(str);
8330 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008331}
8332
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333static int
8334formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008335 size_t buflen,
8336 int flags,
8337 int prec,
8338 int type,
8339 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008341 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008342 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8343 * + 1 + 1
8344 * = 24
8345 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008346 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008347 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 long x;
8349
8350 x = PyInt_AsLong(v);
8351 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008352 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008353 if (x < 0 && type == 'u') {
8354 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008355 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008356 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8357 sign = "-";
8358 else
8359 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008361 prec = 1;
8362
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008363 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8364 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008365 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008366 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008367 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008368 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008369 return -1;
8370 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008371
8372 if ((flags & F_ALT) &&
8373 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008374 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008375 * of issues that cause pain:
8376 * - when 0 is being converted, the C standard leaves off
8377 * the '0x' or '0X', which is inconsistent with other
8378 * %#x/%#X conversions and inconsistent with Python's
8379 * hex() function
8380 * - there are platforms that violate the standard and
8381 * convert 0 with the '0x' or '0X'
8382 * (Metrowerks, Compaq Tru64)
8383 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008384 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008385 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008386 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008387 * We can achieve the desired consistency by inserting our
8388 * own '0x' or '0X' prefix, and substituting %x/%X in place
8389 * of %#x/%#X.
8390 *
8391 * Note that this is the same approach as used in
8392 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008393 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008394 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8395 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008396 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008397 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008398 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8399 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008400 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008401 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008402 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008403 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008404 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008405 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406}
8407
8408static int
8409formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008410 size_t buflen,
8411 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008413 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008414 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008415 if (PyUnicode_GET_SIZE(v) != 1)
8416 goto onError;
8417 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008420 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008421 if (PyString_GET_SIZE(v) != 1)
8422 goto onError;
8423 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425
8426 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008427 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008429 x = PyInt_AsLong(v);
8430 if (x == -1 && PyErr_Occurred())
8431 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008432#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008433 if (x < 0 || x > 0x10ffff) {
8434 PyErr_SetString(PyExc_OverflowError,
8435 "%c arg not in range(0x110000) "
8436 "(wide Python build)");
8437 return -1;
8438 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008439#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008440 if (x < 0 || x > 0xffff) {
8441 PyErr_SetString(PyExc_OverflowError,
8442 "%c arg not in range(0x10000) "
8443 "(narrow Python build)");
8444 return -1;
8445 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008446#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008447 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 }
8449 buf[1] = '\0';
8450 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008452 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008453 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008454 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008455 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456}
8457
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008458/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8459
8460 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8461 chars are formatted. XXX This is a magic number. Each formatting
8462 routine does bounds checking to ensure no overflow, but a better
8463 solution may be to malloc a buffer of appropriate size for each
8464 format. For now, the current solution is sufficient.
8465*/
#define FORMATBUFLEN ((size_t)120)
8467
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008469 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
8471 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008472 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 int args_owned = 0;
8474 PyUnicodeObject *result = NULL;
8475 PyObject *dict = NULL;
8476 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008477
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008479 PyErr_BadInternalCall();
8480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 }
8482 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008483 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 fmt = PyUnicode_AS_UNICODE(uformat);
8486 fmtcnt = PyUnicode_GET_SIZE(uformat);
8487
8488 reslen = rescnt = fmtcnt + 100;
8489 result = _PyUnicode_New(reslen);
8490 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008491 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 res = PyUnicode_AS_UNICODE(result);
8493
8494 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008495 arglen = PyTuple_Size(args);
8496 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 }
8498 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008499 arglen = -1;
8500 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501 }
Christian Heimese93237d2007-12-19 02:37:44 +00008502 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008503 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008504 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505
8506 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008507 if (*fmt != '%') {
8508 if (--rescnt < 0) {
8509 rescnt = fmtcnt + 100;
8510 reslen += rescnt;
8511 if (_PyUnicode_Resize(&result, reslen) < 0)
8512 goto onError;
8513 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8514 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008515 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008516 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008517 }
8518 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008519 /* Got a format specifier */
8520 int flags = 0;
8521 Py_ssize_t width = -1;
8522 int prec = -1;
8523 Py_UNICODE c = '\0';
8524 Py_UNICODE fill;
8525 int isnumok;
8526 PyObject *v = NULL;
8527 PyObject *temp = NULL;
8528 Py_UNICODE *pbuf;
8529 Py_UNICODE sign;
8530 Py_ssize_t len;
8531 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8532
8533 fmt++;
8534 if (*fmt == '(') {
8535 Py_UNICODE *keystart;
8536 Py_ssize_t keylen;
8537 PyObject *key;
8538 int pcount = 1;
8539
8540 if (dict == NULL) {
8541 PyErr_SetString(PyExc_TypeError,
8542 "format requires a mapping");
8543 goto onError;
8544 }
8545 ++fmt;
8546 --fmtcnt;
8547 keystart = fmt;
8548 /* Skip over balanced parentheses */
8549 while (pcount > 0 && --fmtcnt >= 0) {
8550 if (*fmt == ')')
8551 --pcount;
8552 else if (*fmt == '(')
8553 ++pcount;
8554 fmt++;
8555 }
8556 keylen = fmt - keystart - 1;
8557 if (fmtcnt < 0 || pcount > 0) {
8558 PyErr_SetString(PyExc_ValueError,
8559 "incomplete format key");
8560 goto onError;
8561 }
8562#if 0
8563 /* keys are converted to strings using UTF-8 and
8564 then looked up since Python uses strings to hold
8565 variables names etc. in its namespaces and we
8566 wouldn't want to break common idioms. */
8567 key = PyUnicode_EncodeUTF8(keystart,
8568 keylen,
8569 NULL);
8570#else
8571 key = PyUnicode_FromUnicode(keystart, keylen);
8572#endif
8573 if (key == NULL)
8574 goto onError;
8575 if (args_owned) {
8576 Py_DECREF(args);
8577 args_owned = 0;
8578 }
8579 args = PyObject_GetItem(dict, key);
8580 Py_DECREF(key);
8581 if (args == NULL) {
8582 goto onError;
8583 }
8584 args_owned = 1;
8585 arglen = -1;
8586 argidx = -2;
8587 }
8588 while (--fmtcnt >= 0) {
8589 switch (c = *fmt++) {
8590 case '-': flags |= F_LJUST; continue;
8591 case '+': flags |= F_SIGN; continue;
8592 case ' ': flags |= F_BLANK; continue;
8593 case '#': flags |= F_ALT; continue;
8594 case '0': flags |= F_ZERO; continue;
8595 }
8596 break;
8597 }
8598 if (c == '*') {
8599 v = getnextarg(args, arglen, &argidx);
8600 if (v == NULL)
8601 goto onError;
8602 if (!PyInt_Check(v)) {
8603 PyErr_SetString(PyExc_TypeError,
8604 "* wants int");
8605 goto onError;
8606 }
8607 width = PyInt_AsLong(v);
8608 if (width < 0) {
8609 flags |= F_LJUST;
8610 width = -width;
8611 }
8612 if (--fmtcnt >= 0)
8613 c = *fmt++;
8614 }
8615 else if (c >= '0' && c <= '9') {
8616 width = c - '0';
8617 while (--fmtcnt >= 0) {
8618 c = *fmt++;
8619 if (c < '0' || c > '9')
8620 break;
8621 if ((width*10) / 10 != width) {
8622 PyErr_SetString(PyExc_ValueError,
8623 "width too big");
8624 goto onError;
8625 }
8626 width = width*10 + (c - '0');
8627 }
8628 }
8629 if (c == '.') {
8630 prec = 0;
8631 if (--fmtcnt >= 0)
8632 c = *fmt++;
8633 if (c == '*') {
8634 v = getnextarg(args, arglen, &argidx);
8635 if (v == NULL)
8636 goto onError;
8637 if (!PyInt_Check(v)) {
8638 PyErr_SetString(PyExc_TypeError,
8639 "* wants int");
8640 goto onError;
8641 }
8642 prec = PyInt_AsLong(v);
8643 if (prec < 0)
8644 prec = 0;
8645 if (--fmtcnt >= 0)
8646 c = *fmt++;
8647 }
8648 else if (c >= '0' && c <= '9') {
8649 prec = c - '0';
8650 while (--fmtcnt >= 0) {
8651 c = Py_CHARMASK(*fmt++);
8652 if (c < '0' || c > '9')
8653 break;
8654 if ((prec*10) / 10 != prec) {
8655 PyErr_SetString(PyExc_ValueError,
8656 "prec too big");
8657 goto onError;
8658 }
8659 prec = prec*10 + (c - '0');
8660 }
8661 }
8662 } /* prec */
8663 if (fmtcnt >= 0) {
8664 if (c == 'h' || c == 'l' || c == 'L') {
8665 if (--fmtcnt >= 0)
8666 c = *fmt++;
8667 }
8668 }
8669 if (fmtcnt < 0) {
8670 PyErr_SetString(PyExc_ValueError,
8671 "incomplete format");
8672 goto onError;
8673 }
8674 if (c != '%') {
8675 v = getnextarg(args, arglen, &argidx);
8676 if (v == NULL)
8677 goto onError;
8678 }
8679 sign = 0;
8680 fill = ' ';
8681 switch (c) {
8682
8683 case '%':
8684 pbuf = formatbuf;
8685 /* presume that buffer length is at least 1 */
8686 pbuf[0] = '%';
8687 len = 1;
8688 break;
8689
8690 case 's':
8691 case 'r':
8692 if (PyUnicode_Check(v) && c == 's') {
8693 temp = v;
8694 Py_INCREF(temp);
8695 }
8696 else {
8697 PyObject *unicode;
8698 if (c == 's')
8699 temp = PyObject_Unicode(v);
8700 else
8701 temp = PyObject_Repr(v);
8702 if (temp == NULL)
8703 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008704 if (PyUnicode_Check(temp))
8705 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008706 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008707 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008708 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8709 PyString_GET_SIZE(temp),
8710 NULL,
8711 "strict");
8712 Py_DECREF(temp);
8713 temp = unicode;
8714 if (temp == NULL)
8715 goto onError;
8716 }
8717 else {
8718 Py_DECREF(temp);
8719 PyErr_SetString(PyExc_TypeError,
8720 "%s argument has non-string str()");
8721 goto onError;
8722 }
8723 }
8724 pbuf = PyUnicode_AS_UNICODE(temp);
8725 len = PyUnicode_GET_SIZE(temp);
8726 if (prec >= 0 && len > prec)
8727 len = prec;
8728 break;
8729
8730 case 'i':
8731 case 'd':
8732 case 'u':
8733 case 'o':
8734 case 'x':
8735 case 'X':
8736 if (c == 'i')
8737 c = 'd';
8738 isnumok = 0;
8739 if (PyNumber_Check(v)) {
8740 PyObject *iobj=NULL;
8741
8742 if (PyInt_Check(v) || (PyLong_Check(v))) {
8743 iobj = v;
8744 Py_INCREF(iobj);
8745 }
8746 else {
8747 iobj = PyNumber_Int(v);
8748 if (iobj==NULL) iobj = PyNumber_Long(v);
8749 }
8750 if (iobj!=NULL) {
8751 if (PyInt_Check(iobj)) {
8752 isnumok = 1;
8753 pbuf = formatbuf;
8754 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8755 flags, prec, c, iobj);
8756 Py_DECREF(iobj);
8757 if (len < 0)
8758 goto onError;
8759 sign = 1;
8760 }
8761 else if (PyLong_Check(iobj)) {
8762 isnumok = 1;
8763 temp = formatlong(iobj, flags, prec, c);
8764 Py_DECREF(iobj);
8765 if (!temp)
8766 goto onError;
8767 pbuf = PyUnicode_AS_UNICODE(temp);
8768 len = PyUnicode_GET_SIZE(temp);
8769 sign = 1;
8770 }
8771 else {
8772 Py_DECREF(iobj);
8773 }
8774 }
8775 }
8776 if (!isnumok) {
8777 PyErr_Format(PyExc_TypeError,
8778 "%%%c format: a number is required, "
8779 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8780 goto onError;
8781 }
8782 if (flags & F_ZERO)
8783 fill = '0';
8784 break;
8785
8786 case 'e':
8787 case 'E':
8788 case 'f':
8789 case 'F':
8790 case 'g':
8791 case 'G':
8792 if (c == 'F')
8793 c = 'f';
8794 pbuf = formatbuf;
8795 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8796 flags, prec, c, v);
8797 if (len < 0)
8798 goto onError;
8799 sign = 1;
8800 if (flags & F_ZERO)
8801 fill = '0';
8802 break;
8803
8804 case 'c':
8805 pbuf = formatbuf;
8806 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8807 if (len < 0)
8808 goto onError;
8809 break;
8810
8811 default:
8812 PyErr_Format(PyExc_ValueError,
8813 "unsupported format character '%c' (0x%x) "
8814 "at index %zd",
8815 (31<=c && c<=126) ? (char)c : '?',
8816 (int)c,
8817 (Py_ssize_t)(fmt - 1 -
8818 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008819 goto onError;
8820 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008821 if (sign) {
8822 if (*pbuf == '-' || *pbuf == '+') {
8823 sign = *pbuf++;
8824 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008825 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008826 else if (flags & F_SIGN)
8827 sign = '+';
8828 else if (flags & F_BLANK)
8829 sign = ' ';
8830 else
8831 sign = 0;
8832 }
8833 if (width < len)
8834 width = len;
8835 if (rescnt - (sign != 0) < width) {
8836 reslen -= rescnt;
8837 rescnt = width + fmtcnt + 100;
8838 reslen += rescnt;
8839 if (reslen < 0) {
8840 Py_XDECREF(temp);
8841 PyErr_NoMemory();
8842 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008843 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008844 if (_PyUnicode_Resize(&result, reslen) < 0) {
8845 Py_XDECREF(temp);
8846 goto onError;
8847 }
8848 res = PyUnicode_AS_UNICODE(result)
8849 + reslen - rescnt;
8850 }
8851 if (sign) {
8852 if (fill != ' ')
8853 *res++ = sign;
8854 rescnt--;
8855 if (width > len)
8856 width--;
8857 }
8858 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8859 assert(pbuf[0] == '0');
8860 assert(pbuf[1] == c);
8861 if (fill != ' ') {
8862 *res++ = *pbuf++;
8863 *res++ = *pbuf++;
8864 }
8865 rescnt -= 2;
8866 width -= 2;
8867 if (width < 0)
8868 width = 0;
8869 len -= 2;
8870 }
8871 if (width > len && !(flags & F_LJUST)) {
8872 do {
8873 --rescnt;
8874 *res++ = fill;
8875 } while (--width > len);
8876 }
8877 if (fill == ' ') {
8878 if (sign)
8879 *res++ = sign;
8880 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8881 assert(pbuf[0] == '0');
8882 assert(pbuf[1] == c);
8883 *res++ = *pbuf++;
8884 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008885 }
8886 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008887 Py_UNICODE_COPY(res, pbuf, len);
8888 res += len;
8889 rescnt -= len;
8890 while (--width >= len) {
8891 --rescnt;
8892 *res++ = ' ';
8893 }
8894 if (dict && (argidx < arglen) && c != '%') {
8895 PyErr_SetString(PyExc_TypeError,
8896 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008897 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008898 goto onError;
8899 }
8900 Py_XDECREF(temp);
8901 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 } /* until end */
8903 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008904 PyErr_SetString(PyExc_TypeError,
8905 "not all arguments converted during string formatting");
8906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 }
8908
Thomas Woutersa96affe2006-03-12 00:29:36 +00008909 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008910 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008912 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913 }
8914 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 return (PyObject *)result;
8916
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008917 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 Py_XDECREF(result);
8919 Py_DECREF(uformat);
8920 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008921 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 }
8923 return NULL;
8924}
8925
8926static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008927 (readbufferproc) unicode_buffer_getreadbuf,
8928 (writebufferproc) unicode_buffer_getwritebuf,
8929 (segcountproc) unicode_buffer_getsegcount,
8930 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931};
8932
Jeremy Hylton938ace62002-07-17 16:30:39 +00008933static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008934unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8935
Tim Peters6d6c1a32001-08-02 04:15:00 +00008936static PyObject *
8937unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8938{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008939 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008940 static char *kwlist[] = {"string", "encoding", "errors", 0};
8941 char *encoding = NULL;
8942 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008943
Benjamin Peterson857ce152009-01-31 16:29:18 +00008944 if (type != &PyUnicode_Type)
8945 return unicode_subtype_new(type, args, kwds);
8946 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008947 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008948 return NULL;
8949 if (x == NULL)
8950 return (PyObject *)_PyUnicode_New(0);
8951 if (encoding == NULL && errors == NULL)
8952 return PyObject_Unicode(x);
8953 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008954 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008955}
8956
Guido van Rossume023fe02001-08-30 03:12:59 +00008957static PyObject *
8958unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8959{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008960 PyUnicodeObject *tmp, *pnew;
8961 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008962
Benjamin Peterson857ce152009-01-31 16:29:18 +00008963 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8964 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8965 if (tmp == NULL)
8966 return NULL;
8967 assert(PyUnicode_Check(tmp));
8968 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8969 if (pnew == NULL) {
8970 Py_DECREF(tmp);
8971 return NULL;
8972 }
8973 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8974 if (pnew->str == NULL) {
8975 _Py_ForgetReference((PyObject *)pnew);
8976 PyObject_Del(pnew);
8977 Py_DECREF(tmp);
8978 return PyErr_NoMemory();
8979 }
8980 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8981 pnew->length = n;
8982 pnew->hash = tmp->hash;
8983 Py_DECREF(tmp);
8984 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008985}
8986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008987PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008988 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008989\n\
8990Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008991encoding defaults to the current default string encoding.\n\
8992errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008993
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008995 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008996 "unicode", /* tp_name */
8997 sizeof(PyUnicodeObject), /* tp_size */
8998 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00009000 (destructor)unicode_dealloc, /* tp_dealloc */
9001 0, /* tp_print */
9002 0, /* tp_getattr */
9003 0, /* tp_setattr */
9004 0, /* tp_compare */
9005 unicode_repr, /* tp_repr */
9006 &unicode_as_number, /* tp_as_number */
9007 &unicode_as_sequence, /* tp_as_sequence */
9008 &unicode_as_mapping, /* tp_as_mapping */
9009 (hashfunc) unicode_hash, /* tp_hash*/
9010 0, /* tp_call*/
9011 (reprfunc) unicode_str, /* tp_str */
9012 PyObject_GenericGetAttr, /* tp_getattro */
9013 0, /* tp_setattro */
9014 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009015 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009016 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00009017 unicode_doc, /* tp_doc */
9018 0, /* tp_traverse */
9019 0, /* tp_clear */
9020 PyUnicode_RichCompare, /* tp_richcompare */
9021 0, /* tp_weaklistoffset */
9022 0, /* tp_iter */
9023 0, /* tp_iternext */
9024 unicode_methods, /* tp_methods */
9025 0, /* tp_members */
9026 0, /* tp_getset */
9027 &PyBaseString_Type, /* tp_base */
9028 0, /* tp_dict */
9029 0, /* tp_descr_get */
9030 0, /* tp_descr_set */
9031 0, /* tp_dictoffset */
9032 0, /* tp_init */
9033 0, /* tp_alloc */
9034 unicode_new, /* tp_new */
9035 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036};
9037
9038/* Initialize the Unicode implementation */
9039
Thomas Wouters78890102000-07-22 19:25:51 +00009040void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009042 int i;
9043
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009044 /* XXX - move this array to unicodectype.c ? */
9045 Py_UNICODE linebreak[] = {
9046 0x000A, /* LINE FEED */
9047 0x000D, /* CARRIAGE RETURN */
9048 0x001C, /* FILE SEPARATOR */
9049 0x001D, /* GROUP SEPARATOR */
9050 0x001E, /* RECORD SEPARATOR */
9051 0x0085, /* NEXT LINE */
9052 0x2028, /* LINE SEPARATOR */
9053 0x2029, /* PARAGRAPH SEPARATOR */
9054 };
9055
Fred Drakee4315f52000-05-09 19:53:39 +00009056 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009057 free_list = NULL;
9058 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009060 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009061 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009062
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009063 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009064 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009065 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009066 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009067 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009068
9069 /* initialize the linebreak bloom filter */
9070 bloom_linebreak = make_bloom_mask(
9071 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9072 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009073
9074 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075}
9076
9077/* Finalize the Unicode implementation */
9078
Christian Heimes3b718a72008-02-14 12:47:33 +00009079int
9080PyUnicode_ClearFreeList(void)
9081{
9082 int freelist_size = numfree;
9083 PyUnicodeObject *u;
9084
9085 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009086 PyUnicodeObject *v = u;
9087 u = *(PyUnicodeObject **)u;
9088 if (v->str)
9089 PyObject_DEL(v->str);
9090 Py_XDECREF(v->defenc);
9091 PyObject_Del(v);
9092 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009093 }
9094 free_list = NULL;
9095 assert(numfree == 0);
9096 return freelist_size;
9097}
9098
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099void
Thomas Wouters78890102000-07-22 19:25:51 +00009100_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009102 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009104 Py_XDECREF(unicode_empty);
9105 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009107 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009108 if (unicode_latin1[i]) {
9109 Py_DECREF(unicode_latin1[i]);
9110 unicode_latin1[i] = NULL;
9111 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009112 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009113 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009115
Anthony Baxterac6bd462006-04-13 02:06:09 +00009116#ifdef __cplusplus
9117}
9118#endif
9119
9120
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009121/*
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009122 Local variables:
9123 c-basic-offset: 4
9124 indent-tabs-mode: nil
9125 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009126*/