blob: 107ed295656e83cf13c4027178ea4c60f9a559c0 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Maximum number of Unicode objects kept on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their length is below this limit.  This saves
   malloc() overhead when small Unicode objects are recycled.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 when ASCII code i counts as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Lookup table for fast detection of line breaks: entry i is 1 when
   ASCII code i terminates a line. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the least
   significant 5 bits of each character select the bit index. */

/* Per the original note, the linebreak mask is set up by the Unicode
   initialization code further down in this file. */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.  The shifted
   constant is an unsigned long: shifting a plain int 1 by 31 overflows
   (undefined behaviour) and, once converted to BLOOM_MASK, would also
   match unrelated high bits.  Both arguments are parenthesized so that
   arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when ch is a line break: ASCII characters are answered from
   the ascii_linebreak table, everything else is pre-filtered through
   the bloom mask before asking Py_UNICODE_ISLINEBREAK().  Note that ch
   is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Non-zero when chr is a member of the setlen characters at set; the
   bloom mask is consulted first so that most non-members are rejected
   without scanning the set.  The expansion is fully parenthesized so
   that the macro can be negated or combined with other operators.
   Note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson857ce152009-01-31 16:29:18 +0000247 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
283/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000284 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285
286 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000287 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288
289*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
319 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 PyErr_NoMemory();
338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000443 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson857ce152009-01-31 16:29:18 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
Mark Dickinson6b265f12009-03-18 16:07:26 +0000532#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533# define CONVERT_WCHAR_TO_SURROGATES
534#endif
535
536#ifdef CONVERT_WCHAR_TO_SURROGATES
537
538/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
540
541PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
543{
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
548
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
552 }
553
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
560 }
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
565
566 /* Copy the wchar_t data into the new object */
567 {
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
576 }
577 else
578 *u++ = *w++;
579 }
580 }
581 return (PyObject *)unicode;
582}
583
584#else
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000587 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588{
589 PyUnicodeObject *unicode;
590
591 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000592 PyErr_BadInternalCall();
593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 }
595
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
599
600 /* Copy the wchar_t data into the new object */
601#ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000603#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 }
611#endif
612
613 return (PyObject *)unicode;
614}
615
Mark Dickinson6b265f12009-03-18 16:07:26 +0000616#endif /* CONVERT_WCHAR_TO_SURROGATES */
617
618#undef CONVERT_WCHAR_TO_SURROGATES
619
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000620static void
621makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
622{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
628 }
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
637 }
638 *fmt++ = c;
639 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640}
641
/* Append the 0-terminated string to the output: each character is
   stored through s, which is advanced past it.  Relies on variables
   named copy and s being in scope at the point of use. */
#define appendstring(string)                                    \
    {for (copy = string; *copy; ) *s++ = *copy++;}
643
644PyObject *
645PyUnicode_FromFormatV(const char *format, va_list vargs)
646{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000667
668#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000669 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000670#else
671#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000672 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000673#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000675#endif
676#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000677 /* step 1: count the number of %S/%R format specifications
678 * (we call PyObject_Str()/PyObject_Repr() for these objects
679 * once during step 3 and put the result in an array) */
680 for (f = format; *f; f++) {
681 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
682 ++callcount;
683 }
684 /* step 2: allocate memory for the results of
685 * PyObject_Str()/PyObject_Repr() calls */
686 if (callcount) {
687 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
688 if (!callresults) {
689 PyErr_NoMemory();
690 return NULL;
691 }
692 callresult = callresults;
693 }
694 /* step 3: figure out how large a buffer we need */
695 for (f = format; *f; f++) {
696 if (*f == '%') {
697 const char* p = f;
698 width = 0;
699 while (isdigit((unsigned)*f))
700 width = (width*10) + *f++ - '0';
701 while (*++f && *f != '%' && !isalpha((unsigned)*f))
702 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000703
Benjamin Peterson857ce152009-01-31 16:29:18 +0000704 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
705 * they don't affect the amount of space we reserve.
706 */
707 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000708 (f[1] == 'd' || f[1] == 'u'))
709 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000710
Benjamin Peterson857ce152009-01-31 16:29:18 +0000711 switch (*f) {
712 case 'c':
713 (void)va_arg(count, int);
714 /* fall through... */
715 case '%':
716 n++;
717 break;
718 case 'd': case 'u': case 'i': case 'x':
719 (void) va_arg(count, int);
720 /* 20 bytes is enough to hold a 64-bit
721 integer. Decimal takes the most space.
722 This isn't enough for octal.
723 If a width is specified we need more
724 (which we allocate later). */
725 if (width < 20)
726 width = 20;
727 n += width;
728 if (abuffersize < width)
729 abuffersize = width;
730 break;
731 case 's':
732 {
733 /* UTF-8 */
734 unsigned char*s;
735 s = va_arg(count, unsigned char*);
736 while (*s) {
737 if (*s < 128) {
738 n++; s++;
739 } else if (*s < 0xc0) {
740 /* invalid UTF-8 */
741 n++; s++;
742 } else if (*s < 0xc0) {
743 n++;
744 s++; if(!*s)break;
745 s++;
746 } else if (*s < 0xe0) {
747 n++;
748 s++; if(!*s)break;
749 s++; if(!*s)break;
750 s++;
751 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000752#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000753 n++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000754#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000755 n+=2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000756#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000757 s++; if(!*s)break;
758 s++; if(!*s)break;
759 s++; if(!*s)break;
760 s++;
761 }
762 }
763 break;
764 }
765 case 'U':
766 {
767 PyObject *obj = va_arg(count, PyObject *);
768 assert(obj && PyUnicode_Check(obj));
769 n += PyUnicode_GET_SIZE(obj);
770 break;
771 }
772 case 'V':
773 {
774 PyObject *obj = va_arg(count, PyObject *);
775 const char *str = va_arg(count, const char *);
776 assert(obj || str);
777 assert(!obj || PyUnicode_Check(obj));
778 if (obj)
779 n += PyUnicode_GET_SIZE(obj);
780 else
781 n += strlen(str);
782 break;
783 }
784 case 'S':
785 {
786 PyObject *obj = va_arg(count, PyObject *);
787 PyObject *str;
788 assert(obj);
789 str = PyObject_Str(obj);
790 if (!str)
791 goto fail;
792 n += PyUnicode_GET_SIZE(str);
793 /* Remember the str and switch to the next slot */
794 *callresult++ = str;
795 break;
796 }
797 case 'R':
798 {
799 PyObject *obj = va_arg(count, PyObject *);
800 PyObject *repr;
801 assert(obj);
802 repr = PyObject_Repr(obj);
803 if (!repr)
804 goto fail;
805 n += PyUnicode_GET_SIZE(repr);
806 /* Remember the repr and switch to the next slot */
807 *callresult++ = repr;
808 break;
809 }
810 case 'p':
811 (void) va_arg(count, int);
812 /* maximum 64-bit pointer representation:
813 * 0xffffffffffffffff
814 * so 19 characters is enough.
815 * XXX I count 18 -- what's the extra for?
816 */
817 n += 19;
818 break;
819 default:
820 /* if we stumble upon an unknown
821 formatting code, copy the rest of
822 the format string to the output
823 string. (we cannot just skip the
824 code, since there's no way to know
825 what's in the argument list) */
826 n += strlen(p);
827 goto expand;
828 }
829 } else
830 n++;
831 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000832 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000833 if (abuffersize > 20) {
834 abuffer = PyObject_Malloc(abuffersize);
835 if (!abuffer) {
836 PyErr_NoMemory();
837 goto fail;
838 }
839 realbuffer = abuffer;
840 }
841 else
842 realbuffer = buffer;
843 /* step 4: fill the buffer */
844 /* Since we've analyzed how much space we need for the worst case,
845 we don't have to resize the string.
846 There can be no errors beyond this point. */
847 string = PyUnicode_FromUnicode(NULL, n);
848 if (!string)
849 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000850
Benjamin Peterson857ce152009-01-31 16:29:18 +0000851 s = PyUnicode_AS_UNICODE(string);
852 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000853
Benjamin Peterson857ce152009-01-31 16:29:18 +0000854 for (f = format; *f; f++) {
855 if (*f == '%') {
856 const char* p = f++;
857 int longflag = 0;
858 int size_tflag = 0;
859 zeropad = (*f == '0');
860 /* parse the width.precision part */
861 width = 0;
862 while (isdigit((unsigned)*f))
863 width = (width*10) + *f++ - '0';
864 precision = 0;
865 if (*f == '.') {
866 f++;
867 while (isdigit((unsigned)*f))
868 precision = (precision*10) + *f++ - '0';
869 }
870 /* handle the long flag, but only for %ld and %lu.
871 others can be added when necessary. */
872 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
873 longflag = 1;
874 ++f;
875 }
876 /* handle the size_t flag. */
877 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
878 size_tflag = 1;
879 ++f;
880 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000881
Benjamin Peterson857ce152009-01-31 16:29:18 +0000882 switch (*f) {
883 case 'c':
884 *s++ = va_arg(vargs, int);
885 break;
886 case 'd':
887 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
888 if (longflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, long));
890 else if (size_tflag)
891 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
892 else
893 sprintf(realbuffer, fmt, va_arg(vargs, int));
894 appendstring(realbuffer);
895 break;
896 case 'u':
897 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
898 if (longflag)
899 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
900 else if (size_tflag)
901 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
902 else
903 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
904 appendstring(realbuffer);
905 break;
906 case 'i':
907 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
908 sprintf(realbuffer, fmt, va_arg(vargs, int));
909 appendstring(realbuffer);
910 break;
911 case 'x':
912 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
913 sprintf(realbuffer, fmt, va_arg(vargs, int));
914 appendstring(realbuffer);
915 break;
916 case 's':
917 {
918 /* Parameter must be UTF-8 encoded.
919 In case of encoding errors, use
920 the replacement character. */
921 PyObject *u;
922 p = va_arg(vargs, char*);
923 u = PyUnicode_DecodeUTF8(p, strlen(p),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000924 "replace");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000925 if (!u)
926 goto fail;
927 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000928 PyUnicode_GET_SIZE(u));
Benjamin Peterson857ce152009-01-31 16:29:18 +0000929 s += PyUnicode_GET_SIZE(u);
930 Py_DECREF(u);
931 break;
932 }
933 case 'U':
934 {
935 PyObject *obj = va_arg(vargs, PyObject *);
936 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
937 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
938 s += size;
939 break;
940 }
941 case 'V':
942 {
943 PyObject *obj = va_arg(vargs, PyObject *);
944 const char *str = va_arg(vargs, const char *);
945 if (obj) {
946 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
947 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
948 s += size;
949 } else {
950 appendstring(str);
951 }
952 break;
953 }
954 case 'S':
955 case 'R':
956 {
957 Py_UNICODE *ucopy;
958 Py_ssize_t usize;
959 Py_ssize_t upos;
960 /* unused, since we already have the result */
961 (void) va_arg(vargs, PyObject *);
962 ucopy = PyUnicode_AS_UNICODE(*callresult);
963 usize = PyUnicode_GET_SIZE(*callresult);
964 for (upos = 0; upos<usize;)
965 *s++ = ucopy[upos++];
966 /* We're done with the unicode()/repr() => forget it */
967 Py_DECREF(*callresult);
968 /* switch to next unicode()/repr() result */
969 ++callresult;
970 break;
971 }
972 case 'p':
973 sprintf(buffer, "%p", va_arg(vargs, void*));
974 /* %p is ill-defined: ensure leading 0x. */
975 if (buffer[1] == 'X')
976 buffer[1] = 'x';
977 else if (buffer[1] != 'x') {
978 memmove(buffer+2, buffer, strlen(buffer)+1);
979 buffer[0] = '0';
980 buffer[1] = 'x';
981 }
982 appendstring(buffer);
983 break;
984 case '%':
985 *s++ = '%';
986 break;
987 default:
988 appendstring(p);
989 goto end;
990 }
991 } else
992 *s++ = *f;
993 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000994
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000995 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000996 if (callresults)
997 PyObject_Free(callresults);
998 if (abuffer)
999 PyObject_Free(abuffer);
1000 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1001 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001002 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +00001003 if (callresults) {
1004 PyObject **callresult2 = callresults;
1005 while (callresult2 < callresult) {
1006 Py_DECREF(*callresult2);
1007 ++callresult2;
1008 }
1009 PyObject_Free(callresults);
1010 }
1011 if (abuffer)
1012 PyObject_Free(abuffer);
1013 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001014}
1015
1016#undef appendstring
1017
1018PyObject *
1019PyUnicode_FromFormat(const char *format, ...)
1020{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001021 PyObject* ret;
1022 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023
1024#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001025 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001026#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001027 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001028#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001029 ret = PyUnicode_FromFormatV(format, vargs);
1030 va_end(vargs);
1031 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001032}
1033
Martin v. Löwis18e16552006-02-15 17:27:45 +00001034Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001035 wchar_t *w,
1036 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037{
1038 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001039 PyErr_BadInternalCall();
1040 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001042
1043 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001045 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001046
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047#ifdef HAVE_USABLE_WCHAR_T
1048 memcpy(w, unicode->str, size * sizeof(wchar_t));
1049#else
1050 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001051 register Py_UNICODE *u;
1052 register Py_ssize_t i;
1053 u = PyUnicode_AS_UNICODE(unicode);
1054 for (i = size; i > 0; i--)
1055 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 }
1057#endif
1058
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001059 if (size > PyUnicode_GET_SIZE(unicode))
1060 return PyUnicode_GET_SIZE(unicode);
1061 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001062 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063}
1064
1065#endif
1066
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001067PyObject *PyUnicode_FromOrdinal(int ordinal)
1068{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001069 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001070
1071#ifdef Py_UNICODE_WIDE
1072 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001073 PyErr_SetString(PyExc_ValueError,
1074 "unichr() arg not in range(0x110000) "
1075 "(wide Python build)");
1076 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077 }
1078#else
1079 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001080 PyErr_SetString(PyExc_ValueError,
1081 "unichr() arg not in range(0x10000) "
1082 "(narrow Python build)");
1083 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001084 }
1085#endif
1086
Hye-Shik Chang40574832004-04-06 07:24:51 +00001087 s[0] = (Py_UNICODE)ordinal;
1088 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001089}
1090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091PyObject *PyUnicode_FromObject(register PyObject *obj)
1092{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001094 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001096 Py_INCREF(obj);
1097 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001098 }
1099 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001100 /* For a Unicode subtype that's not a Unicode object,
1101 return a true Unicode object with the same data. */
1102 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1103 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001104 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1106}
1107
1108PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001109 const char *encoding,
1110 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001112 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001113 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001115
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001117 PyErr_BadInternalCall();
1118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001120
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001121#if 0
1122 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001123 that no encodings is given and then redirect to
1124 PyObject_Unicode() which then applies the additional logic for
1125 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001126
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001127 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001129
1130 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001131 if (PyUnicode_Check(obj)) {
1132 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001133 PyErr_SetString(PyExc_TypeError,
1134 "decoding Unicode is not supported");
1135 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001136 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001137 return PyObject_Unicode(obj);
1138 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001139#else
1140 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001141 PyErr_SetString(PyExc_TypeError,
1142 "decoding Unicode is not supported");
1143 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001144 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145#endif
1146
1147 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001148 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001149 s = PyString_AS_STRING(obj);
1150 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001151 }
Christian Heimes3497f942008-05-26 12:29:14 +00001152 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001153 /* Python 2.x specific */
1154 PyErr_Format(PyExc_TypeError,
1155 "decoding bytearray is not supported");
1156 return NULL;
1157 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001158 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001159 /* Overwrite the error message with something more useful in
1160 case of a TypeError. */
1161 if (PyErr_ExceptionMatches(PyExc_TypeError))
1162 PyErr_Format(PyExc_TypeError,
1163 "coercing to Unicode: need string or buffer, "
1164 "%.80s found",
1165 Py_TYPE(obj)->tp_name);
1166 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001167 }
Tim Petersced69f82003-09-16 20:30:58 +00001168
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001171 Py_INCREF(unicode_empty);
1172 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 }
Tim Petersced69f82003-09-16 20:30:58 +00001174 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001175 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001176
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001177 return v;
1178
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001179 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181}
1182
1183PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001184 Py_ssize_t size,
1185 const char *encoding,
1186 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187{
1188 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001189
1190 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001191 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001192
1193 /* Shortcuts for common default encodings */
1194 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001196 else if (strcmp(encoding, "latin-1") == 0)
1197 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001198#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1199 else if (strcmp(encoding, "mbcs") == 0)
1200 return PyUnicode_DecodeMBCS(s, size, errors);
1201#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001202 else if (strcmp(encoding, "ascii") == 0)
1203 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204
1205 /* Decode via the codec registry */
1206 buffer = PyBuffer_FromMemory((void *)s, size);
1207 if (buffer == NULL)
1208 goto onError;
1209 unicode = PyCodec_Decode(buffer, encoding, errors);
1210 if (unicode == NULL)
1211 goto onError;
1212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001214 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001215 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 Py_DECREF(unicode);
1217 goto onError;
1218 }
1219 Py_DECREF(buffer);
1220 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001221
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001222 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 Py_XDECREF(buffer);
1224 return NULL;
1225}
1226
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001227PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1228 const char *encoding,
1229 const char *errors)
1230{
1231 PyObject *v;
1232
1233 if (!PyUnicode_Check(unicode)) {
1234 PyErr_BadArgument();
1235 goto onError;
1236 }
1237
1238 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001239 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001240
1241 /* Decode via the codec registry */
1242 v = PyCodec_Decode(unicode, encoding, errors);
1243 if (v == NULL)
1244 goto onError;
1245 return v;
1246
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001247 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001248 return NULL;
1249}
1250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001252 Py_ssize_t size,
1253 const char *encoding,
1254 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255{
1256 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001257
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 unicode = PyUnicode_FromUnicode(s, size);
1259 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1262 Py_DECREF(unicode);
1263 return v;
1264}
1265
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001266PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1267 const char *encoding,
1268 const char *errors)
1269{
1270 PyObject *v;
1271
1272 if (!PyUnicode_Check(unicode)) {
1273 PyErr_BadArgument();
1274 goto onError;
1275 }
1276
1277 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001278 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001279
1280 /* Encode via the codec registry */
1281 v = PyCodec_Encode(unicode, encoding, errors);
1282 if (v == NULL)
1283 goto onError;
1284 return v;
1285
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001286 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001287 return NULL;
1288}
1289
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1291 const char *encoding,
1292 const char *errors)
1293{
1294 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001295
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 if (!PyUnicode_Check(unicode)) {
1297 PyErr_BadArgument();
1298 goto onError;
1299 }
Fred Drakee4315f52000-05-09 19:53:39 +00001300
Tim Petersced69f82003-09-16 20:30:58 +00001301 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001302 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001303
1304 /* Shortcuts for common default encodings */
1305 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001306 if (strcmp(encoding, "utf-8") == 0)
1307 return PyUnicode_AsUTF8String(unicode);
1308 else if (strcmp(encoding, "latin-1") == 0)
1309 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001310#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001311 else if (strcmp(encoding, "mbcs") == 0)
1312 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001313#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001314 else if (strcmp(encoding, "ascii") == 0)
1315 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317
1318 /* Encode via the codec registry */
1319 v = PyCodec_Encode(unicode, encoding, errors);
1320 if (v == NULL)
1321 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001322 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001324 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001325 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326 Py_DECREF(v);
1327 goto onError;
1328 }
1329 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001330
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001331 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 return NULL;
1333}
1334
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001335PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001336 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001337{
1338 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1339
1340 if (v)
1341 return v;
1342 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1343 if (v && errors == NULL)
1344 ((PyUnicodeObject *)unicode)->defenc = v;
1345 return v;
1346}
1347
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1349{
1350 if (!PyUnicode_Check(unicode)) {
1351 PyErr_BadArgument();
1352 goto onError;
1353 }
1354 return PyUnicode_AS_UNICODE(unicode);
1355
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001356 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 return NULL;
1358}
1359
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361{
1362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 goto onError;
1365 }
1366 return PyUnicode_GET_SIZE(unicode);
1367
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001368 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 return -1;
1370}
1371
Thomas Wouters78890102000-07-22 19:25:51 +00001372const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001373{
1374 return unicode_default_encoding;
1375}
1376
1377int PyUnicode_SetDefaultEncoding(const char *encoding)
1378{
1379 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001380
Fred Drakee4315f52000-05-09 19:53:39 +00001381 /* Make sure the encoding is valid. As side effect, this also
1382 loads the encoding into the codec registry cache. */
1383 v = _PyCodec_Lookup(encoding);
1384 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001385 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001386 Py_DECREF(v);
1387 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001388 encoding,
1389 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001390 return 0;
1391
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001392 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001393 return -1;
1394}
1395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396/* error handling callback helper:
1397 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001398 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 and adjust various state variables.
1400 return 0 on success, -1 on error
1401*/
1402
1403static
1404int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001405 const char *encoding, const char *reason,
1406 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1407 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1408 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001410 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001411
1412 PyObject *restuple = NULL;
1413 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001414 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1415 Py_ssize_t requiredsize;
1416 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001418 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001419 int res = -1;
1420
1421 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001422 *errorHandler = PyCodec_LookupError(errors);
1423 if (*errorHandler == NULL)
1424 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 }
1426
1427 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001428 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 encoding, input, insize, *startinpos, *endinpos, reason);
1430 if (*exceptionObject == NULL)
1431 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 }
1433 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001434 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1435 goto onError;
1436 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1437 goto onError;
1438 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 }
1441
1442 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1443 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001446 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 }
1449 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001450 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001451 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001452 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001453 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001454 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1455 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001456 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001457
1458 /* need more space? (at least enough for what we
1459 have+the replacement+the rest of the string (starting
1460 at the new input position), so we won't have to check space
1461 when there are no errors in the rest of the string) */
1462 repptr = PyUnicode_AS_UNICODE(repunicode);
1463 repsize = PyUnicode_GET_SIZE(repunicode);
1464 requiredsize = *outpos + repsize + insize-newpos;
1465 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001466 if (requiredsize<2*outsize)
1467 requiredsize = 2*outsize;
1468 if (_PyUnicode_Resize(output, requiredsize) < 0)
1469 goto onError;
1470 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001471 }
1472 *endinpos = newpos;
1473 *inptr = input + newpos;
1474 Py_UNICODE_COPY(*outptr, repptr, repsize);
1475 *outptr += repsize;
1476 *outpos += repsize;
1477 /* we made it! */
1478 res = 0;
1479
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 Py_XDECREF(restuple);
1482 return res;
1483}
1484
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001485/* --- UTF-7 Codec -------------------------------------------------------- */
1486
1487/* see RFC2152 for details */
1488
static
char utf7_special[128] = {
    /* Classification of each 7-bit character for the UTF-7 encoder,
       i.e. whether it can be written directly or must be encoded:
         0 - not special
         1 - special
         2 - whitespace (optional)
         3 - RFC2152 Set O (optional)
       One row per 16 code points. */

    /* 0x00-0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1507
/* Helper macros for the UTF-7 codec.

   All of them are plain textual macros: their parameters are
   parenthesized so that expression arguments expand with the intended
   precedence, but arguments may still be evaluated more than once, so
   pass side-effect free expressions. */

/* SPECIAL(c, encodeO, encodeWS): true when character c must not be
   written directly, i.e. it lies outside 1..127, is marked 1 in
   utf7_special, or is whitespace (2) / Set O (3) while the matching
   encodeWS / encodeO flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||        \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet. */
#define B64CHAR(c)                              \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of base64 character c ('A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, '/' -> 63).  The
   result is only meaningful when B64CHAR(c) holds. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 of the `bits` pending bits in
   the accumulator `ch` remain, write the topmost 6 as one base64
   character through `out` (which is advanced) and decrease `bits`. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 of the `bits`
   pending bits in the accumulator `ch` remain, extract the topmost 16
   as one Py_UNICODE and write it through `out` (which is advanced).
   A code unit in 0xDC00..0xDFFF sets `surrogate`, stores a message in
   the variable `errmsg` and jumps to the label `utf7Error`; both must
   exist in the function that expands this macro.  While `surrogate` is
   set, the next code unit is dropped and the flag is cleared. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001552 Py_ssize_t size,
1553 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001555 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1556}
1557
1558PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001559 Py_ssize_t size,
1560 const char *errors,
1561 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001563 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t startinpos;
1565 Py_ssize_t endinpos;
1566 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 const char *e;
1568 PyUnicodeObject *unicode;
1569 Py_UNICODE *p;
1570 const char *errmsg = "";
1571 int inShift = 0;
1572 unsigned int bitsleft = 0;
1573 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001574 int surrogate = 0;
1575 PyObject *errorHandler = NULL;
1576 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577
1578 unicode = _PyUnicode_New(size);
1579 if (!unicode)
1580 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001581 if (size == 0) {
1582 if (consumed)
1583 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001585 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586
1587 p = unicode->str;
1588 e = s + size;
1589
1590 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001591 Py_UNICODE ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001592 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001593 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594
1595 if (inShift) {
1596 if ((ch == '-') || !B64CHAR(ch)) {
1597 inShift = 0;
1598 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001599
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001600 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1601 if (bitsleft >= 6) {
1602 /* The shift sequence has a partial character in it. If
1603 bitsleft < 6 then we could just classify it as padding
1604 but that is not the case here */
1605
1606 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001607 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608 }
1609 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001610 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 here so indicate the potential of a misencoded character. */
1612
1613 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1614 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1615 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001616 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 }
1618
1619 if (ch == '-') {
1620 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001621 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 inShift = 1;
1623 }
1624 } else if (SPECIAL(ch,0,0)) {
1625 errmsg = "unexpected special character";
Benjamin Peterson857ce152009-01-31 16:29:18 +00001626 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 } else {
1628 *p++ = ch;
1629 }
1630 } else {
1631 charsleft = (charsleft << 6) | UB64(ch);
1632 bitsleft += 6;
1633 s++;
1634 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1635 }
1636 }
1637 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001639 s++;
1640 if (s < e && *s == '-') {
1641 s++;
1642 *p++ = '+';
1643 } else
1644 {
1645 inShift = 1;
1646 bitsleft = 0;
1647 }
1648 }
1649 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001650 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 errmsg = "unexpected special character";
1652 s++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001653 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 }
1655 else {
1656 *p++ = ch;
1657 s++;
1658 }
1659 continue;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001660 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661 outpos = p-PyUnicode_AS_UNICODE(unicode);
1662 endinpos = s-starts;
1663 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001664 errors, &errorHandler,
1665 "utf7", errmsg,
1666 starts, size, &startinpos, &endinpos, &exc, &s,
1667 &unicode, &outpos, &p))
1668 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 }
1670
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001671 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 outpos = p-PyUnicode_AS_UNICODE(unicode);
1673 endinpos = size;
1674 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001675 errors, &errorHandler,
1676 "utf7", "unterminated shift sequence",
1677 starts, size, &startinpos, &endinpos, &exc, &s,
1678 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 if (s < e)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001681 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001683 if (consumed) {
1684 if(inShift)
1685 *consumed = startinpos;
1686 else
1687 *consumed = s-starts;
1688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001690 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 goto onError;
1692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 Py_XDECREF(errorHandler);
1694 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 return (PyObject *)unicode;
1696
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001697 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 Py_XDECREF(errorHandler);
1699 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700 Py_DECREF(unicode);
1701 return NULL;
1702}
1703
1704
1705PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001706 Py_ssize_t size,
1707 int encodeSetO,
1708 int encodeWhiteSpace,
1709 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001710{
1711 PyObject *v;
1712 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001713 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001715 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 unsigned int bitsleft = 0;
1717 unsigned long charsleft = 0;
1718 char * out;
1719 char * start;
1720
Neal Norwitze7d8be82008-07-31 17:17:14 +00001721 if (cbAllocated / 5 != size)
1722 return PyErr_NoMemory();
1723
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001725 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001727 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728 if (v == NULL)
1729 return NULL;
1730
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001731 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 for (;i < size; ++i) {
1733 Py_UNICODE ch = s[i];
1734
1735 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001736 if (ch == '+') {
1737 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 *out++ = '-';
1739 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1740 charsleft = ch;
1741 bitsleft = 16;
1742 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001743 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001745 } else {
1746 *out++ = (char) ch;
1747 }
1748 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1750 *out++ = B64(charsleft << (6-bitsleft));
1751 charsleft = 0;
1752 bitsleft = 0;
1753 /* Characters not in the BASE64 set implicitly unshift the sequence
1754 so no '-' is required, except if the character is itself a '-' */
1755 if (B64CHAR(ch) || ch == '-') {
1756 *out++ = '-';
1757 }
1758 inShift = 0;
1759 *out++ = (char) ch;
1760 } else {
1761 bitsleft += 16;
1762 charsleft = (charsleft << 16) | ch;
1763 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1764
Mark Dickinson3e4caeb2009-02-21 20:27:01 +00001765 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001766 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 or '-' then the shift sequence will be terminated implicitly and we
1768 don't have to insert a '-'. */
1769
1770 if (bitsleft == 0) {
1771 if (i + 1 < size) {
1772 Py_UNICODE ch2 = s[i+1];
1773
1774 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001775
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 } else if (B64CHAR(ch2) || ch2 == '-') {
1777 *out++ = '-';
1778 inShift = 0;
1779 } else {
1780 inShift = 0;
1781 }
1782
1783 }
1784 else {
1785 *out++ = '-';
1786 inShift = 0;
1787 }
1788 }
Tim Petersced69f82003-09-16 20:30:58 +00001789 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001791 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792 if (bitsleft) {
1793 *out++= B64(charsleft << (6-bitsleft) );
1794 *out++ = '-';
1795 }
1796
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001797 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001798 return v;
1799}
1800
/* The helper macros above are private to the UTF-7 codec; retire them so
   that they cannot leak into the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1807
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808/* --- UTF-8 Codec -------------------------------------------------------- */
1809
/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it introduces.  Zero means illegal prefix.  See RFC 2279 for
   details.

   The table is only ever read, so it is const-qualified. */
static const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six bytes; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1831
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001833 Py_ssize_t size,
1834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835{
Walter Dörwald69652032004-09-07 20:24:22 +00001836 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1837}
1838
1839PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001840 Py_ssize_t size,
1841 const char *errors,
1842 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001846 Py_ssize_t startinpos;
1847 Py_ssize_t endinpos;
1848 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 const char *e;
1850 PyUnicodeObject *unicode;
1851 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853 PyObject *errorHandler = NULL;
1854 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
1856 /* Note: size will always be longer than the resulting Unicode
1857 character count */
1858 unicode = _PyUnicode_New(size);
1859 if (!unicode)
1860 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001861 if (size == 0) {
1862 if (consumed)
1863 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866
1867 /* Unpack UTF-8 encoded data */
1868 p = unicode->str;
1869 e = s + size;
1870
1871 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001872 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001875 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 s++;
1877 continue;
1878 }
1879
1880 n = utf8_code_length[ch];
1881
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001882 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001883 if (consumed)
1884 break;
1885 else {
1886 errmsg = "unexpected end of data";
1887 startinpos = s-starts;
1888 endinpos = size;
1889 goto utf8Error;
1890 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892
1893 switch (n) {
1894
1895 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001897 startinpos = s-starts;
1898 endinpos = startinpos+1;
1899 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900
1901 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001902 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001903 startinpos = s-starts;
1904 endinpos = startinpos+1;
1905 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906
1907 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001908 if ((s[1] & 0xc0) != 0x80) {
1909 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001910 startinpos = s-starts;
1911 endinpos = startinpos+2;
1912 goto utf8Error;
1913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001915 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001916 startinpos = s-starts;
1917 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001918 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001919 goto utf8Error;
1920 }
1921 else
1922 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 break;
1924
1925 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001926 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001927 (s[2] & 0xc0) != 0x80) {
1928 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001929 startinpos = s-starts;
1930 endinpos = startinpos+3;
1931 goto utf8Error;
1932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001934 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001935 /* Note: UTF-8 encodings of surrogates are considered
1936 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001937
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001938 XXX For wide builds (UCS-4) we should probably try
1939 to recombine the surrogates into a single code
1940 unit.
1941 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001942 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001943 startinpos = s-starts;
1944 endinpos = startinpos+3;
1945 goto utf8Error;
1946 }
1947 else
1948 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001949 break;
1950
1951 case 4:
1952 if ((s[1] & 0xc0) != 0x80 ||
1953 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001954 (s[3] & 0xc0) != 0x80) {
1955 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001956 startinpos = s-starts;
1957 endinpos = startinpos+4;
1958 goto utf8Error;
1959 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001960 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001962 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001963 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001964 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001965 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001966 UTF-16 */
1967 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001968 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 startinpos = s-starts;
1970 endinpos = startinpos+4;
1971 goto utf8Error;
1972 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001973#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001974 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001975#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001976 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001977
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001978 /* translate from 10000..10FFFF to 0..FFFF */
1979 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001980
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001981 /* high surrogate = top 10 bits added to D800 */
1982 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001983
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001984 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001985 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001986#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 break;
1988
1989 default:
1990 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001991 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001992 startinpos = s-starts;
1993 endinpos = startinpos+n;
1994 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 }
1996 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001997 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001998
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001999 utf8Error:
2000 outpos = p-PyUnicode_AS_UNICODE(unicode);
2001 if (unicode_decode_call_errorhandler(
2002 errors, &errorHandler,
2003 "utf8", errmsg,
2004 starts, size, &startinpos, &endinpos, &exc, &s,
2005 &unicode, &outpos, &p))
2006 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 }
Walter Dörwald69652032004-09-07 20:24:22 +00002008 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002009 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010
2011 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002012 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 goto onError;
2014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002015 Py_XDECREF(errorHandler);
2016 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 return (PyObject *)unicode;
2018
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002019 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 Py_XDECREF(errorHandler);
2021 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 Py_DECREF(unicode);
2023 return NULL;
2024}
2025
Tim Peters602f7402002-04-27 18:03:26 +00002026/* Allocation strategy: if the string is short, convert into a stack buffer
2027 and allocate exactly as much space needed at the end. Else allocate the
2028 maximum possible needed (4 result bytes per Unicode character), and return
2029 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002030*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002031PyObject *
2032PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002033 Py_ssize_t size,
2034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035{
Tim Peters602f7402002-04-27 18:03:26 +00002036#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002037
Martin v. Löwis18e16552006-02-15 17:27:45 +00002038 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002039 PyObject *v; /* result string object */
2040 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002041 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002042 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002043 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002044
Tim Peters602f7402002-04-27 18:03:26 +00002045 assert(s != NULL);
2046 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
Tim Peters602f7402002-04-27 18:03:26 +00002048 if (size <= MAX_SHORT_UNICHARS) {
2049 /* Write into the stack buffer; nallocated can't overflow.
2050 * At the end, we'll allocate exactly as much heap space as it
2051 * turns out we need.
2052 */
2053 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2054 v = NULL; /* will allocate after we're done */
2055 p = stackbuf;
2056 }
2057 else {
2058 /* Overallocate on the heap, and give the excess back at the end. */
2059 nallocated = size * 4;
2060 if (nallocated / 4 != size) /* overflow! */
2061 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002062 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002063 if (v == NULL)
2064 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002065 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002066 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002067
Tim Peters602f7402002-04-27 18:03:26 +00002068 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002069 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002070
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002072 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002074
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002076 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002077 *p++ = (char)(0xc0 | (ch >> 6));
2078 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002079 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002080 else {
Tim Peters602f7402002-04-27 18:03:26 +00002081 /* Encode UCS2 Unicode ordinals */
2082 if (ch < 0x10000) {
2083 /* Special case: check for high surrogate */
2084 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2085 Py_UCS4 ch2 = s[i];
2086 /* Check for low surrogate and combine the two to
2087 form a UCS4 value */
2088 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002089 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002090 i++;
2091 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002092 }
Tim Peters602f7402002-04-27 18:03:26 +00002093 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002094 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002095 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002096 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2097 *p++ = (char)(0x80 | (ch & 0x3f));
2098 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002099 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002100 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002101 /* Encode UCS4 Unicode ordinals */
2102 *p++ = (char)(0xf0 | (ch >> 18));
2103 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2104 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2105 *p++ = (char)(0x80 | (ch & 0x3f));
2106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 if (v == NULL) {
2110 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002111 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002112 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002113 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002114 }
2115 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002116 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002117 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002118 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002119 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002122
Tim Peters602f7402002-04-27 18:03:26 +00002123#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124}
2125
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2127{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 if (!PyUnicode_Check(unicode)) {
2129 PyErr_BadArgument();
2130 return NULL;
2131 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002132 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002133 PyUnicode_GET_SIZE(unicode),
2134 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135}
2136
Walter Dörwald6e390802007-08-17 16:41:28 +00002137/* --- UTF-32 Codec ------------------------------------------------------- */
2138
2139PyObject *
2140PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002141 Py_ssize_t size,
2142 const char *errors,
2143 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002144{
2145 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2146}
2147
2148PyObject *
2149PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002150 Py_ssize_t size,
2151 const char *errors,
2152 int *byteorder,
2153 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002154{
2155 const char *starts = s;
2156 Py_ssize_t startinpos;
2157 Py_ssize_t endinpos;
2158 Py_ssize_t outpos;
2159 PyUnicodeObject *unicode;
2160 Py_UNICODE *p;
2161#ifndef Py_UNICODE_WIDE
2162 int i, pairs;
2163#else
2164 const int pairs = 0;
2165#endif
2166 const unsigned char *q, *e;
2167 int bo = 0; /* assume native ordering by default */
2168 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002169 /* Offsets from q for retrieving bytes in the right order. */
2170#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2171 int iorder[] = {0, 1, 2, 3};
2172#else
2173 int iorder[] = {3, 2, 1, 0};
2174#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002175 PyObject *errorHandler = NULL;
2176 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002177 /* On narrow builds we split characters outside the BMP into two
2178 codepoints => count how much extra space we need. */
2179#ifndef Py_UNICODE_WIDE
2180 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 if (((Py_UCS4 *)s)[i] >= 0x10000)
2182 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002183#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002184
2185 /* This might be one to much, because of a BOM */
2186 unicode = _PyUnicode_New((size+3)/4+pairs);
2187 if (!unicode)
2188 return NULL;
2189 if (size == 0)
2190 return (PyObject *)unicode;
2191
2192 /* Unpack UTF-32 encoded data */
2193 p = unicode->str;
2194 q = (unsigned char *)s;
2195 e = q + size;
2196
2197 if (byteorder)
2198 bo = *byteorder;
2199
2200 /* Check for BOM marks (U+FEFF) in the input and adjust current
2201 byte order setting accordingly. In native mode, the leading BOM
2202 mark is skipped, in all other modes, it is copied to the output
2203 stream as-is (giving a ZWNBSP character). */
2204 if (bo == 0) {
2205 if (size >= 4) {
2206 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002207 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002209 if (bom == 0x0000FEFF) {
2210 q += 4;
2211 bo = -1;
2212 }
2213 else if (bom == 0xFFFE0000) {
2214 q += 4;
2215 bo = 1;
2216 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002217#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002218 if (bom == 0x0000FEFF) {
2219 q += 4;
2220 bo = 1;
2221 }
2222 else if (bom == 0xFFFE0000) {
2223 q += 4;
2224 bo = -1;
2225 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002226#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002227 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002228 }
2229
2230 if (bo == -1) {
2231 /* force LE */
2232 iorder[0] = 0;
2233 iorder[1] = 1;
2234 iorder[2] = 2;
2235 iorder[3] = 3;
2236 }
2237 else if (bo == 1) {
2238 /* force BE */
2239 iorder[0] = 3;
2240 iorder[1] = 2;
2241 iorder[2] = 1;
2242 iorder[3] = 0;
2243 }
2244
2245 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002246 Py_UCS4 ch;
2247 /* remaining bytes at the end? (size should be divisible by 4) */
2248 if (e-q<4) {
2249 if (consumed)
2250 break;
2251 errmsg = "truncated data";
2252 startinpos = ((const char *)q)-starts;
2253 endinpos = ((const char *)e)-starts;
2254 goto utf32Error;
2255 /* The remaining input chars are ignored if the callback
2256 chooses to skip the input */
2257 }
2258 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2259 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002260
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002261 if (ch >= 0x110000)
2262 {
2263 errmsg = "codepoint not in range(0x110000)";
2264 startinpos = ((const char *)q)-starts;
2265 endinpos = startinpos+4;
2266 goto utf32Error;
2267 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002268#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002269 if (ch >= 0x10000)
2270 {
2271 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2272 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2273 }
2274 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002275#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002276 *p++ = ch;
2277 q += 4;
2278 continue;
2279 utf32Error:
2280 outpos = p-PyUnicode_AS_UNICODE(unicode);
2281 if (unicode_decode_call_errorhandler(
2282 errors, &errorHandler,
2283 "utf32", errmsg,
2284 starts, size, &startinpos, &endinpos, &exc, &s,
2285 &unicode, &outpos, &p))
2286 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002287 }
2288
2289 if (byteorder)
2290 *byteorder = bo;
2291
2292 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002293 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002294
2295 /* Adjust length */
2296 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2297 goto onError;
2298
2299 Py_XDECREF(errorHandler);
2300 Py_XDECREF(exc);
2301 return (PyObject *)unicode;
2302
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002303 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002304 Py_DECREF(unicode);
2305 Py_XDECREF(errorHandler);
2306 Py_XDECREF(exc);
2307 return NULL;
2308}
2309
2310PyObject *
2311PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 Py_ssize_t size,
2313 const char *errors,
2314 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002315{
2316 PyObject *v;
2317 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002318 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002319#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002320 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002321#else
2322 const int pairs = 0;
2323#endif
2324 /* Offsets from p for storing byte pairs in the right order. */
2325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2326 int iorder[] = {0, 1, 2, 3};
2327#else
2328 int iorder[] = {3, 2, 1, 0};
2329#endif
2330
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002331#define STORECHAR(CH) \
2332 do { \
2333 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2334 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2335 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2336 p[iorder[0]] = (CH) & 0xff; \
2337 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002338 } while(0)
2339
2340 /* In narrow builds we can output surrogate pairs as one codepoint,
2341 so we need less space. */
2342#ifndef Py_UNICODE_WIDE
2343 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2345 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2346 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002347#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002348 nsize = (size - pairs + (byteorder == 0));
2349 bytesize = nsize * 4;
2350 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002351 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002352 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002353 if (v == NULL)
2354 return NULL;
2355
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002356 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002357 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002358 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002359 if (size == 0)
2360 return v;
2361
2362 if (byteorder == -1) {
2363 /* force LE */
2364 iorder[0] = 0;
2365 iorder[1] = 1;
2366 iorder[2] = 2;
2367 iorder[3] = 3;
2368 }
2369 else if (byteorder == 1) {
2370 /* force BE */
2371 iorder[0] = 3;
2372 iorder[1] = 2;
2373 iorder[2] = 1;
2374 iorder[3] = 0;
2375 }
2376
2377 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002378 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002379#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002380 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2381 Py_UCS4 ch2 = *s;
2382 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2383 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2384 s++;
2385 size--;
2386 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002387 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002388#endif
2389 STORECHAR(ch);
2390 }
2391 return v;
2392#undef STORECHAR
2393}
2394
2395PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2396{
2397 if (!PyUnicode_Check(unicode)) {
2398 PyErr_BadArgument();
2399 return NULL;
2400 }
2401 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 PyUnicode_GET_SIZE(unicode),
2403 NULL,
2404 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002405}
2406
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407/* --- UTF-16 Codec ------------------------------------------------------- */
2408
Tim Peters772747b2001-08-09 22:21:55 +00002409PyObject *
2410PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002411 Py_ssize_t size,
2412 const char *errors,
2413 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414{
Walter Dörwald69652032004-09-07 20:24:22 +00002415 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2416}
2417
2418PyObject *
2419PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002420 Py_ssize_t size,
2421 const char *errors,
2422 int *byteorder,
2423 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002425 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002426 Py_ssize_t startinpos;
2427 Py_ssize_t endinpos;
2428 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 PyUnicodeObject *unicode;
2430 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002431 const unsigned char *q, *e;
2432 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002433 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002434 /* Offsets from q for retrieving byte pairs in the right order. */
2435#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2436 int ihi = 1, ilo = 0;
2437#else
2438 int ihi = 0, ilo = 1;
2439#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002440 PyObject *errorHandler = NULL;
2441 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442
2443 /* Note: size will always be longer than the resulting Unicode
2444 character count */
2445 unicode = _PyUnicode_New(size);
2446 if (!unicode)
2447 return NULL;
2448 if (size == 0)
2449 return (PyObject *)unicode;
2450
2451 /* Unpack UTF-16 encoded data */
2452 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002453 q = (unsigned char *)s;
2454 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455
2456 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002457 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002459 /* Check for BOM marks (U+FEFF) in the input and adjust current
2460 byte order setting accordingly. In native mode, the leading BOM
2461 mark is skipped, in all other modes, it is copied to the output
2462 stream as-is (giving a ZWNBSP character). */
2463 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002464 if (size >= 2) {
2465 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002466#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002467 if (bom == 0xFEFF) {
2468 q += 2;
2469 bo = -1;
2470 }
2471 else if (bom == 0xFFFE) {
2472 q += 2;
2473 bo = 1;
2474 }
Tim Petersced69f82003-09-16 20:30:58 +00002475#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002476 if (bom == 0xFEFF) {
2477 q += 2;
2478 bo = 1;
2479 }
2480 else if (bom == 0xFFFE) {
2481 q += 2;
2482 bo = -1;
2483 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002484#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002485 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
Tim Peters772747b2001-08-09 22:21:55 +00002488 if (bo == -1) {
2489 /* force LE */
2490 ihi = 1;
2491 ilo = 0;
2492 }
2493 else if (bo == 1) {
2494 /* force BE */
2495 ihi = 0;
2496 ilo = 1;
2497 }
2498
2499 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002500 Py_UNICODE ch;
2501 /* remaining bytes at the end? (size should be even) */
2502 if (e-q<2) {
2503 if (consumed)
2504 break;
2505 errmsg = "truncated data";
2506 startinpos = ((const char *)q)-starts;
2507 endinpos = ((const char *)e)-starts;
2508 goto utf16Error;
2509 /* The remaining input chars are ignored if the callback
2510 chooses to skip the input */
2511 }
2512 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002513
Benjamin Peterson857ce152009-01-31 16:29:18 +00002514 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002515
2516 if (ch < 0xD800 || ch > 0xDFFF) {
2517 *p++ = ch;
2518 continue;
2519 }
2520
2521 /* UTF-16 code pair: */
2522 if (q >= e) {
2523 errmsg = "unexpected end of data";
2524 startinpos = (((const char *)q)-2)-starts;
2525 endinpos = ((const char *)e)-starts;
2526 goto utf16Error;
2527 }
2528 if (0xD800 <= ch && ch <= 0xDBFF) {
2529 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2530 q += 2;
2531 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002532#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002533 *p++ = ch;
2534 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002535#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002537#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002538 continue;
2539 }
2540 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002541 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002542 startinpos = (((const char *)q)-4)-starts;
2543 endinpos = startinpos+2;
2544 goto utf16Error;
2545 }
2546
Benjamin Peterson857ce152009-01-31 16:29:18 +00002547 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002548 errmsg = "illegal encoding";
2549 startinpos = (((const char *)q)-2)-starts;
2550 endinpos = startinpos+2;
2551 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002552
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002553 utf16Error:
2554 outpos = p-PyUnicode_AS_UNICODE(unicode);
2555 if (unicode_decode_call_errorhandler(
2556 errors, &errorHandler,
2557 "utf16", errmsg,
2558 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2559 &unicode, &outpos, &p))
2560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 }
2562
2563 if (byteorder)
2564 *byteorder = bo;
2565
Walter Dörwald69652032004-09-07 20:24:22 +00002566 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002567 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002568
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002570 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 goto onError;
2572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 Py_XDECREF(errorHandler);
2574 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 return (PyObject *)unicode;
2576
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002577 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 Py_XDECREF(errorHandler);
2580 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 return NULL;
2582}
2583
Tim Peters772747b2001-08-09 22:21:55 +00002584PyObject *
2585PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 Py_ssize_t size,
2587 const char *errors,
2588 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589{
2590 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002591 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002592 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002593#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002594 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002595#else
2596 const int pairs = 0;
2597#endif
Tim Peters772747b2001-08-09 22:21:55 +00002598 /* Offsets from p for storing byte pairs in the right order. */
2599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2600 int ihi = 1, ilo = 0;
2601#else
2602 int ihi = 0, ilo = 1;
2603#endif
2604
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002605#define STORECHAR(CH) \
2606 do { \
2607 p[ihi] = ((CH) >> 8) & 0xff; \
2608 p[ilo] = (CH) & 0xff; \
2609 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002610 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002612#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002613 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002614 if (s[i] >= 0x10000)
2615 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002616#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002617 /* 2 * (size + pairs + (byteorder == 0)) */
2618 if (size > PY_SSIZE_T_MAX ||
2619 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002620 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002621 nsize = size + pairs + (byteorder == 0);
2622 bytesize = nsize * 2;
2623 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002624 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002625 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 if (v == NULL)
2627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002629 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002632 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002633 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002634
2635 if (byteorder == -1) {
2636 /* force LE */
2637 ihi = 1;
2638 ilo = 0;
2639 }
2640 else if (byteorder == 1) {
2641 /* force BE */
2642 ihi = 0;
2643 ilo = 1;
2644 }
2645
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002646 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002647 Py_UNICODE ch = *s++;
2648 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002649#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002650 if (ch >= 0x10000) {
2651 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2652 ch = 0xD800 | ((ch-0x10000) >> 10);
2653 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002654#endif
Tim Peters772747b2001-08-09 22:21:55 +00002655 STORECHAR(ch);
2656 if (ch2)
2657 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002660#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661}
2662
2663PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2664{
2665 if (!PyUnicode_Check(unicode)) {
2666 PyErr_BadArgument();
2667 return NULL;
2668 }
2669 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002670 PyUnicode_GET_SIZE(unicode),
2671 NULL,
2672 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673}
2674
2675/* --- Unicode Escape Codec ----------------------------------------------- */
2676
Fredrik Lundh06d12682001-01-24 07:59:11 +00002677static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002678
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002680 Py_ssize_t size,
2681 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002684 Py_ssize_t startinpos;
2685 Py_ssize_t endinpos;
2686 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002691 char* message;
2692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 PyObject *errorHandler = NULL;
2694 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 /* Escaped strings will always be longer than the resulting
2697 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002698 length after conversion to the true value.
2699 (but if the error callback returns a long replacement string
2700 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 v = _PyUnicode_New(size);
2702 if (v == NULL)
2703 goto onError;
2704 if (size == 0)
2705 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002709
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 while (s < end) {
2711 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002712 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714
2715 /* Non-escape characters are interpreted as Unicode ordinals */
2716 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002717 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 continue;
2719 }
2720
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 /* \ - Escapes */
2723 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002724 c = *s++;
2725 if (s > end)
2726 c = '\0'; /* Invalid after \ */
2727 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002729 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 case '\n': break;
2731 case '\\': *p++ = '\\'; break;
2732 case '\'': *p++ = '\''; break;
2733 case '\"': *p++ = '\"'; break;
2734 case 'b': *p++ = '\b'; break;
2735 case 'f': *p++ = '\014'; break; /* FF */
2736 case 't': *p++ = '\t'; break;
2737 case 'n': *p++ = '\n'; break;
2738 case 'r': *p++ = '\r'; break;
2739 case 'v': *p++ = '\013'; break; /* VT */
2740 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2741
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002742 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 case '0': case '1': case '2': case '3':
2744 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002745 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002746 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002747 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002748 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002749 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002751 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 break;
2753
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002754 /* hex escapes */
2755 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757 digits = 2;
2758 message = "truncated \\xXX escape";
2759 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002761 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763 digits = 4;
2764 message = "truncated \\uXXXX escape";
2765 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002767 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002768 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002769 digits = 8;
2770 message = "truncated \\UXXXXXXXX escape";
2771 hexescape:
2772 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 outpos = p-PyUnicode_AS_UNICODE(v);
2774 if (s+digits>end) {
2775 endinpos = size;
2776 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002777 errors, &errorHandler,
2778 "unicodeescape", "end of string in escape sequence",
2779 starts, size, &startinpos, &endinpos, &exc, &s,
2780 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 goto onError;
2782 goto nextByte;
2783 }
2784 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002786 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 endinpos = (s+i+1)-starts;
2788 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002789 errors, &errorHandler,
2790 "unicodeescape", message,
2791 starts, size, &startinpos, &endinpos, &exc, &s,
2792 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002793 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002794 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002795 }
2796 chr = (chr<<4) & ~0xF;
2797 if (c >= '0' && c <= '9')
2798 chr += c - '0';
2799 else if (c >= 'a' && c <= 'f')
2800 chr += 10 + c - 'a';
2801 else
2802 chr += 10 + c - 'A';
2803 }
2804 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002805 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002806 /* _decoding_error will have already written into the
2807 target buffer. */
2808 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002810 /* when we get here, chr is a 32-bit unicode character */
2811 if (chr <= 0xffff)
2812 /* UCS-2 character */
2813 *p++ = (Py_UNICODE) chr;
2814 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002815 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002816 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002817#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002818 *p++ = chr;
2819#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002820 chr -= 0x10000L;
2821 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002822 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002823#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002824 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 endinpos = s-starts;
2826 outpos = p-PyUnicode_AS_UNICODE(v);
2827 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002828 errors, &errorHandler,
2829 "unicodeescape", "illegal Unicode character",
2830 starts, size, &startinpos, &endinpos, &exc, &s,
2831 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002832 goto onError;
2833 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002834 break;
2835
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002836 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002837 case 'N':
2838 message = "malformed \\N character escape";
2839 if (ucnhash_CAPI == NULL) {
2840 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002841 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002842 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002843 if (m == NULL)
2844 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002845 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002846 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002847 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002848 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002849 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002850 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002851 if (ucnhash_CAPI == NULL)
2852 goto ucnhashError;
2853 }
2854 if (*s == '{') {
2855 const char *start = s+1;
2856 /* look for the closing brace */
2857 while (*s != '}' && s < end)
2858 s++;
2859 if (s > start && s < end && *s == '}') {
2860 /* found a name. look it up in the unicode database */
2861 message = "unknown Unicode character name";
2862 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002863 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002864 goto store;
2865 }
2866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 endinpos = s-starts;
2868 outpos = p-PyUnicode_AS_UNICODE(v);
2869 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002870 errors, &errorHandler,
2871 "unicodeescape", message,
2872 starts, size, &startinpos, &endinpos, &exc, &s,
2873 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002874 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002875 break;
2876
2877 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002878 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 message = "\\ at end of string";
2880 s--;
2881 endinpos = s-starts;
2882 outpos = p-PyUnicode_AS_UNICODE(v);
2883 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002884 errors, &errorHandler,
2885 "unicodeescape", message,
2886 starts, size, &startinpos, &endinpos, &exc, &s,
2887 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002888 goto onError;
2889 }
2890 else {
2891 *p++ = '\\';
2892 *p++ = (unsigned char)s[-1];
2893 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002896 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002899 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002901 Py_XDECREF(errorHandler);
2902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002904
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002905 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002906 PyErr_SetString(
2907 PyExc_UnicodeError,
2908 "\\N escapes not supported (can't load unicodedata module)"
2909 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002910 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 Py_XDECREF(errorHandler);
2912 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002913 return NULL;
2914
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002915 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 Py_XDECREF(errorHandler);
2918 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 return NULL;
2920}
2921
2922/* Return a Unicode-Escape string version of the Unicode object.
2923
2924 If quotes is true, the string is enclosed in u"" or u'' quotes as
2925 appropriate.
2926
2927*/
2928
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002929Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002930 Py_ssize_t size,
2931 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002932{
2933 /* like wcschr, but doesn't stop at NULL characters */
2934
2935 while (size-- > 0) {
2936 if (*s == ch)
2937 return s;
2938 s++;
2939 }
2940
2941 return NULL;
2942}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002943
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944static
2945PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002946 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 int quotes)
2948{
2949 PyObject *repr;
2950 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002952 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002953#ifdef Py_UNICODE_WIDE
2954 const Py_ssize_t expandsize = 10;
2955#else
2956 const Py_ssize_t expandsize = 6;
2957#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958
Neal Norwitz17753ec2006-08-21 22:21:19 +00002959 /* XXX(nnorwitz): rather than over-allocating, it would be
2960 better to choose a different scheme. Perhaps scan the
2961 first N-chars of the string and allocate based on that size.
2962 */
2963 /* Initial allocation is based on the longest-possible unichr
2964 escape.
2965
2966 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2967 unichr, so in this case it's the longest unichr escape. In
2968 narrow (UTF-16) builds this is five chars per source unichr
2969 since there are two unichrs in the surrogate pair, so in narrow
2970 (UTF-16) builds it's not the longest unichr escape.
2971
2972 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2973 so in the narrow (UTF-16) build case it's the longest unichr
2974 escape.
2975 */
2976
Neal Norwitze7d8be82008-07-31 17:17:14 +00002977 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002978 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002979
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002980 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002981 2
2982 + expandsize*size
2983 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 if (repr == NULL)
2985 return NULL;
2986
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002987 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988
2989 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002991 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 !findchar(s, size, '"')) ? '"' : '\'';
2993 }
2994 while (size-- > 0) {
2995 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002996
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002997 /* Escape quotes and backslashes */
2998 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002999 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 *p++ = '\\';
3001 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003002 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003004
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003005#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003006 /* Map 21-bit characters to '\U00xxxxxx' */
3007 else if (ch >= 0x10000) {
3008 *p++ = '\\';
3009 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003010 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3011 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3012 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3013 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3014 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3015 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3016 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003017 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003018 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003019 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003020#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003021 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3022 else if (ch >= 0xD800 && ch < 0xDC00) {
3023 Py_UNICODE ch2;
3024 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003025
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003026 ch2 = *s++;
3027 size--;
3028 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3029 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3030 *p++ = '\\';
3031 *p++ = 'U';
3032 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3033 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3034 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3035 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3036 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3037 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3038 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3039 *p++ = hexdigit[ucs & 0x0000000F];
3040 continue;
3041 }
3042 /* Fall through: isolated surrogates are copied as-is */
3043 s--;
3044 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003045 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003046#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003049 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 *p++ = '\\';
3051 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003052 *p++ = hexdigit[(ch >> 12) & 0x000F];
3053 *p++ = hexdigit[(ch >> 8) & 0x000F];
3054 *p++ = hexdigit[(ch >> 4) & 0x000F];
3055 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003057
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003058 /* Map special whitespace to '\t', \n', '\r' */
3059 else if (ch == '\t') {
3060 *p++ = '\\';
3061 *p++ = 't';
3062 }
3063 else if (ch == '\n') {
3064 *p++ = '\\';
3065 *p++ = 'n';
3066 }
3067 else if (ch == '\r') {
3068 *p++ = '\\';
3069 *p++ = 'r';
3070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003071
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003072 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003073 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003075 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003076 *p++ = hexdigit[(ch >> 4) & 0x000F];
3077 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003078 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003079
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 /* Copy everything else as-is */
3081 else
3082 *p++ = (char) ch;
3083 }
3084 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003085 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086
3087 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003088 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 return repr;
3090}
3091
3092PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003093 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094{
3095 return unicodeescape_string(s, size, 0);
3096}
3097
3098PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3099{
3100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
3102 return NULL;
3103 }
3104 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003105 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106}
3107
3108/* --- Raw Unicode Escape Codec ------------------------------------------- */
3109
3110PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003111 Py_ssize_t size,
3112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003115 Py_ssize_t startinpos;
3116 Py_ssize_t endinpos;
3117 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 const char *end;
3121 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 PyObject *errorHandler = NULL;
3123 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003124
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 /* Escaped strings will always be longer than the resulting
3126 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 length after conversion to the true value. (But decoding error
3128 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 v = _PyUnicode_New(size);
3130 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003133 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 end = s + size;
3136 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003137 unsigned char c;
3138 Py_UCS4 x;
3139 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003140 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003142 /* Non-escape characters are interpreted as Unicode ordinals */
3143 if (*s != '\\') {
3144 *p++ = (unsigned char)*s++;
3145 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003146 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003147 startinpos = s-starts;
3148
3149 /* \u-escapes are only interpreted iff the number of leading
3150 backslashes if odd */
3151 bs = s;
3152 for (;s < end;) {
3153 if (*s != '\\')
3154 break;
3155 *p++ = (unsigned char)*s++;
3156 }
3157 if (((s - bs) & 1) == 0 ||
3158 s >= end ||
3159 (*s != 'u' && *s != 'U')) {
3160 continue;
3161 }
3162 p--;
3163 count = *s=='u' ? 4 : 8;
3164 s++;
3165
3166 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3167 outpos = p-PyUnicode_AS_UNICODE(v);
3168 for (x = 0, i = 0; i < count; ++i, ++s) {
3169 c = (unsigned char)*s;
3170 if (!isxdigit(c)) {
3171 endinpos = s-starts;
3172 if (unicode_decode_call_errorhandler(
3173 errors, &errorHandler,
3174 "rawunicodeescape", "truncated \\uXXXX",
3175 starts, size, &startinpos, &endinpos, &exc, &s,
3176 &v, &outpos, &p))
3177 goto onError;
3178 goto nextByte;
3179 }
3180 x = (x<<4) & ~0xF;
3181 if (c >= '0' && c <= '9')
3182 x += c - '0';
3183 else if (c >= 'a' && c <= 'f')
3184 x += 10 + c - 'a';
3185 else
3186 x += 10 + c - 'A';
3187 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003188 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003189 /* UCS-2 character */
3190 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003191 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003192 /* UCS-4 character. Either store directly, or as
3193 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003194#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003195 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003196#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003197 x -= 0x10000L;
3198 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3199 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003200#endif
3201 } else {
3202 endinpos = s-starts;
3203 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003204 if (unicode_decode_call_errorhandler(
3205 errors, &errorHandler,
3206 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003207 starts, size, &startinpos, &endinpos, &exc, &s,
3208 &v, &outpos, &p))
3209 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003210 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003211 nextByte:
3212 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003214 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003215 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003219
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003220 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 Py_XDECREF(errorHandler);
3223 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 return NULL;
3225}
3226
3227PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003228 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229{
3230 PyObject *repr;
3231 char *p;
3232 char *q;
3233
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003234 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003235#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003236 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003237#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003238 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003239#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003240
Neal Norwitze7d8be82008-07-31 17:17:14 +00003241 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003242 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003243
Neal Norwitze7d8be82008-07-31 17:17:14 +00003244 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 if (repr == NULL)
3246 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003247 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003250 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 while (size-- > 0) {
3252 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003253#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003254 /* Map 32-bit characters to '\Uxxxxxxxx' */
3255 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003256 *p++ = '\\';
3257 *p++ = 'U';
3258 *p++ = hexdigit[(ch >> 28) & 0xf];
3259 *p++ = hexdigit[(ch >> 24) & 0xf];
3260 *p++ = hexdigit[(ch >> 20) & 0xf];
3261 *p++ = hexdigit[(ch >> 16) & 0xf];
3262 *p++ = hexdigit[(ch >> 12) & 0xf];
3263 *p++ = hexdigit[(ch >> 8) & 0xf];
3264 *p++ = hexdigit[(ch >> 4) & 0xf];
3265 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003266 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003267 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003268#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3270 if (ch >= 0xD800 && ch < 0xDC00) {
3271 Py_UNICODE ch2;
3272 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003273
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003274 ch2 = *s++;
3275 size--;
3276 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3277 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3278 *p++ = '\\';
3279 *p++ = 'U';
3280 *p++ = hexdigit[(ucs >> 28) & 0xf];
3281 *p++ = hexdigit[(ucs >> 24) & 0xf];
3282 *p++ = hexdigit[(ucs >> 20) & 0xf];
3283 *p++ = hexdigit[(ucs >> 16) & 0xf];
3284 *p++ = hexdigit[(ucs >> 12) & 0xf];
3285 *p++ = hexdigit[(ucs >> 8) & 0xf];
3286 *p++ = hexdigit[(ucs >> 4) & 0xf];
3287 *p++ = hexdigit[ucs & 0xf];
3288 continue;
3289 }
3290 /* Fall through: isolated surrogates are copied as-is */
3291 s--;
3292 size++;
3293 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003294#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 /* Map 16-bit characters to '\uxxxx' */
3296 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 *p++ = '\\';
3298 *p++ = 'u';
3299 *p++ = hexdigit[(ch >> 12) & 0xf];
3300 *p++ = hexdigit[(ch >> 8) & 0xf];
3301 *p++ = hexdigit[(ch >> 4) & 0xf];
3302 *p++ = hexdigit[ch & 15];
3303 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003304 /* Copy everything else as-is */
3305 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 *p++ = (char) ch;
3307 }
3308 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003309 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 return repr;
3311}
3312
3313PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3314{
3315 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003316 PyErr_BadArgument();
3317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 }
3319 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003320 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321}
3322
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003323/* --- Unicode Internal Codec ------------------------------------------- */
3324
3325PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003326 Py_ssize_t size,
3327 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003328{
3329 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003330 Py_ssize_t startinpos;
3331 Py_ssize_t endinpos;
3332 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003333 PyUnicodeObject *v;
3334 Py_UNICODE *p;
3335 const char *end;
3336 const char *reason;
3337 PyObject *errorHandler = NULL;
3338 PyObject *exc = NULL;
3339
Neal Norwitzd43069c2006-01-08 01:12:10 +00003340#ifdef Py_UNICODE_WIDE
3341 Py_UNICODE unimax = PyUnicode_GetMax();
3342#endif
3343
Armin Rigo7ccbca92006-10-04 12:17:45 +00003344 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003345 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3346 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003347 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003348 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003349 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003350 p = PyUnicode_AS_UNICODE(v);
3351 end = s + size;
3352
3353 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003354 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003355 /* We have to sanity check the raw data, otherwise doom looms for
3356 some malformed UCS-4 data. */
3357 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003358#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003359 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003361 end-s < Py_UNICODE_SIZE
3362 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003363 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003364 startinpos = s - starts;
3365 if (end-s < Py_UNICODE_SIZE) {
3366 endinpos = end-starts;
3367 reason = "truncated input";
3368 }
3369 else {
3370 endinpos = s - starts + Py_UNICODE_SIZE;
3371 reason = "illegal code point (> 0x10FFFF)";
3372 }
3373 outpos = p - PyUnicode_AS_UNICODE(v);
3374 if (unicode_decode_call_errorhandler(
3375 errors, &errorHandler,
3376 "unicode_internal", reason,
3377 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003378 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003379 goto onError;
3380 }
3381 }
3382 else {
3383 p++;
3384 s += Py_UNICODE_SIZE;
3385 }
3386 }
3387
Martin v. Löwis412fb672006-04-13 06:34:32 +00003388 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 goto onError;
3390 Py_XDECREF(errorHandler);
3391 Py_XDECREF(exc);
3392 return (PyObject *)v;
3393
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003394 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003395 Py_XDECREF(v);
3396 Py_XDECREF(errorHandler);
3397 Py_XDECREF(exc);
3398 return NULL;
3399}
3400
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401/* --- Latin-1 Codec ------------------------------------------------------ */
3402
3403PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404 Py_ssize_t size,
3405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406{
3407 PyUnicodeObject *v;
3408 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003409
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003411 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003412 Py_UNICODE r = *(unsigned char*)s;
3413 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003414 }
3415
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 v = _PyUnicode_New(size);
3417 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003420 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 p = PyUnicode_AS_UNICODE(v);
3422 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003423 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003425
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003426 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 Py_XDECREF(v);
3428 return NULL;
3429}
3430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431/* create or adjust a UnicodeEncodeError */
3432static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003433 const char *encoding,
3434 const Py_UNICODE *unicode, Py_ssize_t size,
3435 Py_ssize_t startpos, Py_ssize_t endpos,
3436 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003439 *exceptionObject = PyUnicodeEncodeError_Create(
3440 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 }
3442 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003443 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3444 goto onError;
3445 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3446 goto onError;
3447 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3448 goto onError;
3449 return;
3450 onError:
3451 Py_DECREF(*exceptionObject);
3452 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
3454}
3455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456/* raises a UnicodeEncodeError */
3457static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003458 const char *encoding,
3459 const Py_UNICODE *unicode, Py_ssize_t size,
3460 Py_ssize_t startpos, Py_ssize_t endpos,
3461 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462{
3463 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467}
3468
3469/* error handling callback helper:
3470 build arguments, call the callback and check the arguments,
3471 put the result into newpos and return the replacement string, which
3472 has to be freed by the caller */
3473static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 PyObject **errorHandler,
3475 const char *encoding, const char *reason,
3476 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3477 Py_ssize_t startpos, Py_ssize_t endpos,
3478 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003480 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481
3482 PyObject *restuple;
3483 PyObject *resunicode;
3484
3485 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003486 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003488 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 }
3490
3491 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003492 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495
3496 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003497 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003501 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003502 Py_DECREF(restuple);
3503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 }
3505 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 &resunicode, newpos)) {
3507 Py_DECREF(restuple);
3508 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 }
3510 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003511 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003512 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003513 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3514 Py_DECREF(restuple);
3515 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 Py_INCREF(resunicode);
3518 Py_DECREF(restuple);
3519 return resunicode;
3520}
3521
3522static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003523 Py_ssize_t size,
3524 const char *errors,
3525 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526{
3527 /* output object */
3528 PyObject *res;
3529 /* pointers to the beginning and end+1 of input */
3530 const Py_UNICODE *startp = p;
3531 const Py_UNICODE *endp = p + size;
3532 /* pointer to the beginning of the unencodable characters */
3533 /* const Py_UNICODE *badp = NULL; */
3534 /* pointer into the output */
3535 char *str;
3536 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003537 Py_ssize_t respos = 0;
3538 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003539 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3540 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 PyObject *errorHandler = NULL;
3542 PyObject *exc = NULL;
3543 /* the following variable is used for caching string comparisons
3544 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3545 int known_errorHandler = -1;
3546
3547 /* allocate enough for a simple encoding without
3548 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003549 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 if (res == NULL)
3551 goto onError;
3552 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003553 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003554 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 ressize = size;
3556
3557 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003558 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003560 /* can we encode this? */
3561 if (c<limit) {
3562 /* no overflow check, because we know that the space is enough */
3563 *str++ = (char)c;
3564 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003565 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003566 else {
3567 Py_ssize_t unicodepos = p-startp;
3568 Py_ssize_t requiredsize;
3569 PyObject *repunicode;
3570 Py_ssize_t repsize;
3571 Py_ssize_t newpos;
3572 Py_ssize_t respos;
3573 Py_UNICODE *uni2;
3574 /* startpos for collecting unencodable chars */
3575 const Py_UNICODE *collstart = p;
3576 const Py_UNICODE *collend = p;
3577 /* find all unecodable characters */
3578 while ((collend < endp) && ((*collend)>=limit))
3579 ++collend;
3580 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3581 if (known_errorHandler==-1) {
3582 if ((errors==NULL) || (!strcmp(errors, "strict")))
3583 known_errorHandler = 1;
3584 else if (!strcmp(errors, "replace"))
3585 known_errorHandler = 2;
3586 else if (!strcmp(errors, "ignore"))
3587 known_errorHandler = 3;
3588 else if (!strcmp(errors, "xmlcharrefreplace"))
3589 known_errorHandler = 4;
3590 else
3591 known_errorHandler = 0;
3592 }
3593 switch (known_errorHandler) {
3594 case 1: /* strict */
3595 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3596 goto onError;
3597 case 2: /* replace */
3598 while (collstart++<collend)
3599 *str++ = '?'; /* fall through */
3600 case 3: /* ignore */
3601 p = collend;
3602 break;
3603 case 4: /* xmlcharrefreplace */
3604 respos = str-PyString_AS_STRING(res);
3605 /* determine replacement size (temporarily (mis)uses p) */
3606 for (p = collstart, repsize = 0; p < collend; ++p) {
3607 if (*p<10)
3608 repsize += 2+1+1;
3609 else if (*p<100)
3610 repsize += 2+2+1;
3611 else if (*p<1000)
3612 repsize += 2+3+1;
3613 else if (*p<10000)
3614 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003615#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003616 else
3617 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003618#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003619 else if (*p<100000)
3620 repsize += 2+5+1;
3621 else if (*p<1000000)
3622 repsize += 2+6+1;
3623 else
3624 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003625#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003626 }
3627 requiredsize = respos+repsize+(endp-collend);
3628 if (requiredsize > ressize) {
3629 if (requiredsize<2*ressize)
3630 requiredsize = 2*ressize;
3631 if (_PyString_Resize(&res, requiredsize))
3632 goto onError;
3633 str = PyString_AS_STRING(res) + respos;
3634 ressize = requiredsize;
3635 }
3636 /* generate replacement (temporarily (mis)uses p) */
3637 for (p = collstart; p < collend; ++p) {
3638 str += sprintf(str, "&#%d;", (int)*p);
3639 }
3640 p = collend;
3641 break;
3642 default:
3643 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3644 encoding, reason, startp, size, &exc,
3645 collstart-startp, collend-startp, &newpos);
3646 if (repunicode == NULL)
3647 goto onError;
3648 /* need more space? (at least enough for what we have+the
3649 replacement+the rest of the string, so we won't have to
3650 check space for encodable characters) */
3651 respos = str-PyString_AS_STRING(res);
3652 repsize = PyUnicode_GET_SIZE(repunicode);
3653 requiredsize = respos+repsize+(endp-collend);
3654 if (requiredsize > ressize) {
3655 if (requiredsize<2*ressize)
3656 requiredsize = 2*ressize;
3657 if (_PyString_Resize(&res, requiredsize)) {
3658 Py_DECREF(repunicode);
3659 goto onError;
3660 }
3661 str = PyString_AS_STRING(res) + respos;
3662 ressize = requiredsize;
3663 }
3664 /* check if there is anything unencodable in the replacement
3665 and copy it to the output */
3666 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3667 c = *uni2;
3668 if (c >= limit) {
3669 raise_encode_exception(&exc, encoding, startp, size,
3670 unicodepos, unicodepos+1, reason);
3671 Py_DECREF(repunicode);
3672 goto onError;
3673 }
3674 *str = (char)c;
3675 }
3676 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003677 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003678 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003679 }
3680 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003682 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003684 /* If this falls res will be NULL */
3685 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_XDECREF(errorHandler);
3687 Py_XDECREF(exc);
3688 return res;
3689
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003690 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 Py_XDECREF(res);
3692 Py_XDECREF(errorHandler);
3693 Py_XDECREF(exc);
3694 return NULL;
3695}
3696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003698 Py_ssize_t size,
3699 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702}
3703
3704PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3705{
3706 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003707 PyErr_BadArgument();
3708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 }
3710 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003711 PyUnicode_GET_SIZE(unicode),
3712 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713}
3714
3715/* --- 7-bit ASCII Codec -------------------------------------------------- */
3716
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003718 Py_ssize_t size,
3719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 PyUnicodeObject *v;
3723 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003724 Py_ssize_t startinpos;
3725 Py_ssize_t endinpos;
3726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 const char *e;
3728 PyObject *errorHandler = NULL;
3729 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003732 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003733 Py_UNICODE r = *(unsigned char*)s;
3734 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003735 }
Tim Petersced69f82003-09-16 20:30:58 +00003736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 v = _PyUnicode_New(size);
3738 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003739 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003741 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 e = s + size;
3744 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003745 register unsigned char c = (unsigned char)*s;
3746 if (c < 128) {
3747 *p++ = c;
3748 ++s;
3749 }
3750 else {
3751 startinpos = s-starts;
3752 endinpos = startinpos + 1;
3753 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3754 if (unicode_decode_call_errorhandler(
3755 errors, &errorHandler,
3756 "ascii", "ordinal not in range(128)",
3757 starts, size, &startinpos, &endinpos, &exc, &s,
3758 &v, &outpos, &p))
3759 goto onError;
3760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003762 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003763 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3764 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_XDECREF(errorHandler);
3766 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003769 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 Py_XDECREF(errorHandler);
3772 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 return NULL;
3774}
3775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003777 Py_ssize_t size,
3778 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781}
3782
3783PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3784{
3785 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003786 PyErr_BadArgument();
3787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 }
3789 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003790 PyUnicode_GET_SIZE(unicode),
3791 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792}
3793
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003794#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003795
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003796/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003797
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003798#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003799#define NEED_RETRY
3800#endif
3801
3802/* XXX This code is limited to "true" double-byte encodings, as
3803 a) it assumes an incomplete character consists of a single byte, and
3804 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003805 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003806
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   actually starts a character, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   preceding it (located with CharPrev) is the byte itself, is not a lead
   byte, or is a complete two-byte character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3817
3818/*
3819 * Decode MBCS string into unicode object. If 'final' is set, converts
3820 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3821 */
3822static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003823 const char *s, /* MBCS string */
3824 int size, /* sizeof MBCS string */
3825 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003826{
3827 Py_UNICODE *p;
3828 Py_ssize_t n = 0;
3829 int usize = 0;
3830
3831 assert(size >= 0);
3832
3833 /* Skip trailing lead-byte unless 'final' is set */
3834 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003835 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003836
3837 /* First get the size of the result */
3838 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003839 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3840 if (usize == 0) {
3841 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3842 return -1;
3843 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003844 }
3845
3846 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003847 /* Create unicode object */
3848 *v = _PyUnicode_New(usize);
3849 if (*v == NULL)
3850 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851 }
3852 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003853 /* Extend unicode object */
3854 n = PyUnicode_GET_SIZE(*v);
3855 if (_PyUnicode_Resize(v, n + usize) < 0)
3856 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003857 }
3858
3859 /* Do the conversion */
3860 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003861 p = PyUnicode_AS_UNICODE(*v) + n;
3862 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3863 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3864 return -1;
3865 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866 }
3867
3868 return size;
3869}
3870
3871PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003872 Py_ssize_t size,
3873 const char *errors,
3874 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003875{
3876 PyUnicodeObject *v = NULL;
3877 int done;
3878
3879 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003880 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003881
3882#ifdef NEED_RETRY
3883 retry:
3884 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003885 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 else
3887#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003888 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003889
3890 if (done < 0) {
3891 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003892 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 }
3894
3895 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003896 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003897
3898#ifdef NEED_RETRY
3899 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003900 s += done;
3901 size -= done;
3902 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003903 }
3904#endif
3905
3906 return (PyObject *)v;
3907}
3908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003909PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003910 Py_ssize_t size,
3911 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003912{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003913 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3914}
3915
3916/*
3917 * Convert unicode into string object (MBCS).
3918 * Returns 0 if succeed, -1 otherwise.
3919 */
3920static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003921 const Py_UNICODE *p, /* unicode */
3922 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923{
3924 int mbcssize = 0;
3925 Py_ssize_t n = 0;
3926
3927 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003928
3929 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003931 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3932 if (mbcssize == 0) {
3933 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3934 return -1;
3935 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003936 }
3937
Martin v. Löwisd8251432006-06-14 05:21:04 +00003938 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003939 /* Create string object */
3940 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3941 if (*repr == NULL)
3942 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003943 }
3944 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003945 /* Extend string object */
3946 n = PyString_Size(*repr);
3947 if (_PyString_Resize(repr, n + mbcssize) < 0)
3948 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003949 }
3950
3951 /* Do the conversion */
3952 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003953 char *s = PyString_AS_STRING(*repr) + n;
3954 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3955 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3956 return -1;
3957 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003958 }
3959
3960 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003961}
3962
3963PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003964 Py_ssize_t size,
3965 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003966{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003967 PyObject *repr = NULL;
3968 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003969
Martin v. Löwisd8251432006-06-14 05:21:04 +00003970#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003971 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003973 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003974 else
3975#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003976 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003977
Martin v. Löwisd8251432006-06-14 05:21:04 +00003978 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 Py_XDECREF(repr);
3980 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003981 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003982
3983#ifdef NEED_RETRY
3984 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003985 p += INT_MAX;
3986 size -= INT_MAX;
3987 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003988 }
3989#endif
3990
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003991 return repr;
3992}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003993
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003994PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3995{
3996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 return NULL;
3999 }
4000 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004001 PyUnicode_GET_SIZE(unicode),
4002 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004003}
4004
Martin v. Löwisd8251432006-06-14 05:21:04 +00004005#undef NEED_RETRY
4006
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004007#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004008
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009/* --- Character Mapping Codec -------------------------------------------- */
4010
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 Py_ssize_t size,
4013 PyObject *mapping,
4014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004017 Py_ssize_t startinpos;
4018 Py_ssize_t endinpos;
4019 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 PyUnicodeObject *v;
4022 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 PyObject *errorHandler = NULL;
4025 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004026 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004027 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004028
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 /* Default to Latin-1 */
4030 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004031 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032
4033 v = _PyUnicode_New(size);
4034 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004035 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004037 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004040 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004041 mapstring = PyUnicode_AS_UNICODE(mapping);
4042 maplen = PyUnicode_GET_SIZE(mapping);
4043 while (s < e) {
4044 unsigned char ch = *s;
4045 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004047 if (ch < maplen)
4048 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004050 if (x == 0xfffe) {
4051 /* undefined mapping */
4052 outpos = p-PyUnicode_AS_UNICODE(v);
4053 startinpos = s-starts;
4054 endinpos = startinpos+1;
4055 if (unicode_decode_call_errorhandler(
4056 errors, &errorHandler,
4057 "charmap", "character maps to <undefined>",
4058 starts, size, &startinpos, &endinpos, &exc, &s,
4059 &v, &outpos, &p)) {
4060 goto onError;
4061 }
4062 continue;
4063 }
4064 *p++ = x;
4065 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004066 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004067 }
4068 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004069 while (s < e) {
4070 unsigned char ch = *s;
4071 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004072
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004073 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4074 w = PyInt_FromLong((long)ch);
4075 if (w == NULL)
4076 goto onError;
4077 x = PyObject_GetItem(mapping, w);
4078 Py_DECREF(w);
4079 if (x == NULL) {
4080 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4081 /* No mapping found means: mapping is undefined. */
4082 PyErr_Clear();
4083 x = Py_None;
4084 Py_INCREF(x);
4085 } else
4086 goto onError;
4087 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004088
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 /* Apply mapping */
4090 if (PyInt_Check(x)) {
4091 long value = PyInt_AS_LONG(x);
4092 if (value < 0 || value > 65535) {
4093 PyErr_SetString(PyExc_TypeError,
4094 "character mapping must be in range(65536)");
4095 Py_DECREF(x);
4096 goto onError;
4097 }
4098 *p++ = (Py_UNICODE)value;
4099 }
4100 else if (x == Py_None) {
4101 /* undefined mapping */
4102 outpos = p-PyUnicode_AS_UNICODE(v);
4103 startinpos = s-starts;
4104 endinpos = startinpos+1;
4105 if (unicode_decode_call_errorhandler(
4106 errors, &errorHandler,
4107 "charmap", "character maps to <undefined>",
4108 starts, size, &startinpos, &endinpos, &exc, &s,
4109 &v, &outpos, &p)) {
4110 Py_DECREF(x);
4111 goto onError;
4112 }
4113 Py_DECREF(x);
4114 continue;
4115 }
4116 else if (PyUnicode_Check(x)) {
4117 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004118
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004119 if (targetsize == 1)
4120 /* 1-1 mapping */
4121 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004122
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004123 else if (targetsize > 1) {
4124 /* 1-n mapping */
4125 if (targetsize > extrachars) {
4126 /* resize first */
4127 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4128 Py_ssize_t needed = (targetsize - extrachars) + \
4129 (targetsize << 2);
4130 extrachars += needed;
4131 /* XXX overflow detection missing */
4132 if (_PyUnicode_Resize(&v,
4133 PyUnicode_GET_SIZE(v) + needed) < 0) {
4134 Py_DECREF(x);
4135 goto onError;
4136 }
4137 p = PyUnicode_AS_UNICODE(v) + oldpos;
4138 }
4139 Py_UNICODE_COPY(p,
4140 PyUnicode_AS_UNICODE(x),
4141 targetsize);
4142 p += targetsize;
4143 extrachars -= targetsize;
4144 }
4145 /* 1-0 mapping: skip the character */
4146 }
4147 else {
4148 /* wrong return value */
4149 PyErr_SetString(PyExc_TypeError,
4150 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004151 Py_DECREF(x);
4152 goto onError;
4153 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004154 Py_DECREF(x);
4155 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 }
4158 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004159 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4160 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 Py_XDECREF(errorHandler);
4162 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004164
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004165 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 Py_XDECREF(errorHandler);
4167 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 Py_XDECREF(v);
4169 return NULL;
4170}
4171
Martin v. Löwis3f767792006-06-04 19:36:28 +00004172/* Charmap encoding: the lookup table */
4173
4174struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004175 PyObject_HEAD
4176 unsigned char level1[32];
4177 int count2, count3;
4178 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004179};
4180
4181static PyObject*
4182encoding_map_size(PyObject *obj, PyObject* args)
4183{
4184 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004185 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004186 128*map->count3);
4187}
4188
4189static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004190 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004191 PyDoc_STR("Return the size (in bytes) of this object") },
4192 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004193};
4194
4195static void
4196encoding_map_dealloc(PyObject* o)
4197{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004198 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004199}
4200
4201static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004202 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004203 "EncodingMap", /*tp_name*/
4204 sizeof(struct encoding_map), /*tp_basicsize*/
4205 0, /*tp_itemsize*/
4206 /* methods */
4207 encoding_map_dealloc, /*tp_dealloc*/
4208 0, /*tp_print*/
4209 0, /*tp_getattr*/
4210 0, /*tp_setattr*/
4211 0, /*tp_compare*/
4212 0, /*tp_repr*/
4213 0, /*tp_as_number*/
4214 0, /*tp_as_sequence*/
4215 0, /*tp_as_mapping*/
4216 0, /*tp_hash*/
4217 0, /*tp_call*/
4218 0, /*tp_str*/
4219 0, /*tp_getattro*/
4220 0, /*tp_setattro*/
4221 0, /*tp_as_buffer*/
4222 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4223 0, /*tp_doc*/
4224 0, /*tp_traverse*/
4225 0, /*tp_clear*/
4226 0, /*tp_richcompare*/
4227 0, /*tp_weaklistoffset*/
4228 0, /*tp_iter*/
4229 0, /*tp_iternext*/
4230 encoding_map_methods, /*tp_methods*/
4231 0, /*tp_members*/
4232 0, /*tp_getset*/
4233 0, /*tp_base*/
4234 0, /*tp_dict*/
4235 0, /*tp_descr_get*/
4236 0, /*tp_descr_set*/
4237 0, /*tp_dictoffset*/
4238 0, /*tp_init*/
4239 0, /*tp_alloc*/
4240 0, /*tp_new*/
4241 0, /*tp_free*/
4242 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004243};
4244
4245PyObject*
4246PyUnicode_BuildEncodingMap(PyObject* string)
4247{
4248 Py_UNICODE *decode;
4249 PyObject *result;
4250 struct encoding_map *mresult;
4251 int i;
4252 int need_dict = 0;
4253 unsigned char level1[32];
4254 unsigned char level2[512];
4255 unsigned char *mlevel1, *mlevel2, *mlevel3;
4256 int count2 = 0, count3 = 0;
4257
4258 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4259 PyErr_BadArgument();
4260 return NULL;
4261 }
4262 decode = PyUnicode_AS_UNICODE(string);
4263 memset(level1, 0xFF, sizeof level1);
4264 memset(level2, 0xFF, sizeof level2);
4265
4266 /* If there isn't a one-to-one mapping of NULL to \0,
4267 or if there are non-BMP characters, we need to use
4268 a mapping dictionary. */
4269 if (decode[0] != 0)
4270 need_dict = 1;
4271 for (i = 1; i < 256; i++) {
4272 int l1, l2;
4273 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004274#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004275 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004276#endif
4277 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004278 need_dict = 1;
4279 break;
4280 }
4281 if (decode[i] == 0xFFFE)
4282 /* unmapped character */
4283 continue;
4284 l1 = decode[i] >> 11;
4285 l2 = decode[i] >> 7;
4286 if (level1[l1] == 0xFF)
4287 level1[l1] = count2++;
4288 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004289 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004290 }
4291
4292 if (count2 >= 0xFF || count3 >= 0xFF)
4293 need_dict = 1;
4294
4295 if (need_dict) {
4296 PyObject *result = PyDict_New();
4297 PyObject *key, *value;
4298 if (!result)
4299 return NULL;
4300 for (i = 0; i < 256; i++) {
4301 key = value = NULL;
4302 key = PyInt_FromLong(decode[i]);
4303 value = PyInt_FromLong(i);
4304 if (!key || !value)
4305 goto failed1;
4306 if (PyDict_SetItem(result, key, value) == -1)
4307 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004308 Py_DECREF(key);
4309 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004310 }
4311 return result;
4312 failed1:
4313 Py_XDECREF(key);
4314 Py_XDECREF(value);
4315 Py_DECREF(result);
4316 return NULL;
4317 }
4318
4319 /* Create a three-level trie */
4320 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4321 16*count2 + 128*count3 - 1);
4322 if (!result)
4323 return PyErr_NoMemory();
4324 PyObject_Init(result, &EncodingMapType);
4325 mresult = (struct encoding_map*)result;
4326 mresult->count2 = count2;
4327 mresult->count3 = count3;
4328 mlevel1 = mresult->level1;
4329 mlevel2 = mresult->level23;
4330 mlevel3 = mresult->level23 + 16*count2;
4331 memcpy(mlevel1, level1, 32);
4332 memset(mlevel2, 0xFF, 16*count2);
4333 memset(mlevel3, 0, 128*count3);
4334 count3 = 0;
4335 for (i = 1; i < 256; i++) {
4336 int o1, o2, o3, i2, i3;
4337 if (decode[i] == 0xFFFE)
4338 /* unmapped character */
4339 continue;
4340 o1 = decode[i]>>11;
4341 o2 = (decode[i]>>7) & 0xF;
4342 i2 = 16*mlevel1[o1] + o2;
4343 if (mlevel2[i2] == 0xFF)
4344 mlevel2[i2] = count3++;
4345 o3 = decode[i] & 0x7F;
4346 i3 = 128*mlevel2[i2] + o3;
4347 mlevel3[i3] = i;
4348 }
4349 return result;
4350}
4351
4352static int
4353encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4354{
4355 struct encoding_map *map = (struct encoding_map*)mapping;
4356 int l1 = c>>11;
4357 int l2 = (c>>7) & 0xF;
4358 int l3 = c & 0x7F;
4359 int i;
4360
4361#ifdef Py_UNICODE_WIDE
4362 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004363 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004364 }
4365#endif
4366 if (c == 0)
4367 return 0;
4368 /* level 1*/
4369 i = map->level1[l1];
4370 if (i == 0xFF) {
4371 return -1;
4372 }
4373 /* level 2*/
4374 i = map->level23[16*i+l2];
4375 if (i == 0xFF) {
4376 return -1;
4377 }
4378 /* level 3 */
4379 i = map->level23[16*map->count2 + 128*i + l3];
4380 if (i == 0) {
4381 return -1;
4382 }
4383 return i;
4384}
4385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386/* Lookup the character ch in the mapping. If the character
4387 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004388 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 PyObject *w = PyInt_FromLong((long)c);
4392 PyObject *x;
4393
4394 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 x = PyObject_GetItem(mapping, w);
4397 Py_DECREF(w);
4398 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004399 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4400 /* No mapping found means: mapping is undefined. */
4401 PyErr_Clear();
4402 x = Py_None;
4403 Py_INCREF(x);
4404 return x;
4405 } else
4406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004408 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004409 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004411 long value = PyInt_AS_LONG(x);
4412 if (value < 0 || value > 255) {
4413 PyErr_SetString(PyExc_TypeError,
4414 "character mapping must be in range(256)");
4415 Py_DECREF(x);
4416 return NULL;
4417 }
4418 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004420 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004421 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004423 /* wrong return value */
4424 PyErr_SetString(PyExc_TypeError,
4425 "character mapping must return integer, None or str");
4426 Py_DECREF(x);
4427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 }
4429}
4430
Martin v. Löwis3f767792006-06-04 19:36:28 +00004431static int
4432charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4433{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004434 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4435 /* exponentially overallocate to minimize reallocations */
4436 if (requiredsize < 2*outsize)
4437 requiredsize = 2*outsize;
4438 if (_PyString_Resize(outobj, requiredsize)) {
4439 return 0;
4440 }
4441 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004442}
4443
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* bytes were written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447/* lookup the character, put the result in the output string and adjust
4448 various state variables. Reallocate the output string if not enough
4449 space is available. Return a new reference to the object that
4450 was put in the output buffer, or Py_None, if the mapping was undefined
4451 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004452 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004454charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004455 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004457 PyObject *rep;
4458 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004459 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460
Christian Heimese93237d2007-12-19 02:37:44 +00004461 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004462 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004463 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004464 if (res == -1)
4465 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004466 if (outsize<requiredsize)
4467 if (!charmapencode_resize(outobj, outpos, requiredsize))
4468 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004469 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004470 outstart[(*outpos)++] = (char)res;
4471 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472 }
4473
4474 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004476 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004477 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004478 Py_DECREF(rep);
4479 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004480 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004481 if (PyInt_Check(rep)) {
4482 Py_ssize_t requiredsize = *outpos+1;
4483 if (outsize<requiredsize)
4484 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4485 Py_DECREF(rep);
4486 return enc_EXCEPTION;
4487 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004488 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004489 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004490 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004491 else {
4492 const char *repchars = PyString_AS_STRING(rep);
4493 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4494 Py_ssize_t requiredsize = *outpos+repsize;
4495 if (outsize<requiredsize)
4496 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4497 Py_DECREF(rep);
4498 return enc_EXCEPTION;
4499 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004500 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004501 memcpy(outstart + *outpos, repchars, repsize);
4502 *outpos += repsize;
4503 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 }
Georg Brandl9f167602006-06-04 21:46:16 +00004505 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004506 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507}
4508
4509/* handle an error in PyUnicode_EncodeCharmap
4510 Return 0 on success, -1 on error */
4511static
4512int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004513 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004515 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517{
4518 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004519 Py_ssize_t repsize;
4520 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 Py_UNICODE *uni2;
4522 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004523 Py_ssize_t collstartpos = *inpos;
4524 Py_ssize_t collendpos = *inpos+1;
4525 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 char *encoding = "charmap";
4527 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004528 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 /* find all unencodable characters */
4531 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004532 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004533 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004534 int res = encoding_map_lookup(p[collendpos], mapping);
4535 if (res != -1)
4536 break;
4537 ++collendpos;
4538 continue;
4539 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004540
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004541 rep = charmapencode_lookup(p[collendpos], mapping);
4542 if (rep==NULL)
4543 return -1;
4544 else if (rep!=Py_None) {
4545 Py_DECREF(rep);
4546 break;
4547 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004548 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004549 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 }
4551 /* cache callback name lookup
4552 * (if not done yet, i.e. it's the first error) */
4553 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004554 if ((errors==NULL) || (!strcmp(errors, "strict")))
4555 *known_errorHandler = 1;
4556 else if (!strcmp(errors, "replace"))
4557 *known_errorHandler = 2;
4558 else if (!strcmp(errors, "ignore"))
4559 *known_errorHandler = 3;
4560 else if (!strcmp(errors, "xmlcharrefreplace"))
4561 *known_errorHandler = 4;
4562 else
4563 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 }
4565 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004566 case 1: /* strict */
4567 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4568 return -1;
4569 case 2: /* replace */
4570 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004571 x = charmapencode_output('?', mapping, res, respos);
4572 if (x==enc_EXCEPTION) {
4573 return -1;
4574 }
4575 else if (x==enc_FAILED) {
4576 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4577 return -1;
4578 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004579 }
4580 /* fall through */
4581 case 3: /* ignore */
4582 *inpos = collendpos;
4583 break;
4584 case 4: /* xmlcharrefreplace */
4585 /* generate replacement (temporarily (mis)uses p) */
4586 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004587 char buffer[2+29+1+1];
4588 char *cp;
4589 sprintf(buffer, "&#%d;", (int)p[collpos]);
4590 for (cp = buffer; *cp; ++cp) {
4591 x = charmapencode_output(*cp, mapping, res, respos);
4592 if (x==enc_EXCEPTION)
4593 return -1;
4594 else if (x==enc_FAILED) {
4595 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4596 return -1;
4597 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004598 }
4599 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004600 *inpos = collendpos;
4601 break;
4602 default:
4603 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004604 encoding, reason, p, size, exceptionObject,
4605 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004606 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004607 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004608 /* generate replacement */
4609 repsize = PyUnicode_GET_SIZE(repunicode);
4610 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004611 x = charmapencode_output(*uni2, mapping, res, respos);
4612 if (x==enc_EXCEPTION) {
4613 return -1;
4614 }
4615 else if (x==enc_FAILED) {
4616 Py_DECREF(repunicode);
4617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4618 return -1;
4619 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004620 }
4621 *inpos = newpos;
4622 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 }
4624 return 0;
4625}
4626
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004628 Py_ssize_t size,
4629 PyObject *mapping,
4630 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 /* output object */
4633 PyObject *res = NULL;
4634 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 PyObject *errorHandler = NULL;
4639 PyObject *exc = NULL;
4640 /* the following variable is used for caching string comparisons
4641 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4642 * 3=ignore, 4=xmlcharrefreplace */
4643 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644
4645 /* Default to Latin-1 */
4646 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004647 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 /* allocate enough for a simple encoding without
4650 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004651 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 if (res == NULL)
4653 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004654 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004655 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004658 /* try to encode it */
4659 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4660 if (x==enc_EXCEPTION) /* error */
4661 goto onError;
4662 if (x==enc_FAILED) { /* unencodable character */
4663 if (charmap_encoding_error(p, size, &inpos, mapping,
4664 &exc,
4665 &known_errorHandler, &errorHandler, errors,
4666 &res, &respos)) {
4667 goto onError;
4668 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004669 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004670 else
4671 /* done with this character => adjust input position */
4672 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004676 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004677 if (_PyString_Resize(&res, respos))
4678 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 }
4680 Py_XDECREF(exc);
4681 Py_XDECREF(errorHandler);
4682 return res;
4683
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004684 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 Py_XDECREF(res);
4686 Py_XDECREF(exc);
4687 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 return NULL;
4689}
4690
4691PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004692 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693{
4694 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004695 PyErr_BadArgument();
4696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 }
4698 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004699 PyUnicode_GET_SIZE(unicode),
4700 mapping,
4701 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702}
4703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704/* create or adjust a UnicodeTranslateError */
4705static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004706 const Py_UNICODE *unicode, Py_ssize_t size,
4707 Py_ssize_t startpos, Py_ssize_t endpos,
4708 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004711 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004712 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 }
4714 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004715 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4716 goto onError;
4717 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4718 goto onError;
4719 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4720 goto onError;
4721 return;
4722 onError:
4723 Py_DECREF(*exceptionObject);
4724 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 }
4726}
4727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728/* raises a UnicodeTranslateError */
4729static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004730 const Py_UNICODE *unicode, Py_ssize_t size,
4731 Py_ssize_t startpos, Py_ssize_t endpos,
4732 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733{
4734 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004735 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004737 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738}
4739
4740/* error handling callback helper:
4741 build arguments, call the callback and check the arguments,
4742 put the result into newpos and return the replacement string, which
4743 has to be freed by the caller */
4744static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004745 PyObject **errorHandler,
4746 const char *reason,
4747 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4748 Py_ssize_t startpos, Py_ssize_t endpos,
4749 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004751 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752
Martin v. Löwis412fb672006-04-13 06:34:32 +00004753 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 PyObject *restuple;
4755 PyObject *resunicode;
4756
4757 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004758 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004759 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004760 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 }
4762
4763 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004764 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004766 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767
4768 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004769 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004771 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004773 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004774 Py_DECREF(restuple);
4775 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 }
4777 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004778 &resunicode, &i_newpos)) {
4779 Py_DECREF(restuple);
4780 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004782 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004783 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 else
4785 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004786 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004787 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4788 Py_DECREF(restuple);
4789 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004790 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 Py_INCREF(resunicode);
4792 Py_DECREF(restuple);
4793 return resunicode;
4794}
4795
4796/* Lookup the character ch in the mapping and put the result in result,
4797 which must be decrefed by the caller.
4798 Return 0 on success, -1 on error */
4799static
4800int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4801{
4802 PyObject *w = PyInt_FromLong((long)c);
4803 PyObject *x;
4804
4805 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 x = PyObject_GetItem(mapping, w);
4808 Py_DECREF(w);
4809 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004810 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4811 /* No mapping found means: use 1:1 mapping. */
4812 PyErr_Clear();
4813 *result = NULL;
4814 return 0;
4815 } else
4816 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 }
4818 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004819 *result = x;
4820 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 }
4822 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004823 long value = PyInt_AS_LONG(x);
4824 long max = PyUnicode_GetMax();
4825 if (value < 0 || value > max) {
4826 PyErr_Format(PyExc_TypeError,
4827 "character mapping must be in range(0x%lx)", max+1);
4828 Py_DECREF(x);
4829 return -1;
4830 }
4831 *result = x;
4832 return 0;
4833 }
4834 else if (PyUnicode_Check(x)) {
4835 *result = x;
4836 return 0;
4837 }
4838 else {
4839 /* wrong return value */
4840 PyErr_SetString(PyExc_TypeError,
4841 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004842 Py_DECREF(x);
4843 return -1;
4844 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845}
4846/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 if not reallocate and adjust various state variables.
4848 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849static
Walter Dörwald4894c302003-10-24 14:25:28 +00004850int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004851 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004853 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004854 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004855 /* remember old output position */
4856 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4857 /* exponentially overallocate to minimize reallocations */
4858 if (requiredsize < 2 * oldsize)
4859 requiredsize = 2 * oldsize;
4860 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4861 return -1;
4862 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 }
4864 return 0;
4865}
4866/* lookup the character, put the result in the output string and adjust
4867 various state variables. Return a new reference to the object that
4868 was put in the output buffer in *result, or Py_None, if the mapping was
4869 undefined (in which case no character was written).
4870 The called must decref result.
4871 Return 0 on success, -1 on error. */
4872static
Walter Dörwald4894c302003-10-24 14:25:28 +00004873int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004874 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4875 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876{
Walter Dörwald4894c302003-10-24 14:25:28 +00004877 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004878 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004880 /* not found => default to 1:1 mapping */
4881 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 }
4883 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004884 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004886 /* no overflow check, because we know that the space is enough */
4887 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888 }
4889 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004890 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4891 if (repsize==1) {
4892 /* no overflow check, because we know that the space is enough */
4893 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4894 }
4895 else if (repsize!=0) {
4896 /* more than one character */
4897 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4898 (insize - (curinp-startinp)) +
4899 repsize - 1;
4900 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4901 return -1;
4902 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4903 *outp += repsize;
4904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 }
4906 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004907 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 return 0;
4909}
4910
4911PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004912 Py_ssize_t size,
4913 PyObject *mapping,
4914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 /* output object */
4917 PyObject *res = NULL;
4918 /* pointers to the beginning and end+1 of input */
4919 const Py_UNICODE *startp = p;
4920 const Py_UNICODE *endp = p + size;
4921 /* pointer into the output */
4922 Py_UNICODE *str;
4923 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 char *reason = "character maps to <undefined>";
4926 PyObject *errorHandler = NULL;
4927 PyObject *exc = NULL;
4928 /* the following variable is used for caching string comparisons
4929 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4930 * 3=ignore, 4=xmlcharrefreplace */
4931 int known_errorHandler = -1;
4932
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004934 PyErr_BadArgument();
4935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937
4938 /* allocate enough for a simple 1:1 translation without
4939 replacements, if we need more, we'll resize */
4940 res = PyUnicode_FromUnicode(NULL, size);
4941 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004942 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004944 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004948 /* try to encode it */
4949 PyObject *x = NULL;
4950 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4951 Py_XDECREF(x);
4952 goto onError;
4953 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004954 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 if (x!=Py_None) /* it worked => adjust input pointer */
4956 ++p;
4957 else { /* untranslatable character */
4958 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4959 Py_ssize_t repsize;
4960 Py_ssize_t newpos;
4961 Py_UNICODE *uni2;
4962 /* startpos for collecting untranslatable chars */
4963 const Py_UNICODE *collstart = p;
4964 const Py_UNICODE *collend = p+1;
4965 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004967 /* find all untranslatable characters */
4968 while (collend < endp) {
4969 if (charmaptranslate_lookup(*collend, mapping, &x))
4970 goto onError;
4971 Py_XDECREF(x);
4972 if (x!=Py_None)
4973 break;
4974 ++collend;
4975 }
4976 /* cache callback name lookup
4977 * (if not done yet, i.e. it's the first error) */
4978 if (known_errorHandler==-1) {
4979 if ((errors==NULL) || (!strcmp(errors, "strict")))
4980 known_errorHandler = 1;
4981 else if (!strcmp(errors, "replace"))
4982 known_errorHandler = 2;
4983 else if (!strcmp(errors, "ignore"))
4984 known_errorHandler = 3;
4985 else if (!strcmp(errors, "xmlcharrefreplace"))
4986 known_errorHandler = 4;
4987 else
4988 known_errorHandler = 0;
4989 }
4990 switch (known_errorHandler) {
4991 case 1: /* strict */
4992 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004993 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004994 case 2: /* replace */
4995 /* No need to check for space, this is a 1:1 replacement */
4996 for (coll = collstart; coll<collend; ++coll)
4997 *str++ = '?';
4998 /* fall through */
4999 case 3: /* ignore */
5000 p = collend;
5001 break;
5002 case 4: /* xmlcharrefreplace */
5003 /* generate replacement (temporarily (mis)uses p) */
5004 for (p = collstart; p < collend; ++p) {
5005 char buffer[2+29+1+1];
5006 char *cp;
5007 sprintf(buffer, "&#%d;", (int)*p);
5008 if (charmaptranslate_makespace(&res, &str,
5009 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5010 goto onError;
5011 for (cp = buffer; *cp; ++cp)
5012 *str++ = *cp;
5013 }
5014 p = collend;
5015 break;
5016 default:
5017 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5018 reason, startp, size, &exc,
5019 collstart-startp, collend-startp, &newpos);
5020 if (repunicode == NULL)
5021 goto onError;
5022 /* generate replacement */
5023 repsize = PyUnicode_GET_SIZE(repunicode);
5024 if (charmaptranslate_makespace(&res, &str,
5025 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5026 Py_DECREF(repunicode);
5027 goto onError;
5028 }
5029 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5030 *str++ = *uni2;
5031 p = startp + newpos;
5032 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005033 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005034 }
5035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 /* Resize if we allocated to much */
5037 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005038 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005039 if (PyUnicode_Resize(&res, respos) < 0)
5040 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 }
5042 Py_XDECREF(exc);
5043 Py_XDECREF(errorHandler);
5044 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005046 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 Py_XDECREF(res);
5048 Py_XDECREF(exc);
5049 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 return NULL;
5051}
5052
5053PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005054 PyObject *mapping,
5055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056{
5057 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005058
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 str = PyUnicode_FromObject(str);
5060 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005061 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005063 PyUnicode_GET_SIZE(str),
5064 mapping,
5065 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 Py_DECREF(str);
5067 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005068
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005069 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 Py_XDECREF(str);
5071 return NULL;
5072}
Tim Petersced69f82003-09-16 20:30:58 +00005073
Guido van Rossum9e896b32000-04-05 20:11:21 +00005074/* --- Decimal Encoder ---------------------------------------------------- */
5075
5076int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005077 Py_ssize_t length,
5078 char *output,
5079 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005080{
5081 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 PyObject *errorHandler = NULL;
5083 PyObject *exc = NULL;
5084 const char *encoding = "decimal";
5085 const char *reason = "invalid decimal Unicode string";
5086 /* the following variable is used for caching string comparisons
5087 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5088 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005089
5090 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005091 PyErr_BadArgument();
5092 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005093 }
5094
5095 p = s;
5096 end = s + length;
5097 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005098 register Py_UNICODE ch = *p;
5099 int decimal;
5100 PyObject *repunicode;
5101 Py_ssize_t repsize;
5102 Py_ssize_t newpos;
5103 Py_UNICODE *uni2;
5104 Py_UNICODE *collstart;
5105 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005106
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005107 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005108 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 ++p;
5110 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005111 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005112 decimal = Py_UNICODE_TODECIMAL(ch);
5113 if (decimal >= 0) {
5114 *output++ = '0' + decimal;
5115 ++p;
5116 continue;
5117 }
5118 if (0 < ch && ch < 256) {
5119 *output++ = (char)ch;
5120 ++p;
5121 continue;
5122 }
5123 /* All other characters are considered unencodable */
5124 collstart = p;
5125 collend = p+1;
5126 while (collend < end) {
5127 if ((0 < *collend && *collend < 256) ||
5128 !Py_UNICODE_ISSPACE(*collend) ||
5129 Py_UNICODE_TODECIMAL(*collend))
5130 break;
5131 }
5132 /* cache callback name lookup
5133 * (if not done yet, i.e. it's the first error) */
5134 if (known_errorHandler==-1) {
5135 if ((errors==NULL) || (!strcmp(errors, "strict")))
5136 known_errorHandler = 1;
5137 else if (!strcmp(errors, "replace"))
5138 known_errorHandler = 2;
5139 else if (!strcmp(errors, "ignore"))
5140 known_errorHandler = 3;
5141 else if (!strcmp(errors, "xmlcharrefreplace"))
5142 known_errorHandler = 4;
5143 else
5144 known_errorHandler = 0;
5145 }
5146 switch (known_errorHandler) {
5147 case 1: /* strict */
5148 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5149 goto onError;
5150 case 2: /* replace */
5151 for (p = collstart; p < collend; ++p)
5152 *output++ = '?';
5153 /* fall through */
5154 case 3: /* ignore */
5155 p = collend;
5156 break;
5157 case 4: /* xmlcharrefreplace */
5158 /* generate replacement (temporarily (mis)uses p) */
5159 for (p = collstart; p < collend; ++p)
5160 output += sprintf(output, "&#%d;", (int)*p);
5161 p = collend;
5162 break;
5163 default:
5164 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5165 encoding, reason, s, length, &exc,
5166 collstart-s, collend-s, &newpos);
5167 if (repunicode == NULL)
5168 goto onError;
5169 /* generate replacement */
5170 repsize = PyUnicode_GET_SIZE(repunicode);
5171 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5172 Py_UNICODE ch = *uni2;
5173 if (Py_UNICODE_ISSPACE(ch))
5174 *output++ = ' ';
5175 else {
5176 decimal = Py_UNICODE_TODECIMAL(ch);
5177 if (decimal >= 0)
5178 *output++ = '0' + decimal;
5179 else if (0 < ch && ch < 256)
5180 *output++ = (char)ch;
5181 else {
5182 Py_DECREF(repunicode);
5183 raise_encode_exception(&exc, encoding,
5184 s, length, collstart-s, collend-s, reason);
5185 goto onError;
5186 }
5187 }
5188 }
5189 p = s + newpos;
5190 Py_DECREF(repunicode);
5191 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005192 }
5193 /* 0-terminate the output string */
5194 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 Py_XDECREF(exc);
5196 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005197 return 0;
5198
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005199 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 Py_XDECREF(exc);
5201 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005202 return -1;
5203}
5204
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205/* --- Helpers ------------------------------------------------------------ */
5206
Eric Smitha9f7d622008-02-17 19:46:49 +00005207#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005208
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005209#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005210
Fredrik Lundha50d2012006-05-26 17:04:58 +00005211#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005212
5213#include "stringlib/count.h"
5214#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005215#include "stringlib/partition.h"
5216
/* Helper macro that normalizes the slice bounds `start` and `end`
   (lvalues that must exist in the enclosing scope) against
   (obj)->length.

   A negative bound has the length added to it once; if it is still
   negative it becomes 0.  An `end` larger than the length is cut back
   to the length.  `start` is not clamped from above, so it may be
   larger than `end` after the macro has run.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement and can be used safely as the body of an
   unbraced if/else.  Invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5229
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005231 PyObject *substr,
5232 Py_ssize_t start,
5233 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005236 PyUnicodeObject* str_obj;
5237 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005238
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005239 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5240 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005241 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005242 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5243 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005244 Py_DECREF(str_obj);
5245 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 }
Tim Petersced69f82003-09-16 20:30:58 +00005247
Fredrik Lundhc8162812006-05-26 19:33:03 +00005248 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005249
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005250 result = stringlib_count(
5251 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5252 );
5253
5254 Py_DECREF(sub_obj);
5255 Py_DECREF(str_obj);
5256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 return result;
5258}
5259
Martin v. Löwis18e16552006-02-15 17:27:45 +00005260Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005261 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005262 Py_ssize_t start,
5263 Py_ssize_t end,
5264 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005266 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005267
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005268 str = PyUnicode_FromObject(str);
5269 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005270 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005271 sub = PyUnicode_FromObject(sub);
5272 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005273 Py_DECREF(str);
5274 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 }
Tim Petersced69f82003-09-16 20:30:58 +00005276
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005277 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005278 result = stringlib_find_slice(
5279 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5280 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5281 start, end
5282 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005283 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005284 result = stringlib_rfind_slice(
5285 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5286 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5287 start, end
5288 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005289
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005290 Py_DECREF(str);
5291 Py_DECREF(sub);
5292
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 return result;
5294}
5295
Tim Petersced69f82003-09-16 20:30:58 +00005296static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005298 PyUnicodeObject *substring,
5299 Py_ssize_t start,
5300 Py_ssize_t end,
5301 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 if (substring->length == 0)
5304 return 1;
5305
Fredrik Lundhc8162812006-05-26 19:33:03 +00005306 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
5308 end -= substring->length;
5309 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005310 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
5312 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005313 if (Py_UNICODE_MATCH(self, end, substring))
5314 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 } else {
5316 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005317 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 }
5319
5320 return 0;
5321}
5322
Martin v. Löwis18e16552006-02-15 17:27:45 +00005323Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005324 PyObject *substr,
5325 Py_ssize_t start,
5326 Py_ssize_t end,
5327 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005329 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005330
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 str = PyUnicode_FromObject(str);
5332 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005333 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 substr = PyUnicode_FromObject(substr);
5335 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005336 Py_DECREF(str);
5337 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 }
Tim Petersced69f82003-09-16 20:30:58 +00005339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005341 (PyUnicodeObject *)substr,
5342 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 Py_DECREF(str);
5344 Py_DECREF(substr);
5345 return result;
5346}
5347
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348/* Apply fixfct filter to the Unicode object self and return a
5349 reference to the modified object */
5350
Tim Petersced69f82003-09-16 20:30:58 +00005351static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005353 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
5355
5356 PyUnicodeObject *u;
5357
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005358 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005360 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005361
5362 Py_UNICODE_COPY(u->str, self->str, self->length);
5363
Tim Peters7a29bd52001-09-12 03:03:31 +00005364 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005365 /* fixfct should return TRUE if it modified the buffer. If
5366 FALSE, return a reference to the original buffer instead
5367 (to save space, not time) */
5368 Py_INCREF(self);
5369 Py_DECREF(u);
5370 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 }
5372 return (PyObject*) u;
5373}
5374
Tim Petersced69f82003-09-16 20:30:58 +00005375static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376int fixupper(PyUnicodeObject *self)
5377{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005378 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 Py_UNICODE *s = self->str;
5380 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005381
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005384
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005385 ch = Py_UNICODE_TOUPPER(*s);
5386 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005388 *s = ch;
5389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 s++;
5391 }
5392
5393 return status;
5394}
5395
Tim Petersced69f82003-09-16 20:30:58 +00005396static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397int fixlower(PyUnicodeObject *self)
5398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 Py_UNICODE *s = self->str;
5401 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005402
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005404 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005405
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005406 ch = Py_UNICODE_TOLOWER(*s);
5407 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005409 *s = ch;
5410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 s++;
5412 }
5413
5414 return status;
5415}
5416
Tim Petersced69f82003-09-16 20:30:58 +00005417static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418int fixswapcase(PyUnicodeObject *self)
5419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005420 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 Py_UNICODE *s = self->str;
5422 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 while (len-- > 0) {
5425 if (Py_UNICODE_ISUPPER(*s)) {
5426 *s = Py_UNICODE_TOLOWER(*s);
5427 status = 1;
5428 } else if (Py_UNICODE_ISLOWER(*s)) {
5429 *s = Py_UNICODE_TOUPPER(*s);
5430 status = 1;
5431 }
5432 s++;
5433 }
5434
5435 return status;
5436}
5437
Tim Petersced69f82003-09-16 20:30:58 +00005438static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439int fixcapitalize(PyUnicodeObject *self)
5440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005441 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005442 Py_UNICODE *s = self->str;
5443 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005444
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005445 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005447 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005448 *s = Py_UNICODE_TOUPPER(*s);
5449 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005451 s++;
5452 while (--len > 0) {
5453 if (Py_UNICODE_ISUPPER(*s)) {
5454 *s = Py_UNICODE_TOLOWER(*s);
5455 status = 1;
5456 }
5457 s++;
5458 }
5459 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460}
5461
5462static
5463int fixtitle(PyUnicodeObject *self)
5464{
5465 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5466 register Py_UNICODE *e;
5467 int previous_is_cased;
5468
5469 /* Shortcut for single character strings */
5470 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005471 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5472 if (*p != ch) {
5473 *p = ch;
5474 return 1;
5475 }
5476 else
5477 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 }
Tim Petersced69f82003-09-16 20:30:58 +00005479
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 e = p + PyUnicode_GET_SIZE(self);
5481 previous_is_cased = 0;
5482 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005483 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005484
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005485 if (previous_is_cased)
5486 *p = Py_UNICODE_TOLOWER(ch);
5487 else
5488 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005489
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005490 if (Py_UNICODE_ISLOWER(ch) ||
5491 Py_UNICODE_ISUPPER(ch) ||
5492 Py_UNICODE_ISTITLE(ch))
5493 previous_is_cased = 1;
5494 else
5495 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 }
5497 return 1;
5498}
5499
Tim Peters8ce9f162004-08-27 01:49:32 +00005500PyObject *
5501PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502{
Tim Peters8ce9f162004-08-27 01:49:32 +00005503 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005504 const Py_UNICODE blank = ' ';
5505 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005506 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005507 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005508 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5509 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005510 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5511 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005513 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005514 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 fseq = PySequence_Fast(seq, "");
5517 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005518 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005519 }
5520
Tim Peters91879ab2004-08-27 22:35:44 +00005521 /* Grrrr. A codec may be invoked to convert str objects to
5522 * Unicode, and so it's possible to call back into Python code
5523 * during PyUnicode_FromObject(), and so it's possible for a sick
5524 * codec to change the size of fseq (if seq is a list). Therefore
5525 * we have to keep refetching the size -- can't assume seqlen
5526 * is invariant.
5527 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 seqlen = PySequence_Fast_GET_SIZE(fseq);
5529 /* If empty sequence, return u"". */
5530 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005531 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5532 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005533 }
5534 /* If singleton sequence with an exact Unicode, return that. */
5535 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005536 item = PySequence_Fast_GET_ITEM(fseq, 0);
5537 if (PyUnicode_CheckExact(item)) {
5538 Py_INCREF(item);
5539 res = (PyUnicodeObject *)item;
5540 goto Done;
5541 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 }
5543
Tim Peters05eba1f2004-08-27 21:32:02 +00005544 /* At least two items to join, or one that isn't exact Unicode. */
5545 if (seqlen > 1) {
5546 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005547 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005548 sep = &blank;
5549 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005551 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005552 internal_separator = PyUnicode_FromObject(separator);
5553 if (internal_separator == NULL)
5554 goto onError;
5555 sep = PyUnicode_AS_UNICODE(internal_separator);
5556 seplen = PyUnicode_GET_SIZE(internal_separator);
5557 /* In case PyUnicode_FromObject() mutated seq. */
5558 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005559 }
5560 }
5561
5562 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005563 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005564 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005565 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005566 res_p = PyUnicode_AS_UNICODE(res);
5567 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005568
Tim Peters05eba1f2004-08-27 21:32:02 +00005569 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005570 Py_ssize_t itemlen;
5571 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005572
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005573 item = PySequence_Fast_GET_ITEM(fseq, i);
5574 /* Convert item to Unicode. */
5575 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5576 PyErr_Format(PyExc_TypeError,
5577 "sequence item %zd: expected string or Unicode,"
5578 " %.80s found",
5579 i, Py_TYPE(item)->tp_name);
5580 goto onError;
5581 }
5582 item = PyUnicode_FromObject(item);
5583 if (item == NULL)
5584 goto onError;
5585 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005586
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005587 /* In case PyUnicode_FromObject() mutated seq. */
5588 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005589
Tim Peters8ce9f162004-08-27 01:49:32 +00005590 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005591 itemlen = PyUnicode_GET_SIZE(item);
5592 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005593 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005594 goto Overflow;
5595 if (i < seqlen - 1) {
5596 new_res_used += seplen;
5597 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005598 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005599 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005600 if (new_res_used > res_alloc) {
5601 /* double allocated size until it's big enough */
5602 do {
5603 res_alloc += res_alloc;
5604 if (res_alloc <= 0)
5605 goto Overflow;
5606 } while (new_res_used > res_alloc);
5607 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5608 Py_DECREF(item);
5609 goto onError;
5610 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005611 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005612 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005613
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005614 /* Copy item, and maybe the separator. */
5615 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5616 res_p += itemlen;
5617 if (i < seqlen - 1) {
5618 Py_UNICODE_COPY(res_p, sep, seplen);
5619 res_p += seplen;
5620 }
5621 Py_DECREF(item);
5622 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005623 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005624
Tim Peters05eba1f2004-08-27 21:32:02 +00005625 /* Shrink res to match the used area; this probably can't fail,
5626 * but it's cheap to check.
5627 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005628 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005629 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005630
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005631 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005632 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005633 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 return (PyObject *)res;
5635
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005636 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005637 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005638 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005639 Py_DECREF(item);
5640 /* fall through */
5641
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005642 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005643 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005644 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005645 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 return NULL;
5647}
5648
Tim Petersced69f82003-09-16 20:30:58 +00005649static
5650PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005651 Py_ssize_t left,
5652 Py_ssize_t right,
5653 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654{
5655 PyUnicodeObject *u;
5656
5657 if (left < 0)
5658 left = 0;
5659 if (right < 0)
5660 right = 0;
5661
Tim Peters7a29bd52001-09-12 03:03:31 +00005662 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 Py_INCREF(self);
5664 return self;
5665 }
5666
Neal Norwitze7d8be82008-07-31 17:17:14 +00005667 if (left > PY_SSIZE_T_MAX - self->length ||
5668 right > PY_SSIZE_T_MAX - (left + self->length)) {
5669 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5670 return NULL;
5671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 u = _PyUnicode_New(left + self->length + right);
5673 if (u) {
5674 if (left)
5675 Py_UNICODE_FILL(u->str, fill, left);
5676 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5677 if (right)
5678 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5679 }
5680
5681 return u;
5682}
5683
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro relies on three names from the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an
   `onError` label that is jumped to when either the slice cannot be
   created or the append fails.  The temporary reference held in `str`
   is released on every path.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement; invoke it with a trailing semicolon.  Note that `left`
   is evaluated twice. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
5695static
5696PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005697 PyObject *list,
5698 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005700 register Py_ssize_t i;
5701 register Py_ssize_t j;
5702 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005704 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705
5706 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005707 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005708 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005709 i++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005710 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005711 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5712 i++;
5713 if (j < i) {
5714 if (maxcount-- <= 0)
5715 break;
5716 SPLIT_APPEND(buf, j, i);
5717 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5718 i++;
5719 j = i;
5720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 }
5722 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005723 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
5725 return list;
5726
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005727 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 Py_DECREF(list);
5729 return NULL;
5730}
5731
5732PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005733 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005735 register Py_ssize_t i;
5736 register Py_ssize_t j;
5737 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 PyObject *list;
5739 PyObject *str;
5740 Py_UNICODE *data;
5741
5742 string = PyUnicode_FromObject(string);
5743 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 data = PyUnicode_AS_UNICODE(string);
5746 len = PyUnicode_GET_SIZE(string);
5747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 list = PyList_New(0);
5749 if (!list)
5750 goto onError;
5751
5752 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005753 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005754
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005755 /* Find a line and append it */
5756 while (i < len && !BLOOM_LINEBREAK(data[i]))
5757 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005759 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005760 eol = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005761 if (i < len) {
5762 if (data[i] == '\r' && i + 1 < len &&
5763 data[i+1] == '\n')
5764 i += 2;
5765 else
5766 i++;
5767 if (keepends)
5768 eol = i;
5769 }
5770 SPLIT_APPEND(data, j, eol);
5771 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 }
5773 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005774 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 }
5776
5777 Py_DECREF(string);
5778 return list;
5779
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005780 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005781 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 Py_DECREF(string);
5783 return NULL;
5784}
5785
Tim Petersced69f82003-09-16 20:30:58 +00005786static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787PyObject *split_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005788 PyObject *list,
5789 Py_UNICODE ch,
5790 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 register Py_ssize_t i;
5793 register Py_ssize_t j;
5794 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005796 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
5798 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005799 if (buf[i] == ch) {
5800 if (maxcount-- <= 0)
5801 break;
5802 SPLIT_APPEND(buf, j, i);
5803 i = j = i + 1;
5804 } else
5805 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 }
5807 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005808 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 }
5810 return list;
5811
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005812 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 Py_DECREF(list);
5814 return NULL;
5815}
5816
Tim Petersced69f82003-09-16 20:30:58 +00005817static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818PyObject *split_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005819 PyObject *list,
5820 PyUnicodeObject *substring,
5821 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005823 register Py_ssize_t i;
5824 register Py_ssize_t j;
5825 Py_ssize_t len = self->length;
5826 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 PyObject *str;
5828
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005829 for (i = j = 0; i <= len - sublen; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005830 if (Py_UNICODE_MATCH(self, i, substring)) {
5831 if (maxcount-- <= 0)
5832 break;
5833 SPLIT_APPEND(self->str, j, i);
5834 i = j = i + sublen;
5835 } else
5836 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 }
5838 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005839 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 }
5841 return list;
5842
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_DECREF(list);
5845 return NULL;
5846}
5847
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005848static
5849PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005850 PyObject *list,
5851 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 register Py_ssize_t i;
5854 register Py_ssize_t j;
5855 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005857 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858
5859 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005860 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005861 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005862 i--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005863 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005864 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5865 i--;
5866 if (j > i) {
5867 if (maxcount-- <= 0)
5868 break;
5869 SPLIT_APPEND(buf, i + 1, j + 1);
5870 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5871 i--;
5872 j = i;
5873 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874 }
5875 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005876 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005877 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005878 if (PyList_Reverse(list) < 0)
5879 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005880 return list;
5881
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005882 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005883 Py_DECREF(list);
5884 return NULL;
5885}
5886
Benjamin Peterson857ce152009-01-31 16:29:18 +00005887static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005888PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005889 PyObject *list,
5890 Py_UNICODE ch,
5891 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005893 register Py_ssize_t i;
5894 register Py_ssize_t j;
5895 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005896 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005897 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005898
5899 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005900 if (buf[i] == ch) {
5901 if (maxcount-- <= 0)
5902 break;
5903 SPLIT_APPEND(buf, i + 1, j + 1);
5904 j = i = i - 1;
5905 } else
5906 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005907 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005908 if (j >= -1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005909 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005910 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005911 if (PyList_Reverse(list) < 0)
5912 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005913 return list;
5914
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005915 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005916 Py_DECREF(list);
5917 return NULL;
5918}
5919
Benjamin Peterson857ce152009-01-31 16:29:18 +00005920static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005921PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005922 PyObject *list,
5923 PyUnicodeObject *substring,
5924 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005925{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 register Py_ssize_t i;
5927 register Py_ssize_t j;
5928 Py_ssize_t len = self->length;
5929 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005930 PyObject *str;
5931
5932 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005933 if (Py_UNICODE_MATCH(self, i, substring)) {
5934 if (maxcount-- <= 0)
5935 break;
5936 SPLIT_APPEND(self->str, i + sublen, j);
5937 j = i;
5938 i -= sublen;
5939 } else
5940 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005941 }
5942 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005943 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005944 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005945 if (PyList_Reverse(list) < 0)
5946 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005947 return list;
5948
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005949 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005950 Py_DECREF(list);
5951 return NULL;
5952}
5953
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954#undef SPLIT_APPEND
5955
5956static
5957PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005958 PyUnicodeObject *substring,
5959 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960{
5961 PyObject *list;
5962
5963 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005964 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965
5966 list = PyList_New(0);
5967 if (!list)
5968 return NULL;
5969
5970 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005971 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
5973 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005974 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
5976 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005977 Py_DECREF(list);
5978 PyErr_SetString(PyExc_ValueError, "empty separator");
5979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 }
5981 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005982 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983}
5984
Tim Petersced69f82003-09-16 20:30:58 +00005985static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005986PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005987 PyUnicodeObject *substring,
5988 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005989{
5990 PyObject *list;
5991
5992 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005993 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994
5995 list = PyList_New(0);
5996 if (!list)
5997 return NULL;
5998
5999 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006000 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006001
6002 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006003 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004
6005 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006006 Py_DECREF(list);
6007 PyErr_SetString(PyExc_ValueError, "empty separator");
6008 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006009 }
6010 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006011 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012}
6013
6014static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006016 PyUnicodeObject *str1,
6017 PyUnicodeObject *str2,
6018 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019{
6020 PyUnicodeObject *u;
6021
6022 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006023 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
Fredrik Lundh347ee272006-05-24 16:35:18 +00006025 if (str1->length == str2->length) {
6026 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00006027 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006028 if (str1->length == 1) {
6029 /* replace characters */
6030 Py_UNICODE u1, u2;
6031 if (!findchar(self->str, self->length, str1->str[0]))
6032 goto nothing;
6033 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6034 if (!u)
6035 return NULL;
6036 Py_UNICODE_COPY(u->str, self->str, self->length);
6037 u1 = str1->str[0];
6038 u2 = str2->str[0];
6039 for (i = 0; i < u->length; i++)
6040 if (u->str[i] == u1) {
6041 if (--maxcount < 0)
6042 break;
6043 u->str[i] = u2;
6044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006046 i = fastsearch(
6047 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00006049 if (i < 0)
6050 goto nothing;
6051 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6052 if (!u)
6053 return NULL;
6054 Py_UNICODE_COPY(u->str, self->str, self->length);
6055 while (i <= self->length - str1->length)
6056 if (Py_UNICODE_MATCH(self, i, str1)) {
6057 if (--maxcount < 0)
6058 break;
6059 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6060 i += str1->length;
6061 } else
6062 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006065
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006066 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006067 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 Py_UNICODE *p;
6069
6070 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006071 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 if (n > maxcount)
6073 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006074 if (n == 0)
6075 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006076 /* new_size = self->length + n * (str2->length - str1->length)); */
6077 delta = (str2->length - str1->length);
6078 if (delta == 0) {
6079 new_size = self->length;
6080 } else {
6081 product = n * (str2->length - str1->length);
6082 if ((product / (str2->length - str1->length)) != n) {
6083 PyErr_SetString(PyExc_OverflowError,
6084 "replace string is too long");
6085 return NULL;
6086 }
6087 new_size = self->length + product;
6088 if (new_size < 0) {
6089 PyErr_SetString(PyExc_OverflowError,
6090 "replace string is too long");
6091 return NULL;
6092 }
6093 }
6094 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006095 if (!u)
6096 return NULL;
6097 i = 0;
6098 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006099 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006100 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006101 while (n-- > 0) {
6102 /* look for next match */
6103 j = i;
6104 while (j <= e) {
6105 if (Py_UNICODE_MATCH(self, j, str1))
6106 break;
6107 j++;
6108 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006109 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006110 if (j > e)
6111 break;
6112 /* copy unchanged part [i:j] */
6113 Py_UNICODE_COPY(p, self->str+i, j-i);
6114 p += j - i;
6115 }
6116 /* copy substitution string */
6117 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006118 Py_UNICODE_COPY(p, str2->str, str2->length);
6119 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006120 }
6121 i = j + str1->length;
6122 }
6123 if (i < self->length)
6124 /* copy tail [i:] */
6125 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006126 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006127 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006128 while (n > 0) {
6129 Py_UNICODE_COPY(p, str2->str, str2->length);
6130 p += str2->length;
6131 if (--n <= 0)
6132 break;
6133 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006135 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 }
6137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006139
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006140 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006141 /* nothing to replace; return original string (when possible) */
6142 if (PyUnicode_CheckExact(self)) {
6143 Py_INCREF(self);
6144 return (PyObject *) self;
6145 }
6146 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147}
6148
6149/* --- Unicode Object Methods --------------------------------------------- */
6150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006152 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153\n\
6154Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006155characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006158unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 return fixup(self, fixtitle);
6161}
6162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006163PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006164 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165\n\
6166Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006167have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
6169static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006170unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return fixup(self, fixcapitalize);
6173}
6174
#if 0
/* Disabled code: compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), run fixup(..., fixcapitalize) on
   every piece, then join the pieces with PyUnicode_Join(NULL, ...).
   Returns the joined string, or NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6212
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006213/* Argument converter. Coerces to a single unicode character */
6214
6215static int
6216convert_uc(PyObject *obj, void *addr)
6217{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006218 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6219 PyObject *uniobj;
6220 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006221
Benjamin Peterson857ce152009-01-31 16:29:18 +00006222 uniobj = PyUnicode_FromObject(obj);
6223 if (uniobj == NULL) {
6224 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006225 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006226 return 0;
6227 }
6228 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6229 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006230 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006231 Py_DECREF(uniobj);
6232 return 0;
6233 }
6234 unistr = PyUnicode_AS_UNICODE(uniobj);
6235 *fillcharloc = unistr[0];
6236 Py_DECREF(uniobj);
6237 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006238}
6239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006240PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006241 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006243Return S centered in a Unicode string of length width. Padding is\n\
6244done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245
6246static PyObject *
6247unicode_center(PyUnicodeObject *self, PyObject *args)
6248{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006249 Py_ssize_t marg, left;
6250 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006251 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252
Thomas Woutersde017742006-02-16 19:34:37 +00006253 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 return NULL;
6255
Tim Peters7a29bd52001-09-12 03:03:31 +00006256 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 Py_INCREF(self);
6258 return (PyObject*) self;
6259 }
6260
6261 marg = width - self->length;
6262 left = marg / 2 + (marg & width & 1);
6263
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006264 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Marc-André Lemburge5034372000-08-08 08:04:29 +00006267#if 0
6268
6269/* This code should go into some future Unicode collation support
6270 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006271 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006272
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006273/* speedy UTF-16 code point order comparison */
6274/* gleaned from: */
6275/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6276
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006277static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006278{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006279 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006280 0, 0, 0, 0, 0, 0, 0, 0,
6281 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006282 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006283};
6284
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285static int
6286unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6287{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006288 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 Py_UNICODE *s1 = str1->str;
6291 Py_UNICODE *s2 = str2->str;
6292
6293 len1 = str1->length;
6294 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006297 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006298
6299 c1 = *s1++;
6300 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006301
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006302 if (c1 > (1<<11) * 26)
6303 c1 += utf16Fixup[c1>>11];
6304 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006305 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006306 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006307
6308 if (c1 != c2)
6309 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006310
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006311 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
6313
6314 return (len1 < len2) ? -1 : (len1 != len2);
6315}
6316
Marc-André Lemburge5034372000-08-08 08:04:29 +00006317#else
6318
6319static int
6320unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6321{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006322 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006323
6324 Py_UNICODE *s1 = str1->str;
6325 Py_UNICODE *s2 = str2->str;
6326
6327 len1 = str1->length;
6328 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006329
Marc-André Lemburge5034372000-08-08 08:04:29 +00006330 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006331 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006332
Fredrik Lundh45714e92001-06-26 16:39:36 +00006333 c1 = *s1++;
6334 c2 = *s2++;
6335
6336 if (c1 != c2)
6337 return (c1 < c2) ? -1 : 1;
6338
Marc-André Lemburge5034372000-08-08 08:04:29 +00006339 len1--; len2--;
6340 }
6341
6342 return (len1 < len2) ? -1 : (len1 != len2);
6343}
6344
6345#endif
6346
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006348 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
6350 PyUnicodeObject *u = NULL, *v = NULL;
6351 int result;
6352
6353 /* Coerce the two arguments */
6354 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6355 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006356 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6358 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
Thomas Wouters7e474022000-07-16 12:04:32 +00006361 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006363 Py_DECREF(u);
6364 Py_DECREF(v);
6365 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 }
6367
6368 result = unicode_compare(u, v);
6369
6370 Py_DECREF(u);
6371 Py_DECREF(v);
6372 return result;
6373
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006374 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 Py_XDECREF(u);
6376 Py_XDECREF(v);
6377 return -1;
6378}
6379
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006380PyObject *PyUnicode_RichCompare(PyObject *left,
6381 PyObject *right,
6382 int op)
6383{
6384 int result;
6385
6386 result = PyUnicode_Compare(left, right);
6387 if (result == -1 && PyErr_Occurred())
6388 goto onError;
6389
6390 /* Convert the return value to a Boolean */
6391 switch (op) {
6392 case Py_EQ:
6393 result = (result == 0);
6394 break;
6395 case Py_NE:
6396 result = (result != 0);
6397 break;
6398 case Py_LE:
6399 result = (result <= 0);
6400 break;
6401 case Py_GE:
6402 result = (result >= 0);
6403 break;
6404 case Py_LT:
6405 result = (result == -1);
6406 break;
6407 case Py_GT:
6408 result = (result == 1);
6409 break;
6410 }
6411 return PyBool_FromLong(result);
6412
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006413 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006414
6415 /* Standard case
6416
6417 Type errors mean that PyUnicode_FromObject() could not convert
6418 one of the arguments (usually the right hand side) to Unicode,
6419 ie. we can't handle the comparison request. However, it is
6420 possible that the other object knows a comparison method, which
6421 is why we return Py_NotImplemented to give the other object a
6422 chance.
6423
6424 */
6425 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6426 PyErr_Clear();
6427 Py_INCREF(Py_NotImplemented);
6428 return Py_NotImplemented;
6429 }
6430 if (op != Py_EQ && op != Py_NE)
6431 return NULL;
6432
6433 /* Equality comparison.
6434
6435 This is a special case: we silence any PyExc_UnicodeDecodeError
6436 and instead turn it into a PyErr_UnicodeWarning.
6437
6438 */
6439 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6440 return NULL;
6441 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006442 if (PyErr_Warn(PyExc_UnicodeWarning,
6443 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006444 "Unicode equal comparison "
6445 "failed to convert both arguments to Unicode - "
6446 "interpreting them as being unequal" :
6447 "Unicode unequal comparison "
6448 "failed to convert both arguments to Unicode - "
6449 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006450 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006451 return NULL;
6452 result = (op == Py_NE);
6453 return PyBool_FromLong(result);
6454}
6455
Guido van Rossum403d68b2000-03-13 15:55:09 +00006456int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006457 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006458{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006459 PyObject *str, *sub;
6460 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006461
6462 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006463 sub = PyUnicode_FromObject(element);
6464 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006465 PyErr_SetString(PyExc_TypeError,
6466 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006467 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006468 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006469
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006470 str = PyUnicode_FromObject(container);
6471 if (!str) {
6472 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006473 return -1;
6474 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006475
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006476 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006477
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006478 Py_DECREF(str);
6479 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006480
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006481 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006482}
6483
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484/* Concat to string or Unicode object giving a new Unicode object. */
6485
6486PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006487 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488{
6489 PyUnicodeObject *u = NULL, *v = NULL, *w;
6490
6491 /* Coerce the two arguments */
6492 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6493 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6496 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
6499 /* Shortcuts */
6500 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006501 Py_DECREF(v);
6502 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 }
6504 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006505 Py_DECREF(u);
6506 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 }
6508
6509 /* Concat the two Unicode strings */
6510 w = _PyUnicode_New(u->length + v->length);
6511 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006512 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 Py_UNICODE_COPY(w->str, u->str, u->length);
6514 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6515
6516 Py_DECREF(u);
6517 Py_DECREF(v);
6518 return (PyObject *)w;
6519
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006520 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 Py_XDECREF(u);
6522 Py_XDECREF(v);
6523 return NULL;
6524}
6525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006526PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006527 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006529Return the number of non-overlapping occurrences of substring sub in\n\
6530Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006531interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
6533static PyObject *
6534unicode_count(PyUnicodeObject *self, PyObject *args)
6535{
6536 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006538 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 PyObject *result;
6540
Guido van Rossumb8872e62000-05-09 14:14:27 +00006541 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006542 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 return NULL;
6544
6545 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006546 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006548 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006549
Fredrik Lundhc8162812006-05-26 19:33:03 +00006550 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006552 result = PyInt_FromSsize_t(
6553 stringlib_count(self->str + start, end - start,
6554 substring->str, substring->length)
6555 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556
6557 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006558
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 return result;
6560}
6561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006562PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006563 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006565Encodes S using the codec registered for encoding. encoding defaults\n\
6566to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006567handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006568a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6569'xmlcharrefreplace' as well as any other name registered with\n\
6570codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
6572static PyObject *
6573unicode_encode(PyUnicodeObject *self, PyObject *args)
6574{
6575 char *encoding = NULL;
6576 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006577 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006578
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6580 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006581 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006582 if (v == NULL)
6583 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006584 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006585 PyErr_Format(PyExc_TypeError,
6586 "encoder did not return a string/unicode object "
6587 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006588 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006589 Py_DECREF(v);
6590 return NULL;
6591 }
6592 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006593
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006594 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006595 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006596}
6597
6598PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006599 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006600\n\
6601Decodes S using the codec registered for encoding. encoding defaults\n\
6602to the default encoding. errors may be given to set a different error\n\
6603handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6604a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6605as well as any other name registerd with codecs.register_error that is\n\
6606able to handle UnicodeDecodeErrors.");
6607
6608static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006609unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006610{
6611 char *encoding = NULL;
6612 char *errors = NULL;
6613 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006614
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006615 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6616 return NULL;
6617 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006618 if (v == NULL)
6619 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006620 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006621 PyErr_Format(PyExc_TypeError,
6622 "decoder did not return a string/unicode object "
6623 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006624 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006625 Py_DECREF(v);
6626 return NULL;
6627 }
6628 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006629
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006630 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632}
6633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006635 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636\n\
6637Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006638If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
6640static PyObject*
6641unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6642{
6643 Py_UNICODE *e;
6644 Py_UNICODE *p;
6645 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006646 Py_UNICODE *qe;
6647 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 PyUnicodeObject *u;
6649 int tabsize = 8;
6650
6651 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
Thomas Wouters7e474022000-07-16 12:04:32 +00006654 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006655 i = 0; /* chars up to and including most recent \n or \r */
6656 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6657 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 for (p = self->str; p < e; p++)
6659 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006660 if (tabsize > 0) {
6661 incr = tabsize - (j % tabsize); /* cannot overflow */
6662 if (j > PY_SSIZE_T_MAX - incr)
6663 goto overflow1;
6664 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006665 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006668 if (j > PY_SSIZE_T_MAX - 1)
6669 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 j++;
6671 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006672 if (i > PY_SSIZE_T_MAX - j)
6673 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006675 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 }
6677 }
6678
Guido van Rossum5bdff602008-03-11 21:18:06 +00006679 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006680 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006681
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 /* Second pass: create output string and fill it */
6683 u = _PyUnicode_New(i + j);
6684 if (!u)
6685 return NULL;
6686
Guido van Rossum5bdff602008-03-11 21:18:06 +00006687 j = 0; /* same as in first pass */
6688 q = u->str; /* next output char */
6689 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
6691 for (p = self->str; p < e; p++)
6692 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006693 if (tabsize > 0) {
6694 i = tabsize - (j % tabsize);
6695 j += i;
6696 while (i--) {
6697 if (q >= qe)
6698 goto overflow2;
6699 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006700 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006701 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006702 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006703 else {
6704 if (q >= qe)
6705 goto overflow2;
6706 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006707 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 if (*p == '\n' || *p == '\r')
6709 j = 0;
6710 }
6711
6712 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006713
6714 overflow2:
6715 Py_DECREF(u);
6716 overflow1:
6717 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719}
6720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006721PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006722 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723\n\
6724Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006725such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726arguments start and end are interpreted as in slice notation.\n\
6727\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006728Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
6730static PyObject *
6731unicode_find(PyUnicodeObject *self, PyObject *args)
6732{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006733 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006734 Py_ssize_t start;
6735 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006736 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
Facundo Batista57d56692007-11-16 18:04:14 +00006738 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006741 result = stringlib_find_slice(
6742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6744 start, end
6745 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
6747 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006748
6749 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750}
6751
6752static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006753unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
6755 if (index < 0 || index >= self->length) {
6756 PyErr_SetString(PyExc_IndexError, "string index out of range");
6757 return NULL;
6758 }
6759
6760 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6761}
6762
6763static long
6764unicode_hash(PyUnicodeObject *self)
6765{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006766 /* Since Unicode objects compare equal to their ASCII string
6767 counterparts, they should use the individual character values
6768 as basis for their hash value. This is needed to assure that
6769 strings and Unicode objects behave in the same way as
6770 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771
Martin v. Löwis18e16552006-02-15 17:27:45 +00006772 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006773 register Py_UNICODE *p;
6774 register long x;
6775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006777 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006778 len = PyUnicode_GET_SIZE(self);
6779 p = PyUnicode_AS_UNICODE(self);
6780 x = *p << 7;
6781 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006782 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006783 x ^= PyUnicode_GET_SIZE(self);
6784 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006785 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006786 self->hash = x;
6787 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006790PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006793Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794
6795static PyObject *
6796unicode_index(PyUnicodeObject *self, PyObject *args)
6797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006798 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006799 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006800 Py_ssize_t start;
6801 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
Facundo Batista57d56692007-11-16 18:04:14 +00006803 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006806 result = stringlib_find_slice(
6807 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6808 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6809 start, end
6810 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 if (result < 0) {
6815 PyErr_SetString(PyExc_ValueError, "substring not found");
6816 return NULL;
6817 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006818
Martin v. Löwis18e16552006-02-15 17:27:45 +00006819 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006823 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006826at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
6828static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006829unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830{
6831 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6832 register const Py_UNICODE *e;
6833 int cased;
6834
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 /* Shortcut for single character strings */
6836 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006837 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006839 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006840 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006841 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 e = p + PyUnicode_GET_SIZE(self);
6844 cased = 0;
6845 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006846 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006847
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6849 return PyBool_FromLong(0);
6850 else if (!cased && Py_UNICODE_ISLOWER(ch))
6851 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854}
6855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006857 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006859Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006860at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
6862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006863unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864{
6865 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6866 register const Py_UNICODE *e;
6867 int cased;
6868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006871 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006873 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006874 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006875 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006876
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 e = p + PyUnicode_GET_SIZE(self);
6878 cased = 0;
6879 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006880 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006881
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006882 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6883 return PyBool_FromLong(0);
6884 else if (!cased && Py_UNICODE_ISUPPER(ch))
6885 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888}
6889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006891 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006893Return True if S is a titlecased string and there is at least one\n\
6894character in S, i.e. upper- and titlecase characters may only\n\
6895follow uncased characters and lowercase characters only cased ones.\n\
6896Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
6898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006899unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6902 register const Py_UNICODE *e;
6903 int cased, previous_is_cased;
6904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 /* Shortcut for single character strings */
6906 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006907 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6908 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006910 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006911 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006912 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 e = p + PyUnicode_GET_SIZE(self);
6915 cased = 0;
6916 previous_is_cased = 0;
6917 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006918 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006919
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006920 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6921 if (previous_is_cased)
6922 return PyBool_FromLong(0);
6923 previous_is_cased = 1;
6924 cased = 1;
6925 }
6926 else if (Py_UNICODE_ISLOWER(ch)) {
6927 if (!previous_is_cased)
6928 return PyBool_FromLong(0);
6929 previous_is_cased = 1;
6930 cased = 1;
6931 }
6932 else
6933 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006939 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006941Return True if all characters in S are whitespace\n\
6942and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 Py_UNICODE_ISSPACE(*p))
6953 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006961 if (!Py_UNICODE_ISSPACE(*p))
6962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965}
6966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006968 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006969\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006970Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006972
6973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006974unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006975{
6976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977 register const Py_UNICODE *e;
6978
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006979 /* Shortcut for single character strings */
6980 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 Py_UNICODE_ISALPHA(*p))
6982 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006983
6984 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006985 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006986 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006987
6988 e = p + PyUnicode_GET_SIZE(self);
6989 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006990 if (!Py_UNICODE_ISALPHA(*p))
6991 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006994}
6995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006996PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006997 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006998\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006999Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007000and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007001
7002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007003unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007004{
7005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006 register const Py_UNICODE *e;
7007
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007008 /* Shortcut for single character strings */
7009 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007010 Py_UNICODE_ISALNUM(*p))
7011 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007012
7013 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007014 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007015 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007016
7017 e = p + PyUnicode_GET_SIZE(self);
7018 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007019 if (!Py_UNICODE_ISALNUM(*p))
7020 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007022 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007026 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007028Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007032unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
7034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035 register const Py_UNICODE *e;
7036
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 /* Shortcut for single character strings */
7038 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007039 Py_UNICODE_ISDECIMAL(*p))
7040 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007042 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007043 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007044 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007045
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 e = p + PyUnicode_GET_SIZE(self);
7047 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007048 if (!Py_UNICODE_ISDECIMAL(*p))
7049 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007051 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007055 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007057Return True if all characters in S are digits\n\
7058and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007061unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062{
7063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7064 register const Py_UNICODE *e;
7065
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 /* Shortcut for single character strings */
7067 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007068 Py_UNICODE_ISDIGIT(*p))
7069 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007071 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007072 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007073 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007074
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 e = p + PyUnicode_GET_SIZE(self);
7076 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007077 if (!Py_UNICODE_ISDIGIT(*p))
7078 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007080 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081}
7082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007083PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007084 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007086Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088
7089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007090unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091{
7092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7093 register const Py_UNICODE *e;
7094
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 /* Shortcut for single character strings */
7096 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007097 Py_UNICODE_ISNUMERIC(*p))
7098 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007100 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007101 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007102 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007103
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 e = p + PyUnicode_GET_SIZE(self);
7105 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007106 if (!Py_UNICODE_ISNUMERIC(*p))
7107 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007109 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(join__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007113 "S.join(sequence) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
7115Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117
7118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007119unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007121 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122}
7123
Martin v. Löwis18e16552006-02-15 17:27:45 +00007124static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125unicode_length(PyUnicodeObject *self)
7126{
7127 return self->length;
7128}
7129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007130PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007131 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007133Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007134done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135
7136static PyObject *
7137unicode_ljust(PyUnicodeObject *self, PyObject *args)
7138{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007139 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007140 Py_UNICODE fillchar = ' ';
7141
Martin v. Löwis412fb672006-04-13 06:34:32 +00007142 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 return NULL;
7144
Tim Peters7a29bd52001-09-12 03:03:31 +00007145 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 Py_INCREF(self);
7147 return (PyObject*) self;
7148 }
7149
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007150 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151}
7152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007153PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007154 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
7158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007159unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 return fixup(self, fixlower);
7162}
7163
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7172
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173/* externally visible for str.strip(unicode) */
7174PyObject *
7175_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7176{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007177 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7178 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7179 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7180 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7181 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007183 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007184
Benjamin Peterson857ce152009-01-31 16:29:18 +00007185 i = 0;
7186 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007187 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7188 i++;
7189 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007190 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007191
Benjamin Peterson857ce152009-01-31 16:29:18 +00007192 j = len;
7193 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007194 do {
7195 j--;
7196 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7197 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007198 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199
Benjamin Peterson857ce152009-01-31 16:29:18 +00007200 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007201 Py_INCREF(self);
7202 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007203 }
7204 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007205 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007206}
7207
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208
7209static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007210do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007212 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7213 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007214
Benjamin Peterson857ce152009-01-31 16:29:18 +00007215 i = 0;
7216 if (striptype != RIGHTSTRIP) {
7217 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7218 i++;
7219 }
7220 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007221
Benjamin Peterson857ce152009-01-31 16:29:18 +00007222 j = len;
7223 if (striptype != LEFTSTRIP) {
7224 do {
7225 j--;
7226 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7227 j++;
7228 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007229
Benjamin Peterson857ce152009-01-31 16:29:18 +00007230 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7231 Py_INCREF(self);
7232 return (PyObject*)self;
7233 }
7234 else
7235 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236}
7237
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007238
7239static PyObject *
7240do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7241{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007242 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007243
Benjamin Peterson857ce152009-01-31 16:29:18 +00007244 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7245 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007246
Benjamin Peterson857ce152009-01-31 16:29:18 +00007247 if (sep != NULL && sep != Py_None) {
7248 if (PyUnicode_Check(sep))
7249 return _PyUnicode_XStrip(self, striptype, sep);
7250 else if (PyString_Check(sep)) {
7251 PyObject *res;
7252 sep = PyUnicode_FromObject(sep);
7253 if (sep==NULL)
7254 return NULL;
7255 res = _PyUnicode_XStrip(self, striptype, sep);
7256 Py_DECREF(sep);
7257 return res;
7258 }
7259 else {
7260 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007261 "%s arg must be None, unicode or str",
7262 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007263 return NULL;
7264 }
7265 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007266
Benjamin Peterson857ce152009-01-31 16:29:18 +00007267 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007268}
7269
7270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007271PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007272 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007273\n\
7274Return a copy of the string S with leading and trailing\n\
7275whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007276If chars is given and not None, remove characters in chars instead.\n\
7277If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007278
7279static PyObject *
7280unicode_strip(PyUnicodeObject *self, PyObject *args)
7281{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007282 if (PyTuple_GET_SIZE(args) == 0)
7283 return do_strip(self, BOTHSTRIP); /* Common case */
7284 else
7285 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007286}
7287
7288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007289PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007290 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007291\n\
7292Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007293If chars is given and not None, remove characters in chars instead.\n\
7294If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007295
7296static PyObject *
7297unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7298{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007299 if (PyTuple_GET_SIZE(args) == 0)
7300 return do_strip(self, LEFTSTRIP); /* Common case */
7301 else
7302 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007303}
7304
7305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007307 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007308\n\
7309Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007310If chars is given and not None, remove characters in chars instead.\n\
7311If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007312
7313static PyObject *
7314unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7315{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007316 if (PyTuple_GET_SIZE(args) == 0)
7317 return do_strip(self, RIGHTSTRIP); /* Common case */
7318 else
7319 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007320}
7321
7322
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007324unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325{
7326 PyUnicodeObject *u;
7327 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007328 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007329 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331 if (len < 0)
7332 len = 0;
7333
Tim Peters7a29bd52001-09-12 03:03:31 +00007334 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 /* no repeat, return original string */
7336 Py_INCREF(str);
7337 return (PyObject*) str;
7338 }
Tim Peters8f422462000-09-09 06:13:41 +00007339
7340 /* ensure # of chars needed doesn't overflow int and # of bytes
7341 * needed doesn't overflow size_t
7342 */
7343 nchars = len * str->length;
7344 if (len && nchars / len != str->length) {
7345 PyErr_SetString(PyExc_OverflowError,
7346 "repeated string is too long");
7347 return NULL;
7348 }
7349 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7350 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7351 PyErr_SetString(PyExc_OverflowError,
7352 "repeated string is too long");
7353 return NULL;
7354 }
7355 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 if (!u)
7357 return NULL;
7358
7359 p = u->str;
7360
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007361 if (str->length == 1 && len > 0) {
7362 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007363 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007364 Py_ssize_t done = 0; /* number of characters copied this far */
7365 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007366 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007367 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007368 }
7369 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007370 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007371 Py_UNICODE_COPY(p+done, p, n);
7372 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007373 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
7376 return (PyObject*) u;
7377}
7378
7379PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007380 PyObject *subobj,
7381 PyObject *replobj,
7382 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383{
7384 PyObject *self;
7385 PyObject *str1;
7386 PyObject *str2;
7387 PyObject *result;
7388
7389 self = PyUnicode_FromObject(obj);
7390 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 str1 = PyUnicode_FromObject(subobj);
7393 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007394 Py_DECREF(self);
7395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 }
7397 str2 = PyUnicode_FromObject(replobj);
7398 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007399 Py_DECREF(self);
7400 Py_DECREF(str1);
7401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 }
Tim Petersced69f82003-09-16 20:30:58 +00007403 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007404 (PyUnicodeObject *)str1,
7405 (PyUnicodeObject *)str2,
7406 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 Py_DECREF(self);
7408 Py_DECREF(str1);
7409 Py_DECREF(str2);
7410 return result;
7411}
7412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007413PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007414 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415\n\
7416Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007417old replaced by new. If the optional argument count is\n\
7418given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
7420static PyObject*
7421unicode_replace(PyUnicodeObject *self, PyObject *args)
7422{
7423 PyUnicodeObject *str1;
7424 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007425 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 PyObject *result;
7427
Martin v. Löwis18e16552006-02-15 17:27:45 +00007428 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 return NULL;
7430 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7431 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007434 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007435 Py_DECREF(str1);
7436 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
7439 result = replace(self, str1, str2, maxcount);
7440
7441 Py_DECREF(str1);
7442 Py_DECREF(str2);
7443 return result;
7444}
7445
7446static
7447PyObject *unicode_repr(PyObject *unicode)
7448{
7449 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007450 PyUnicode_GET_SIZE(unicode),
7451 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452}
7453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007454PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007455 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456\n\
7457Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007458such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459arguments start and end are interpreted as in slice notation.\n\
7460\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007461Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462
7463static PyObject *
7464unicode_rfind(PyUnicodeObject *self, PyObject *args)
7465{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007466 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007467 Py_ssize_t start;
7468 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007469 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
Facundo Batista57d56692007-11-16 18:04:14 +00007471 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007474 result = stringlib_rfind_slice(
7475 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7476 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7477 start, end
7478 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
7480 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007481
7482 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483}
7484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007485PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007486 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489
7490static PyObject *
7491unicode_rindex(PyUnicodeObject *self, PyObject *args)
7492{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007493 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007494 Py_ssize_t start;
7495 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007496 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
Facundo Batista57d56692007-11-16 18:04:14 +00007498 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007501 result = stringlib_rfind_slice(
7502 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7503 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7504 start, end
7505 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
7507 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007508
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 if (result < 0) {
7510 PyErr_SetString(PyExc_ValueError, "substring not found");
7511 return NULL;
7512 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514}
7515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007516PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007517 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007519Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007520done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521
7522static PyObject *
7523unicode_rjust(PyUnicodeObject *self, PyObject *args)
7524{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007525 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007526 Py_UNICODE fillchar = ' ';
7527
Martin v. Löwis412fb672006-04-13 06:34:32 +00007528 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 return NULL;
7530
Tim Peters7a29bd52001-09-12 03:03:31 +00007531 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 Py_INCREF(self);
7533 return (PyObject*) self;
7534 }
7535
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007536 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537}
7538
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541{
7542 /* standard clamping */
7543 if (start < 0)
7544 start = 0;
7545 if (end < 0)
7546 end = 0;
7547 if (end > self->length)
7548 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007549 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 /* full slice, return original string */
7551 Py_INCREF(self);
7552 return (PyObject*) self;
7553 }
7554 if (start > end)
7555 start = end;
7556 /* copy slice */
7557 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007558 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559}
7560
7561PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007562 PyObject *sep,
7563 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
7565 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007566
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 s = PyUnicode_FromObject(s);
7568 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007569 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007570 if (sep != NULL) {
7571 sep = PyUnicode_FromObject(sep);
7572 if (sep == NULL) {
7573 Py_DECREF(s);
7574 return NULL;
7575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 }
7577
7578 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7579
7580 Py_DECREF(s);
7581 Py_XDECREF(sep);
7582 return result;
7583}
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007586 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587\n\
7588Return a list of the words in S, using sep as the\n\
7589delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007590splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007591whitespace string is a separator and empty strings are\n\
7592removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
7594static PyObject*
7595unicode_split(PyUnicodeObject *self, PyObject *args)
7596{
7597 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007598 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
Martin v. Löwis18e16552006-02-15 17:27:45 +00007600 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 return NULL;
7602
7603 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007604 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007606 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007608 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609}
7610
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007611PyObject *
7612PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7613{
7614 PyObject* str_obj;
7615 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007616 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007617
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007618 str_obj = PyUnicode_FromObject(str_in);
7619 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007620 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007621 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007622 if (!sep_obj) {
7623 Py_DECREF(str_obj);
7624 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007625 }
7626
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007627 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007628 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7629 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7630 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007631
Fredrik Lundhb9479482006-05-26 17:22:38 +00007632 Py_DECREF(sep_obj);
7633 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007634
7635 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007636}
7637
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007638
7639PyObject *
7640PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7641{
7642 PyObject* str_obj;
7643 PyObject* sep_obj;
7644 PyObject* out;
7645
7646 str_obj = PyUnicode_FromObject(str_in);
7647 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007648 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007649 sep_obj = PyUnicode_FromObject(sep_in);
7650 if (!sep_obj) {
7651 Py_DECREF(str_obj);
7652 return NULL;
7653 }
7654
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007655 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007656 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7657 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7658 );
7659
7660 Py_DECREF(sep_obj);
7661 Py_DECREF(str_obj);
7662
7663 return out;
7664}
7665
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007666PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007667 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007668\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007669Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007670the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007671found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007672
7673static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007674unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007675{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007676 return PyUnicode_Partition((PyObject *)self, separator);
7677}
7678
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007679PyDoc_STRVAR(rpartition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007680 "S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007681\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007682Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007683the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007684separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007685
7686static PyObject*
7687unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7688{
7689 return PyUnicode_RPartition((PyObject *)self, separator);
7690}
7691
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007692PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007693 PyObject *sep,
7694 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007695{
7696 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007697
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007698 s = PyUnicode_FromObject(s);
7699 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007700 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007701 if (sep != NULL) {
7702 sep = PyUnicode_FromObject(sep);
7703 if (sep == NULL) {
7704 Py_DECREF(s);
7705 return NULL;
7706 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007707 }
7708
7709 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7710
7711 Py_DECREF(s);
7712 Py_XDECREF(sep);
7713 return result;
7714}
7715
7716PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007717 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007718\n\
7719Return a list of the words in S, using sep as the\n\
7720delimiter string, starting at the end of the string and\n\
7721working to the front. If maxsplit is given, at most maxsplit\n\
7722splits are done. If sep is not specified, any whitespace string\n\
7723is a separator.");
7724
7725static PyObject*
7726unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7727{
7728 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007729 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007730
Martin v. Löwis18e16552006-02-15 17:27:45 +00007731 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007732 return NULL;
7733
7734 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007735 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007736 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007737 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007738 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007739 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007740}
7741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007742PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007743 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744\n\
7745Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007746Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
7749static PyObject*
7750unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7751{
Guido van Rossum86662912000-04-11 15:38:46 +00007752 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
Guido van Rossum86662912000-04-11 15:38:46 +00007754 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 return NULL;
7756
Guido van Rossum86662912000-04-11 15:38:46 +00007757 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
7760static
7761PyObject *unicode_str(PyUnicodeObject *self)
7762{
Fred Drakee4315f52000-05-09 19:53:39 +00007763 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764}
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007767 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768\n\
7769Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007770and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
7772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007773unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 return fixup(self, fixswapcase);
7776}
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007779 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780\n\
7781Return a copy of the string S, where all characters have been mapped\n\
7782through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007783Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7784Unmapped characters are left untouched. Characters mapped to None\n\
7785are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786
7787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007788unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
Tim Petersced69f82003-09-16 20:30:58 +00007790 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007791 self->length,
7792 table,
7793 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794}
7795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007797 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800
7801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 return fixup(self, fixupper);
7805}
7806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007807PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007808 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809\n\
Georg Brandl98064072008-09-09 19:26:00 +00007810Pad a numeric string S with zeros on the left, to fill a field\n\
7811of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
7813static PyObject *
7814unicode_zfill(PyUnicodeObject *self, PyObject *args)
7815{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007816 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 PyUnicodeObject *u;
7818
Martin v. Löwis18e16552006-02-15 17:27:45 +00007819 Py_ssize_t width;
7820 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 return NULL;
7822
7823 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007824 if (PyUnicode_CheckExact(self)) {
7825 Py_INCREF(self);
7826 return (PyObject*) self;
7827 }
7828 else
7829 return PyUnicode_FromUnicode(
7830 PyUnicode_AS_UNICODE(self),
7831 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007832 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 }
7834
7835 fill = width - self->length;
7836
7837 u = pad(self, fill, 0, '0');
7838
Walter Dörwald068325e2002-04-15 13:36:47 +00007839 if (u == NULL)
7840 return NULL;
7841
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 if (u->str[fill] == '+' || u->str[fill] == '-') {
7843 /* move sign to beginning of string */
7844 u->str[0] = u->str[fill];
7845 u->str[fill] = '0';
7846 }
7847
7848 return (PyObject*) u;
7849}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850
#if 0
/* Compiled out.  Returns the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007860 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007862Return True if S starts with the specified prefix, False otherwise.\n\
7863With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007864With optional end, stop comparing S at that position.\n\
7865prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
7867static PyObject *
7868unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007869 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870{
Georg Brandl24250812006-06-09 18:45:48 +00007871 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007874 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007875 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876
Georg Brandl24250812006-06-09 18:45:48 +00007877 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007878 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7879 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007880 if (PyTuple_Check(subobj)) {
7881 Py_ssize_t i;
7882 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007884 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007885 if (substring == NULL)
7886 return NULL;
7887 result = tailmatch(self, substring, start, end, -1);
7888 Py_DECREF(substring);
7889 if (result) {
7890 Py_RETURN_TRUE;
7891 }
7892 }
7893 /* nothing matched */
7894 Py_RETURN_FALSE;
7895 }
7896 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007898 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007899 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007901 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902}
7903
7904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007905PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007906 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007908Return True if S ends with the specified suffix, False otherwise.\n\
7909With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007910With optional end, stop comparing S at that position.\n\
7911suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912
7913static PyObject *
7914unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007915 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916{
Georg Brandl24250812006-06-09 18:45:48 +00007917 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007919 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007920 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007921 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922
Georg Brandl24250812006-06-09 18:45:48 +00007923 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007924 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7925 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007926 if (PyTuple_Check(subobj)) {
7927 Py_ssize_t i;
7928 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7929 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007930 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007931 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007932 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007933 result = tailmatch(self, substring, start, end, +1);
7934 Py_DECREF(substring);
7935 if (result) {
7936 Py_RETURN_TRUE;
7937 }
7938 }
7939 Py_RETURN_FALSE;
7940 }
7941 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944
Georg Brandl24250812006-06-09 18:45:48 +00007945 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007947 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948}
7949
7950
Eric Smitha9f7d622008-02-17 19:46:49 +00007951/* Implements do_string_format, which is unicode because of stringlib */
7952#include "stringlib/string_format.h"
7953
7954PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007955 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007956\n\
7957");
7958
Eric Smithdc13b792008-05-30 18:10:04 +00007959static PyObject *
7960unicode__format__(PyObject *self, PyObject *args)
7961{
7962 PyObject *format_spec;
7963 PyObject *result = NULL;
7964 PyObject *tmp = NULL;
7965
7966 /* If 2.x, convert format_spec to the same type as value */
7967 /* This is to allow things like u''.format('') */
7968 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7969 goto done;
7970 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7971 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007972 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007973 goto done;
7974 }
7975 tmp = PyObject_Unicode(format_spec);
7976 if (tmp == NULL)
7977 goto done;
7978 format_spec = tmp;
7979
7980 result = _PyUnicode_FormatAdvanced(self,
7981 PyUnicode_AS_UNICODE(format_spec),
7982 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007983 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007984 Py_XDECREF(tmp);
7985 return result;
7986}
7987
Eric Smitha9f7d622008-02-17 19:46:49 +00007988PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007989 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007990\n\
7991");
7992
Robert Schuppenies901c9972008-06-10 10:10:31 +00007993static PyObject *
7994unicode__sizeof__(PyUnicodeObject *v)
7995{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007996 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7997 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007998}
7999
8000PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008001 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00008002\n\
8003");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008004
8005static PyObject *
8006unicode_getnewargs(PyUnicodeObject *v)
8007{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008008 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008009}
8010
8011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012static PyMethodDef unicode_methods[] = {
8013
8014 /* Order is according to common usage: often used methods should
8015 appear first, since lookup is done sequentially. */
8016
Georg Brandlecdc0a92006-03-30 12:19:07 +00008017 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008018 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8019 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008020 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008021 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8022 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8023 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8024 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8025 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8026 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8027 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00008028 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008029 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8030 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8031 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008032 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00008033 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008034/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8035 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8036 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8037 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008038 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00008039 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008040 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008041 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008042 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8043 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8044 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8045 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8046 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8047 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8048 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8049 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8050 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8051 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8052 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8053 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8054 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8055 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008056 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00008057 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8058 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8059 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8060 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00008061 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008062#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008063 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064#endif
8065
8066#if 0
8067 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008068 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069#endif
8070
Benjamin Peterson857ce152009-01-31 16:29:18 +00008071 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 {NULL, NULL}
8073};
8074
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008075static PyObject *
8076unicode_mod(PyObject *v, PyObject *w)
8077{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008078 if (!PyUnicode_Check(v)) {
8079 Py_INCREF(Py_NotImplemented);
8080 return Py_NotImplemented;
8081 }
8082 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008083}
8084
8085static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008086 0, /*nb_add*/
8087 0, /*nb_subtract*/
8088 0, /*nb_multiply*/
8089 0, /*nb_divide*/
8090 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008091};
8092
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008094 (lenfunc) unicode_length, /* sq_length */
8095 PyUnicode_Concat, /* sq_concat */
8096 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8097 (ssizeargfunc) unicode_getitem, /* sq_item */
8098 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8099 0, /* sq_ass_item */
8100 0, /* sq_ass_slice */
8101 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102};
8103
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008104static PyObject*
8105unicode_subscript(PyUnicodeObject* self, PyObject* item)
8106{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008107 if (PyIndex_Check(item)) {
8108 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008109 if (i == -1 && PyErr_Occurred())
8110 return NULL;
8111 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008112 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008113 return unicode_getitem(self, i);
8114 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008115 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008116 Py_UNICODE* source_buf;
8117 Py_UNICODE* result_buf;
8118 PyObject* result;
8119
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008120 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008121 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008122 return NULL;
8123 }
8124
8125 if (slicelength <= 0) {
8126 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008127 } else if (start == 0 && step == 1 && slicelength == self->length &&
8128 PyUnicode_CheckExact(self)) {
8129 Py_INCREF(self);
8130 return (PyObject *)self;
8131 } else if (step == 1) {
8132 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008133 } else {
8134 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008135 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8136 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008137
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008138 if (result_buf == NULL)
8139 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008140
8141 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8142 result_buf[i] = source_buf[cur];
8143 }
Tim Petersced69f82003-09-16 20:30:58 +00008144
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008145 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008146 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008147 return result;
8148 }
8149 } else {
8150 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8151 return NULL;
8152 }
8153}
8154
8155static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008156 (lenfunc)unicode_length, /* mp_length */
8157 (binaryfunc)unicode_subscript, /* mp_subscript */
8158 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008159};
8160
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008163 Py_ssize_t index,
8164 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165{
8166 if (index != 0) {
8167 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008168 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 return -1;
8170 }
8171 *ptr = (void *) self->str;
8172 return PyUnicode_GET_DATA_SIZE(self);
8173}
8174
Martin v. Löwis18e16552006-02-15 17:27:45 +00008175static Py_ssize_t
8176unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008177 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178{
8179 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008180 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 return -1;
8182}
8183
8184static int
8185unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008186 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187{
8188 if (lenp)
8189 *lenp = PyUnicode_GET_DATA_SIZE(self);
8190 return 1;
8191}
8192
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008193static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008195 Py_ssize_t index,
8196 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
8198 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008199
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 if (index != 0) {
8201 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008202 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 return -1;
8204 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008205 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008207 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008208 *ptr = (void *) PyString_AS_STRING(str);
8209 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210}
8211
8212/* Helpers for PyUnicode_Format() */
8213
8214static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008217 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008219 (*p_argidx)++;
8220 if (arglen < 0)
8221 return args;
8222 else
8223 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 }
8225 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008226 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 return NULL;
8228}
8229
/* Bit flags carried in the `flags` word handed to the format helpers.
   F_ALT makes formatfloat()/formatint() emit the '#' alternate form. */
#define F_LJUST  (1 << 0)
#define F_SIGN   (1 << 1)
#define F_BLANK  (1 << 2)
#define F_ALT    (1 << 3)
#define F_ZERO   (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235
Martin v. Löwis18e16552006-02-15 17:27:45 +00008236static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008237strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008239 register Py_ssize_t i;
8240 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008242 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 return len;
8245}
8246
Neal Norwitzfc76d632006-01-10 06:03:13 +00008247static int
8248doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8249{
Tim Peters15231542006-02-16 01:08:01 +00008250 Py_ssize_t result;
8251
Neal Norwitzfc76d632006-01-10 06:03:13 +00008252 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008253 result = strtounicode(buffer, (char *)buffer);
8254 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008255}
8256
8257static int
8258longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8259{
Tim Peters15231542006-02-16 01:08:01 +00008260 Py_ssize_t result;
8261
Neal Norwitzfc76d632006-01-10 06:03:13 +00008262 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008263 result = strtounicode(buffer, (char *)buffer);
8264 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008265}
8266
Guido van Rossum078151d2002-08-11 04:24:12 +00008267/* XXX To save some code duplication, formatfloat/long/int could have been
8268 shared with stringobject.c, converting from 8-bit to Unicode after the
8269 formatting is done. */
8270
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271static int
8272formatfloat(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008273 size_t buflen,
8274 int flags,
8275 int prec,
8276 int type,
8277 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008279 /* fmt = '%#.' + `prec` + `type`
8280 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 char fmt[20];
8282 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008283
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 x = PyFloat_AsDouble(v);
8285 if (x == -1.0 && PyErr_Occurred())
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008286 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008288 prec = 6;
Eric Smithd6c393a2008-07-17 19:49:47 +00008289 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008290 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008291 /* Worst case length calc to ensure no buffer overrun:
8292
8293 'g' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008294 fmt = %#.<prec>g
8295 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8296 for any double rep.)
8297 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008298
8299 'f' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008300 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8301 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008302
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008303 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008304 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008305
8306 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008307 if (((type == 'g' || type == 'G') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008308 buflen <= (size_t)10 + (size_t)prec) ||
8309 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8310 PyErr_SetString(PyExc_OverflowError,
8311 "formatted float is too long (precision too large?)");
8312 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008313 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008314 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008315 (flags&F_ALT) ? "#" : "",
8316 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008317 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318}
8319
Tim Peters38fd5b62000-09-21 05:43:11 +00008320static PyObject*
8321formatlong(PyObject *val, int flags, int prec, int type)
8322{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008323 char *buf;
8324 int i, len;
8325 PyObject *str; /* temporary string object. */
8326 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008327
Benjamin Peterson857ce152009-01-31 16:29:18 +00008328 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8329 if (!str)
8330 return NULL;
8331 result = _PyUnicode_New(len);
8332 if (!result) {
8333 Py_DECREF(str);
8334 return NULL;
8335 }
8336 for (i = 0; i < len; i++)
8337 result->str[i] = buf[i];
8338 result->str[len] = 0;
8339 Py_DECREF(str);
8340 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008341}
8342
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343static int
8344formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008345 size_t buflen,
8346 int flags,
8347 int prec,
8348 int type,
8349 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008351 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008352 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8353 * + 1 + 1
8354 * = 24
8355 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008356 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008357 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 long x;
8359
8360 x = PyInt_AsLong(v);
8361 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008362 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008363 if (x < 0 && type == 'u') {
8364 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008365 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008366 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8367 sign = "-";
8368 else
8369 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008371 prec = 1;
8372
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008373 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8374 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008375 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008376 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008377 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008378 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008379 return -1;
8380 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008381
8382 if ((flags & F_ALT) &&
8383 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008384 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008385 * of issues that cause pain:
8386 * - when 0 is being converted, the C standard leaves off
8387 * the '0x' or '0X', which is inconsistent with other
8388 * %#x/%#X conversions and inconsistent with Python's
8389 * hex() function
8390 * - there are platforms that violate the standard and
8391 * convert 0 with the '0x' or '0X'
8392 * (Metrowerks, Compaq Tru64)
8393 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008394 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008395 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008396 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008397 * We can achieve the desired consistency by inserting our
8398 * own '0x' or '0X' prefix, and substituting %x/%X in place
8399 * of %#x/%#X.
8400 *
8401 * Note that this is the same approach as used in
8402 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008403 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008404 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8405 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008406 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008407 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008408 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8409 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008410 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008411 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008412 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008413 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008414 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008415 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416}
8417
8418static int
8419formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008420 size_t buflen,
8421 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008423 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008424 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008425 if (PyUnicode_GET_SIZE(v) != 1)
8426 goto onError;
8427 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008430 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008431 if (PyString_GET_SIZE(v) != 1)
8432 goto onError;
8433 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435
8436 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008437 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008439 x = PyInt_AsLong(v);
8440 if (x == -1 && PyErr_Occurred())
8441 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008442#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008443 if (x < 0 || x > 0x10ffff) {
8444 PyErr_SetString(PyExc_OverflowError,
8445 "%c arg not in range(0x110000) "
8446 "(wide Python build)");
8447 return -1;
8448 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008449#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008450 if (x < 0 || x > 0xffff) {
8451 PyErr_SetString(PyExc_OverflowError,
8452 "%c arg not in range(0x10000) "
8453 "(narrow Python build)");
8454 return -1;
8455 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008456#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008457 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 }
8459 buf[1] = '\0';
8460 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008461
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008462 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008463 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008464 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008465 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466}
8467
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8477
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008479 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
8481 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008482 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 int args_owned = 0;
8484 PyUnicodeObject *result = NULL;
8485 PyObject *dict = NULL;
8486 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008487
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008489 PyErr_BadInternalCall();
8490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491 }
8492 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008493 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 fmt = PyUnicode_AS_UNICODE(uformat);
8496 fmtcnt = PyUnicode_GET_SIZE(uformat);
8497
8498 reslen = rescnt = fmtcnt + 100;
8499 result = _PyUnicode_New(reslen);
8500 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 res = PyUnicode_AS_UNICODE(result);
8503
8504 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008505 arglen = PyTuple_Size(args);
8506 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 }
8508 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008509 arglen = -1;
8510 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 }
Christian Heimese93237d2007-12-19 02:37:44 +00008512 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008513 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008514 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515
8516 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008517 if (*fmt != '%') {
8518 if (--rescnt < 0) {
8519 rescnt = fmtcnt + 100;
8520 reslen += rescnt;
8521 if (_PyUnicode_Resize(&result, reslen) < 0)
8522 goto onError;
8523 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8524 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008525 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008526 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008527 }
8528 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008529 /* Got a format specifier */
8530 int flags = 0;
8531 Py_ssize_t width = -1;
8532 int prec = -1;
8533 Py_UNICODE c = '\0';
8534 Py_UNICODE fill;
8535 int isnumok;
8536 PyObject *v = NULL;
8537 PyObject *temp = NULL;
8538 Py_UNICODE *pbuf;
8539 Py_UNICODE sign;
8540 Py_ssize_t len;
8541 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8542
8543 fmt++;
8544 if (*fmt == '(') {
8545 Py_UNICODE *keystart;
8546 Py_ssize_t keylen;
8547 PyObject *key;
8548 int pcount = 1;
8549
8550 if (dict == NULL) {
8551 PyErr_SetString(PyExc_TypeError,
8552 "format requires a mapping");
8553 goto onError;
8554 }
8555 ++fmt;
8556 --fmtcnt;
8557 keystart = fmt;
8558 /* Skip over balanced parentheses */
8559 while (pcount > 0 && --fmtcnt >= 0) {
8560 if (*fmt == ')')
8561 --pcount;
8562 else if (*fmt == '(')
8563 ++pcount;
8564 fmt++;
8565 }
8566 keylen = fmt - keystart - 1;
8567 if (fmtcnt < 0 || pcount > 0) {
8568 PyErr_SetString(PyExc_ValueError,
8569 "incomplete format key");
8570 goto onError;
8571 }
8572#if 0
8573 /* keys are converted to strings using UTF-8 and
8574 then looked up since Python uses strings to hold
8575 variables names etc. in its namespaces and we
8576 wouldn't want to break common idioms. */
8577 key = PyUnicode_EncodeUTF8(keystart,
8578 keylen,
8579 NULL);
8580#else
8581 key = PyUnicode_FromUnicode(keystart, keylen);
8582#endif
8583 if (key == NULL)
8584 goto onError;
8585 if (args_owned) {
8586 Py_DECREF(args);
8587 args_owned = 0;
8588 }
8589 args = PyObject_GetItem(dict, key);
8590 Py_DECREF(key);
8591 if (args == NULL) {
8592 goto onError;
8593 }
8594 args_owned = 1;
8595 arglen = -1;
8596 argidx = -2;
8597 }
8598 while (--fmtcnt >= 0) {
8599 switch (c = *fmt++) {
8600 case '-': flags |= F_LJUST; continue;
8601 case '+': flags |= F_SIGN; continue;
8602 case ' ': flags |= F_BLANK; continue;
8603 case '#': flags |= F_ALT; continue;
8604 case '0': flags |= F_ZERO; continue;
8605 }
8606 break;
8607 }
8608 if (c == '*') {
8609 v = getnextarg(args, arglen, &argidx);
8610 if (v == NULL)
8611 goto onError;
8612 if (!PyInt_Check(v)) {
8613 PyErr_SetString(PyExc_TypeError,
8614 "* wants int");
8615 goto onError;
8616 }
8617 width = PyInt_AsLong(v);
8618 if (width < 0) {
8619 flags |= F_LJUST;
8620 width = -width;
8621 }
8622 if (--fmtcnt >= 0)
8623 c = *fmt++;
8624 }
8625 else if (c >= '0' && c <= '9') {
8626 width = c - '0';
8627 while (--fmtcnt >= 0) {
8628 c = *fmt++;
8629 if (c < '0' || c > '9')
8630 break;
8631 if ((width*10) / 10 != width) {
8632 PyErr_SetString(PyExc_ValueError,
8633 "width too big");
8634 goto onError;
8635 }
8636 width = width*10 + (c - '0');
8637 }
8638 }
8639 if (c == '.') {
8640 prec = 0;
8641 if (--fmtcnt >= 0)
8642 c = *fmt++;
8643 if (c == '*') {
8644 v = getnextarg(args, arglen, &argidx);
8645 if (v == NULL)
8646 goto onError;
8647 if (!PyInt_Check(v)) {
8648 PyErr_SetString(PyExc_TypeError,
8649 "* wants int");
8650 goto onError;
8651 }
8652 prec = PyInt_AsLong(v);
8653 if (prec < 0)
8654 prec = 0;
8655 if (--fmtcnt >= 0)
8656 c = *fmt++;
8657 }
8658 else if (c >= '0' && c <= '9') {
8659 prec = c - '0';
8660 while (--fmtcnt >= 0) {
8661 c = Py_CHARMASK(*fmt++);
8662 if (c < '0' || c > '9')
8663 break;
8664 if ((prec*10) / 10 != prec) {
8665 PyErr_SetString(PyExc_ValueError,
8666 "prec too big");
8667 goto onError;
8668 }
8669 prec = prec*10 + (c - '0');
8670 }
8671 }
8672 } /* prec */
8673 if (fmtcnt >= 0) {
8674 if (c == 'h' || c == 'l' || c == 'L') {
8675 if (--fmtcnt >= 0)
8676 c = *fmt++;
8677 }
8678 }
8679 if (fmtcnt < 0) {
8680 PyErr_SetString(PyExc_ValueError,
8681 "incomplete format");
8682 goto onError;
8683 }
8684 if (c != '%') {
8685 v = getnextarg(args, arglen, &argidx);
8686 if (v == NULL)
8687 goto onError;
8688 }
8689 sign = 0;
8690 fill = ' ';
8691 switch (c) {
8692
8693 case '%':
8694 pbuf = formatbuf;
8695 /* presume that buffer length is at least 1 */
8696 pbuf[0] = '%';
8697 len = 1;
8698 break;
8699
8700 case 's':
8701 case 'r':
8702 if (PyUnicode_Check(v) && c == 's') {
8703 temp = v;
8704 Py_INCREF(temp);
8705 }
8706 else {
8707 PyObject *unicode;
8708 if (c == 's')
8709 temp = PyObject_Unicode(v);
8710 else
8711 temp = PyObject_Repr(v);
8712 if (temp == NULL)
8713 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008714 if (PyUnicode_Check(temp))
8715 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008716 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008717 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008718 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8719 PyString_GET_SIZE(temp),
8720 NULL,
8721 "strict");
8722 Py_DECREF(temp);
8723 temp = unicode;
8724 if (temp == NULL)
8725 goto onError;
8726 }
8727 else {
8728 Py_DECREF(temp);
8729 PyErr_SetString(PyExc_TypeError,
8730 "%s argument has non-string str()");
8731 goto onError;
8732 }
8733 }
8734 pbuf = PyUnicode_AS_UNICODE(temp);
8735 len = PyUnicode_GET_SIZE(temp);
8736 if (prec >= 0 && len > prec)
8737 len = prec;
8738 break;
8739
8740 case 'i':
8741 case 'd':
8742 case 'u':
8743 case 'o':
8744 case 'x':
8745 case 'X':
8746 if (c == 'i')
8747 c = 'd';
8748 isnumok = 0;
8749 if (PyNumber_Check(v)) {
8750 PyObject *iobj=NULL;
8751
8752 if (PyInt_Check(v) || (PyLong_Check(v))) {
8753 iobj = v;
8754 Py_INCREF(iobj);
8755 }
8756 else {
8757 iobj = PyNumber_Int(v);
8758 if (iobj==NULL) iobj = PyNumber_Long(v);
8759 }
8760 if (iobj!=NULL) {
8761 if (PyInt_Check(iobj)) {
8762 isnumok = 1;
8763 pbuf = formatbuf;
8764 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8765 flags, prec, c, iobj);
8766 Py_DECREF(iobj);
8767 if (len < 0)
8768 goto onError;
8769 sign = 1;
8770 }
8771 else if (PyLong_Check(iobj)) {
8772 isnumok = 1;
8773 temp = formatlong(iobj, flags, prec, c);
8774 Py_DECREF(iobj);
8775 if (!temp)
8776 goto onError;
8777 pbuf = PyUnicode_AS_UNICODE(temp);
8778 len = PyUnicode_GET_SIZE(temp);
8779 sign = 1;
8780 }
8781 else {
8782 Py_DECREF(iobj);
8783 }
8784 }
8785 }
8786 if (!isnumok) {
8787 PyErr_Format(PyExc_TypeError,
8788 "%%%c format: a number is required, "
8789 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8790 goto onError;
8791 }
8792 if (flags & F_ZERO)
8793 fill = '0';
8794 break;
8795
8796 case 'e':
8797 case 'E':
8798 case 'f':
8799 case 'F':
8800 case 'g':
8801 case 'G':
8802 if (c == 'F')
8803 c = 'f';
8804 pbuf = formatbuf;
8805 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8806 flags, prec, c, v);
8807 if (len < 0)
8808 goto onError;
8809 sign = 1;
8810 if (flags & F_ZERO)
8811 fill = '0';
8812 break;
8813
8814 case 'c':
8815 pbuf = formatbuf;
8816 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8817 if (len < 0)
8818 goto onError;
8819 break;
8820
8821 default:
8822 PyErr_Format(PyExc_ValueError,
8823 "unsupported format character '%c' (0x%x) "
8824 "at index %zd",
8825 (31<=c && c<=126) ? (char)c : '?',
8826 (int)c,
8827 (Py_ssize_t)(fmt - 1 -
8828 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008829 goto onError;
8830 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008831 if (sign) {
8832 if (*pbuf == '-' || *pbuf == '+') {
8833 sign = *pbuf++;
8834 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008835 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008836 else if (flags & F_SIGN)
8837 sign = '+';
8838 else if (flags & F_BLANK)
8839 sign = ' ';
8840 else
8841 sign = 0;
8842 }
8843 if (width < len)
8844 width = len;
8845 if (rescnt - (sign != 0) < width) {
8846 reslen -= rescnt;
8847 rescnt = width + fmtcnt + 100;
8848 reslen += rescnt;
8849 if (reslen < 0) {
8850 Py_XDECREF(temp);
8851 PyErr_NoMemory();
8852 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008853 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008854 if (_PyUnicode_Resize(&result, reslen) < 0) {
8855 Py_XDECREF(temp);
8856 goto onError;
8857 }
8858 res = PyUnicode_AS_UNICODE(result)
8859 + reslen - rescnt;
8860 }
8861 if (sign) {
8862 if (fill != ' ')
8863 *res++ = sign;
8864 rescnt--;
8865 if (width > len)
8866 width--;
8867 }
8868 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8869 assert(pbuf[0] == '0');
8870 assert(pbuf[1] == c);
8871 if (fill != ' ') {
8872 *res++ = *pbuf++;
8873 *res++ = *pbuf++;
8874 }
8875 rescnt -= 2;
8876 width -= 2;
8877 if (width < 0)
8878 width = 0;
8879 len -= 2;
8880 }
8881 if (width > len && !(flags & F_LJUST)) {
8882 do {
8883 --rescnt;
8884 *res++ = fill;
8885 } while (--width > len);
8886 }
8887 if (fill == ' ') {
8888 if (sign)
8889 *res++ = sign;
8890 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8891 assert(pbuf[0] == '0');
8892 assert(pbuf[1] == c);
8893 *res++ = *pbuf++;
8894 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008895 }
8896 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008897 Py_UNICODE_COPY(res, pbuf, len);
8898 res += len;
8899 rescnt -= len;
8900 while (--width >= len) {
8901 --rescnt;
8902 *res++ = ' ';
8903 }
8904 if (dict && (argidx < arglen) && c != '%') {
8905 PyErr_SetString(PyExc_TypeError,
8906 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008907 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008908 goto onError;
8909 }
8910 Py_XDECREF(temp);
8911 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 } /* until end */
8913 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008914 PyErr_SetString(PyExc_TypeError,
8915 "not all arguments converted during string formatting");
8916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 }
8918
Thomas Woutersa96affe2006-03-12 00:29:36 +00008919 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008920 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008922 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 }
8924 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 return (PyObject *)result;
8926
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008927 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 Py_XDECREF(result);
8929 Py_DECREF(uformat);
8930 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008931 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 }
8933 return NULL;
8934}
8935
8936static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 (readbufferproc) unicode_buffer_getreadbuf,
8938 (writebufferproc) unicode_buffer_getwritebuf,
8939 (segcountproc) unicode_buffer_getsegcount,
8940 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941};
8942
Jeremy Hylton938ace62002-07-17 16:30:39 +00008943static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8945
Tim Peters6d6c1a32001-08-02 04:15:00 +00008946static PyObject *
8947unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8948{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008949 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008950 static char *kwlist[] = {"string", "encoding", "errors", 0};
8951 char *encoding = NULL;
8952 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008953
Benjamin Peterson857ce152009-01-31 16:29:18 +00008954 if (type != &PyUnicode_Type)
8955 return unicode_subtype_new(type, args, kwds);
8956 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008957 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008958 return NULL;
8959 if (x == NULL)
8960 return (PyObject *)_PyUnicode_New(0);
8961 if (encoding == NULL && errors == NULL)
8962 return PyObject_Unicode(x);
8963 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008964 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008965}
8966
Guido van Rossume023fe02001-08-30 03:12:59 +00008967static PyObject *
8968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8969{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008970 PyUnicodeObject *tmp, *pnew;
8971 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008972
Benjamin Peterson857ce152009-01-31 16:29:18 +00008973 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8974 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8975 if (tmp == NULL)
8976 return NULL;
8977 assert(PyUnicode_Check(tmp));
8978 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8979 if (pnew == NULL) {
8980 Py_DECREF(tmp);
8981 return NULL;
8982 }
8983 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8984 if (pnew->str == NULL) {
8985 _Py_ForgetReference((PyObject *)pnew);
8986 PyObject_Del(pnew);
8987 Py_DECREF(tmp);
8988 return PyErr_NoMemory();
8989 }
8990 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8991 pnew->length = n;
8992 pnew->hash = tmp->hash;
8993 Py_DECREF(tmp);
8994 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008995}
8996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008997PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008998 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008999\n\
9000Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009001encoding defaults to the current default string encoding.\n\
9002errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009003
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00009005 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00009006 "unicode", /* tp_name */
9007 sizeof(PyUnicodeObject), /* tp_size */
9008 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00009010 (destructor)unicode_dealloc, /* tp_dealloc */
9011 0, /* tp_print */
9012 0, /* tp_getattr */
9013 0, /* tp_setattr */
9014 0, /* tp_compare */
9015 unicode_repr, /* tp_repr */
9016 &unicode_as_number, /* tp_as_number */
9017 &unicode_as_sequence, /* tp_as_sequence */
9018 &unicode_as_mapping, /* tp_as_mapping */
9019 (hashfunc) unicode_hash, /* tp_hash*/
9020 0, /* tp_call*/
9021 (reprfunc) unicode_str, /* tp_str */
9022 PyObject_GenericGetAttr, /* tp_getattro */
9023 0, /* tp_setattro */
9024 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009025 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009026 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00009027 unicode_doc, /* tp_doc */
9028 0, /* tp_traverse */
9029 0, /* tp_clear */
9030 PyUnicode_RichCompare, /* tp_richcompare */
9031 0, /* tp_weaklistoffset */
9032 0, /* tp_iter */
9033 0, /* tp_iternext */
9034 unicode_methods, /* tp_methods */
9035 0, /* tp_members */
9036 0, /* tp_getset */
9037 &PyBaseString_Type, /* tp_base */
9038 0, /* tp_dict */
9039 0, /* tp_descr_get */
9040 0, /* tp_descr_set */
9041 0, /* tp_dictoffset */
9042 0, /* tp_init */
9043 0, /* tp_alloc */
9044 unicode_new, /* tp_new */
9045 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046};
9047
9048/* Initialize the Unicode implementation */
9049
Thomas Wouters78890102000-07-22 19:25:51 +00009050void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009052 int i;
9053
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009054 /* XXX - move this array to unicodectype.c ? */
9055 Py_UNICODE linebreak[] = {
9056 0x000A, /* LINE FEED */
9057 0x000D, /* CARRIAGE RETURN */
9058 0x001C, /* FILE SEPARATOR */
9059 0x001D, /* GROUP SEPARATOR */
9060 0x001E, /* RECORD SEPARATOR */
9061 0x0085, /* NEXT LINE */
9062 0x2028, /* LINE SEPARATOR */
9063 0x2029, /* PARAGRAPH SEPARATOR */
9064 };
9065
Fred Drakee4315f52000-05-09 19:53:39 +00009066 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009067 free_list = NULL;
9068 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009070 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009071 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009072
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009073 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009074 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009075 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009076 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009077 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009078
9079 /* initialize the linebreak bloom filter */
9080 bloom_linebreak = make_bloom_mask(
9081 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9082 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009083
9084 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085}
9086
9087/* Finalize the Unicode implementation */
9088
Christian Heimes3b718a72008-02-14 12:47:33 +00009089int
9090PyUnicode_ClearFreeList(void)
9091{
9092 int freelist_size = numfree;
9093 PyUnicodeObject *u;
9094
9095 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009096 PyUnicodeObject *v = u;
9097 u = *(PyUnicodeObject **)u;
9098 if (v->str)
9099 PyObject_DEL(v->str);
9100 Py_XDECREF(v->defenc);
9101 PyObject_Del(v);
9102 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009103 }
9104 free_list = NULL;
9105 assert(numfree == 0);
9106 return freelist_size;
9107}
9108
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109void
Thomas Wouters78890102000-07-22 19:25:51 +00009110_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009112 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009114 Py_XDECREF(unicode_empty);
9115 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009116
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009117 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009118 if (unicode_latin1[i]) {
9119 Py_DECREF(unicode_latin1[i]);
9120 unicode_latin1[i] = NULL;
9121 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009122 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009123 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009125
Anthony Baxterac6bd462006-04-13 02:06:09 +00009126#ifdef __cplusplus
9127}
9128#endif
9129
9130
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009131/*
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009132 Local variables:
9133 c-basic-offset: 4
9134 indent-tabs-mode: nil
9135 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009136*/