blob: d05685514dedddf46ddcfe5759ea0f9916ebff82 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson339f8c62009-01-31 22:25:08 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Free list for Unicode objects.  Deallocated objects are chained through
   the first pointer-sized slot of the object itself (see unicode_dealloc()
   and _PyUnicode_New()); numfree counts the entries on the list and is
   never pushed past PyUnicode_MAXFREELIST by unicode_dealloc(). */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries start out NULL and are filled in on demand
   by PyUnicode_FromUnicode() and PyUnicode_FromStringAndSize(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 HORIZONTAL TABULATION
       0x000A LINE FEED
       0x000B VERTICAL TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks: lookup table indexed by code point 0x00..0x7F;
   an entry is 1 when the character is treated as a line break.
   Consulted by BLOOM_LINEBREAK() for characters below 128. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Simple "bloom filters" for Unicode characters.  To keep things simple,
   a single bitmask is used, with the least significant 5 bits of each
   Unicode character selecting the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by `ch` is set in `mask`.
   The shifted constant is 1UL rather than 1: left-shifting a signed int
   into its sign bit (index 31) is undefined behaviour, and an unsigned
   long constant also matches the BLOOM_MASK type.  `mask` is
   parenthesized so that compound mask expressions bind correctly. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: characters below 128 use the ascii_linebreak table;
   everything else is pre-filtered through the bloom mask before the
   Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Membership test of `chr` in `set` (of length `setlen`): the cheap bloom
   mask check runs first and the exact unicode_member() scan only runs when
   the mask says the character may be present.  The whole expansion is
   parenthesized so the macro can be combined safely with !, == or ||. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000247 if (unicode == unicode_empty ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
/* Create a new Unicode object with room for `length` characters.

   One extra Py_UNICODE element is allocated so that the string is
   Ux0000 terminated -- XXX is this needed ?

   Returns a new reference, or NULL with an exception set on failure.
   For length == 0 the shared unicode_empty object is returned (once it
   exists).  Apart from str[0] and str[length], which are set to 0, the
   character buffer is left uninitialized for the caller to fill in.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

static
PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
{
    register PyUnicodeObject *unicode;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }

    /* Unicode freelist & memory allocation */
    if (free_list) {
        /* Pop the head of the free list; the link to the next entry is
           stored in the first pointer-sized slot of the freed object. */
        unicode = free_list;
        free_list = *(PyUnicodeObject **)unicode;
        numfree--;
        if (unicode->str) {
            /* Keep-Alive optimization: we only upsize the buffer,
               never downsize it. */
            if ((unicode->length < length) &&
                unicode_resize(unicode, length) < 0) {
                /* Resize failed: drop the kept-alive buffer so the
                   !unicode->str check below takes the error path. */
                PyObject_DEL(unicode->str);
                unicode->str = NULL;
            }
        }
        else {
            /* The buffer was released in unicode_dealloc(); allocate a
               fresh one. */
            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
        }
        /* Re-initialize the object header of the recycled object. */
        PyObject_INIT(unicode, &PyUnicode_Type);
    }
    else {
        size_t new_size;
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
        if (unicode == NULL)
            return NULL;
        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    }

    if (!unicode->str) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    unicode->str[0] = 0;
    unicode->str[length] = 0;
    unicode->length = length;
    unicode->hash = -1;
    unicode->defenc = NULL;
    return unicode;

  onError:
    /* The object has no character buffer at this point; release the
       object itself without going through unicode_dealloc(). */
    /* XXX UNREF/NEWREF interface should be more symmetrical */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference((PyObject *)unicode);
    PyObject_Del(unicode);
    return NULL;
}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Benjamin Peterson828a7062008-12-27 17:05:29 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Benjamin Peterson828a7062008-12-27 17:05:29 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
/* Public wrapper around _PyUnicode_Resize(): casts the PyObject ** argument
   to PyUnicodeObject ** and forwards the call unchanged.  The return value
   is whatever _PyUnicode_Resize() returns (0 on success, -1 on error). */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000443 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000492 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
532PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534{
535 PyUnicodeObject *unicode;
536
537 if (w == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000538 PyErr_BadInternalCall();
539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540 }
541
542 unicode = _PyUnicode_New(size);
543 if (!unicode)
544 return NULL;
545
546 /* Copy the wchar_t data into the new object */
547#ifdef HAVE_USABLE_WCHAR_T
548 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000549#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000551 register Py_UNICODE *u;
552 register Py_ssize_t i;
553 u = PyUnicode_AS_UNICODE(unicode);
554 for (i = size; i > 0; i--)
555 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000556 }
557#endif
558
559 return (PyObject *)unicode;
560}
561
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000562static void
563makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
564{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000565 *fmt++ = '%';
566 if (width) {
567 if (zeropad)
568 *fmt++ = '0';
569 fmt += sprintf(fmt, "%d", width);
570 }
571 if (precision)
572 fmt += sprintf(fmt, ".%d", precision);
573 if (longflag)
574 *fmt++ = 'l';
575 else if (size_tflag) {
576 char *f = PY_FORMAT_SIZE_T;
577 while (*f)
578 *fmt++ = *f++;
579 }
580 *fmt++ = c;
581 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000582}
583
/* Copy the NUL-terminated string `string` to the output position `s`, one
   element at a time, leaving `s` just past the last copied character.
   The macro relies on variables named `copy` and `s` being declared at the
   point of use; it expands to a braced block, not a single expression. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
585
586PyObject *
587PyUnicode_FromFormatV(const char *format, va_list vargs)
588{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000589 va_list count;
590 Py_ssize_t callcount = 0;
591 PyObject **callresults = NULL;
592 PyObject **callresult = NULL;
593 Py_ssize_t n = 0;
594 int width = 0;
595 int precision = 0;
596 int zeropad;
597 const char* f;
598 Py_UNICODE *s;
599 PyObject *string;
600 /* used by sprintf */
601 char buffer[21];
602 /* use abuffer instead of buffer, if we need more space
603 * (which can happen if there's a format specifier with width). */
604 char *abuffer = NULL;
605 char *realbuffer;
606 Py_ssize_t abuffersize = 0;
607 char fmt[60]; /* should be enough for %0width.precisionld */
608 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000609
610#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000611 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000612#else
613#ifdef __va_copy
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000614 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000615#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000616 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000617#endif
618#endif
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000619 /* step 1: count the number of %S/%R/%s format specifications
620 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
621 * objects once during step 3 and put the result in an array) */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000622 for (f = format; *f; f++) {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000623 if (*f == '%') {
624 if (*(f+1)=='%')
625 continue;
Walter Dörwald67032252009-05-03 22:46:50 +0000626 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000627 ++callcount;
628 while (isdigit((unsigned)*f))
629 width = (width*10) + *f++ - '0';
630 while (*++f && *f != '%' && !isalpha((unsigned)*f))
631 ;
632 if (*f == 's')
633 ++callcount;
634 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000635 }
636 /* step 2: allocate memory for the results of
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000637 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000638 if (callcount) {
639 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
640 if (!callresults) {
641 PyErr_NoMemory();
642 return NULL;
643 }
644 callresult = callresults;
645 }
646 /* step 3: figure out how large a buffer we need */
647 for (f = format; *f; f++) {
648 if (*f == '%') {
649 const char* p = f;
650 width = 0;
651 while (isdigit((unsigned)*f))
652 width = (width*10) + *f++ - '0';
653 while (*++f && *f != '%' && !isalpha((unsigned)*f))
654 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000655
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000656 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
657 * they don't affect the amount of space we reserve.
658 */
659 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000660 (f[1] == 'd' || f[1] == 'u'))
661 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000662
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000663 switch (*f) {
664 case 'c':
665 (void)va_arg(count, int);
666 /* fall through... */
667 case '%':
668 n++;
669 break;
670 case 'd': case 'u': case 'i': case 'x':
671 (void) va_arg(count, int);
672 /* 20 bytes is enough to hold a 64-bit
673 integer. Decimal takes the most space.
674 This isn't enough for octal.
675 If a width is specified we need more
676 (which we allocate later). */
677 if (width < 20)
678 width = 20;
679 n += width;
680 if (abuffersize < width)
681 abuffersize = width;
682 break;
683 case 's':
684 {
685 /* UTF-8 */
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000686 unsigned char *s = va_arg(count, unsigned char*);
687 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
688 if (!str)
689 goto fail;
690 n += PyUnicode_GET_SIZE(str);
691 /* Remember the str and switch to the next slot */
692 *callresult++ = str;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000693 break;
694 }
695 case 'U':
696 {
697 PyObject *obj = va_arg(count, PyObject *);
698 assert(obj && PyUnicode_Check(obj));
699 n += PyUnicode_GET_SIZE(obj);
700 break;
701 }
702 case 'V':
703 {
704 PyObject *obj = va_arg(count, PyObject *);
705 const char *str = va_arg(count, const char *);
706 assert(obj || str);
707 assert(!obj || PyUnicode_Check(obj));
708 if (obj)
709 n += PyUnicode_GET_SIZE(obj);
710 else
711 n += strlen(str);
712 break;
713 }
714 case 'S':
715 {
716 PyObject *obj = va_arg(count, PyObject *);
717 PyObject *str;
718 assert(obj);
719 str = PyObject_Str(obj);
720 if (!str)
721 goto fail;
722 n += PyUnicode_GET_SIZE(str);
723 /* Remember the str and switch to the next slot */
724 *callresult++ = str;
725 break;
726 }
727 case 'R':
728 {
729 PyObject *obj = va_arg(count, PyObject *);
730 PyObject *repr;
731 assert(obj);
732 repr = PyObject_Repr(obj);
733 if (!repr)
734 goto fail;
735 n += PyUnicode_GET_SIZE(repr);
736 /* Remember the repr and switch to the next slot */
737 *callresult++ = repr;
738 break;
739 }
740 case 'p':
741 (void) va_arg(count, int);
742 /* maximum 64-bit pointer representation:
743 * 0xffffffffffffffff
744 * so 19 characters is enough.
745 * XXX I count 18 -- what's the extra for?
746 */
747 n += 19;
748 break;
749 default:
750 /* if we stumble upon an unknown
751 formatting code, copy the rest of
752 the format string to the output
753 string. (we cannot just skip the
754 code, since there's no way to know
755 what's in the argument list) */
756 n += strlen(p);
757 goto expand;
758 }
759 } else
760 n++;
761 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000762 expand:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000763 if (abuffersize > 20) {
764 abuffer = PyObject_Malloc(abuffersize);
765 if (!abuffer) {
766 PyErr_NoMemory();
767 goto fail;
768 }
769 realbuffer = abuffer;
770 }
771 else
772 realbuffer = buffer;
773 /* step 4: fill the buffer */
774 /* Since we've analyzed how much space we need for the worst case,
775 we don't have to resize the string.
776 There can be no errors beyond this point. */
777 string = PyUnicode_FromUnicode(NULL, n);
778 if (!string)
779 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000780
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000781 s = PyUnicode_AS_UNICODE(string);
782 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000783
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000784 for (f = format; *f; f++) {
785 if (*f == '%') {
786 const char* p = f++;
787 int longflag = 0;
788 int size_tflag = 0;
789 zeropad = (*f == '0');
790 /* parse the width.precision part */
791 width = 0;
792 while (isdigit((unsigned)*f))
793 width = (width*10) + *f++ - '0';
794 precision = 0;
795 if (*f == '.') {
796 f++;
797 while (isdigit((unsigned)*f))
798 precision = (precision*10) + *f++ - '0';
799 }
800 /* handle the long flag, but only for %ld and %lu.
801 others can be added when necessary. */
802 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
803 longflag = 1;
804 ++f;
805 }
806 /* handle the size_t flag. */
807 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
808 size_tflag = 1;
809 ++f;
810 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000811
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000812 switch (*f) {
813 case 'c':
814 *s++ = va_arg(vargs, int);
815 break;
816 case 'd':
817 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
818 if (longflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, long));
820 else if (size_tflag)
821 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
822 else
823 sprintf(realbuffer, fmt, va_arg(vargs, int));
824 appendstring(realbuffer);
825 break;
826 case 'u':
827 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
828 if (longflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
830 else if (size_tflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
832 else
833 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
834 appendstring(realbuffer);
835 break;
836 case 'i':
837 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
838 sprintf(realbuffer, fmt, va_arg(vargs, int));
839 appendstring(realbuffer);
840 break;
841 case 'x':
842 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
843 sprintf(realbuffer, fmt, va_arg(vargs, int));
844 appendstring(realbuffer);
845 break;
846 case 's':
847 {
Walter Dörwaldf11232e2009-05-03 22:38:54 +0000848 /* unused, since we already have the result */
849 (void) va_arg(vargs, char *);
850 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
851 PyUnicode_GET_SIZE(*callresult));
852 s += PyUnicode_GET_SIZE(*callresult);
853 /* We're done with the unicode()/repr() => forget it */
854 Py_DECREF(*callresult);
855 /* switch to next unicode()/repr() result */
856 ++callresult;
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000857 break;
858 }
859 case 'U':
860 {
861 PyObject *obj = va_arg(vargs, PyObject *);
862 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
863 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
864 s += size;
865 break;
866 }
867 case 'V':
868 {
869 PyObject *obj = va_arg(vargs, PyObject *);
870 const char *str = va_arg(vargs, const char *);
871 if (obj) {
872 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
873 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
874 s += size;
875 } else {
876 appendstring(str);
877 }
878 break;
879 }
880 case 'S':
881 case 'R':
882 {
883 Py_UNICODE *ucopy;
884 Py_ssize_t usize;
885 Py_ssize_t upos;
886 /* unused, since we already have the result */
887 (void) va_arg(vargs, PyObject *);
888 ucopy = PyUnicode_AS_UNICODE(*callresult);
889 usize = PyUnicode_GET_SIZE(*callresult);
890 for (upos = 0; upos<usize;)
891 *s++ = ucopy[upos++];
892 /* We're done with the unicode()/repr() => forget it */
893 Py_DECREF(*callresult);
894 /* switch to next unicode()/repr() result */
895 ++callresult;
896 break;
897 }
898 case 'p':
899 sprintf(buffer, "%p", va_arg(vargs, void*));
900 /* %p is ill-defined: ensure leading 0x. */
901 if (buffer[1] == 'X')
902 buffer[1] = 'x';
903 else if (buffer[1] != 'x') {
904 memmove(buffer+2, buffer, strlen(buffer)+1);
905 buffer[0] = '0';
906 buffer[1] = 'x';
907 }
908 appendstring(buffer);
909 break;
910 case '%':
911 *s++ = '%';
912 break;
913 default:
914 appendstring(p);
915 goto end;
916 }
917 } else
918 *s++ = *f;
919 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000920
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000921 end:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000922 if (callresults)
923 PyObject_Free(callresults);
924 if (abuffer)
925 PyObject_Free(abuffer);
926 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
927 return string;
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000928 fail:
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000929 if (callresults) {
930 PyObject **callresult2 = callresults;
931 while (callresult2 < callresult) {
932 Py_DECREF(*callresult2);
933 ++callresult2;
934 }
935 PyObject_Free(callresults);
936 }
937 if (abuffer)
938 PyObject_Free(abuffer);
939 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000940}
941
942#undef appendstring
943
944PyObject *
945PyUnicode_FromFormat(const char *format, ...)
946{
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000947 PyObject* ret;
948 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000949
950#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000951 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000952#else
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000953 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000954#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +0000955 ret = PyUnicode_FromFormatV(format, vargs);
956 va_end(vargs);
957 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000958}
959
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000961 wchar_t *w,
962 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963{
964 if (unicode == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000965 PyErr_BadInternalCall();
966 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000968
969 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000971 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000972
Guido van Rossumd57fd912000-03-10 22:53:23 +0000973#ifdef HAVE_USABLE_WCHAR_T
974 memcpy(w, unicode->str, size * sizeof(wchar_t));
975#else
976 {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000977 register Py_UNICODE *u;
978 register Py_ssize_t i;
979 u = PyUnicode_AS_UNICODE(unicode);
980 for (i = size; i > 0; i--)
981 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982 }
983#endif
984
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000985 if (size > PyUnicode_GET_SIZE(unicode))
986 return PyUnicode_GET_SIZE(unicode);
987 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000988 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989}
990
991#endif
992
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000993PyObject *PyUnicode_FromOrdinal(int ordinal)
994{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000995 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000996
997#ifdef Py_UNICODE_WIDE
998 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +0000999 PyErr_SetString(PyExc_ValueError,
1000 "unichr() arg not in range(0x110000) "
1001 "(wide Python build)");
1002 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001003 }
1004#else
1005 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001006 PyErr_SetString(PyExc_ValueError,
1007 "unichr() arg not in range(0x10000) "
1008 "(narrow Python build)");
1009 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001010 }
1011#endif
1012
Hye-Shik Chang40574832004-04-06 07:24:51 +00001013 s[0] = (Py_UNICODE)ordinal;
1014 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001015}
1016
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017PyObject *PyUnicode_FromObject(register PyObject *obj)
1018{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001019 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001020 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001022 Py_INCREF(obj);
1023 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001024 }
1025 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001026 /* For a Unicode subtype that's not a Unicode object,
1027 return a true Unicode object with the same data. */
1028 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1029 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001030 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001031 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1032}
1033
1034PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001035 const char *encoding,
1036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001037{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001038 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001039 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001040 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (obj == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001043 PyErr_BadInternalCall();
1044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001046
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001047#if 0
1048 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001049 that no encodings is given and then redirect to
1050 PyObject_Unicode() which then applies the additional logic for
1051 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001052
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001053 NOTE: This API should really only be used for object which
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001054 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001055
1056 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001057 if (PyUnicode_Check(obj)) {
1058 if (encoding) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001059 PyErr_SetString(PyExc_TypeError,
1060 "decoding Unicode is not supported");
1061 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001062 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001063 return PyObject_Unicode(obj);
1064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065#else
1066 if (PyUnicode_Check(obj)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001067 PyErr_SetString(PyExc_TypeError,
1068 "decoding Unicode is not supported");
1069 return NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001070 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001071#endif
1072
1073 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001074 if (PyString_Check(obj)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001075 s = PyString_AS_STRING(obj);
1076 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001077 }
Christian Heimes3497f942008-05-26 12:29:14 +00001078 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001079 /* Python 2.x specific */
1080 PyErr_Format(PyExc_TypeError,
1081 "decoding bytearray is not supported");
1082 return NULL;
1083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001085 /* Overwrite the error message with something more useful in
1086 case of a TypeError. */
1087 if (PyErr_ExceptionMatches(PyExc_TypeError))
1088 PyErr_Format(PyExc_TypeError,
1089 "coercing to Unicode: need string or buffer, "
1090 "%.80s found",
1091 Py_TYPE(obj)->tp_name);
1092 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001093 }
Tim Petersced69f82003-09-16 20:30:58 +00001094
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (len == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001097 Py_INCREF(unicode_empty);
1098 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 }
Tim Petersced69f82003-09-16 20:30:58 +00001100 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001101 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001102
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001103 return v;
1104
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001105 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107}
1108
1109PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001110 Py_ssize_t size,
1111 const char *encoding,
1112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113{
1114 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001115
1116 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001117 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001118
1119 /* Shortcuts for common default encodings */
1120 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001122 else if (strcmp(encoding, "latin-1") == 0)
1123 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001124#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1125 else if (strcmp(encoding, "mbcs") == 0)
1126 return PyUnicode_DecodeMBCS(s, size, errors);
1127#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001128 else if (strcmp(encoding, "ascii") == 0)
1129 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130
1131 /* Decode via the codec registry */
1132 buffer = PyBuffer_FromMemory((void *)s, size);
1133 if (buffer == NULL)
1134 goto onError;
1135 unicode = PyCodec_Decode(buffer, encoding, errors);
1136 if (unicode == NULL)
1137 goto onError;
1138 if (!PyUnicode_Check(unicode)) {
1139 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001140 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001141 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 Py_DECREF(unicode);
1143 goto onError;
1144 }
1145 Py_DECREF(buffer);
1146 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001147
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 Py_XDECREF(buffer);
1150 return NULL;
1151}
1152
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001153PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1154 const char *encoding,
1155 const char *errors)
1156{
1157 PyObject *v;
1158
1159 if (!PyUnicode_Check(unicode)) {
1160 PyErr_BadArgument();
1161 goto onError;
1162 }
1163
1164 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001165 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001166
1167 /* Decode via the codec registry */
1168 v = PyCodec_Decode(unicode, encoding, errors);
1169 if (v == NULL)
1170 goto onError;
1171 return v;
1172
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001173 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001174 return NULL;
1175}
1176
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001178 Py_ssize_t size,
1179 const char *encoding,
1180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181{
1182 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001183
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 unicode = PyUnicode_FromUnicode(s, size);
1185 if (unicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1188 Py_DECREF(unicode);
1189 return v;
1190}
1191
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001192PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1193 const char *encoding,
1194 const char *errors)
1195{
1196 PyObject *v;
1197
1198 if (!PyUnicode_Check(unicode)) {
1199 PyErr_BadArgument();
1200 goto onError;
1201 }
1202
1203 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001204 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001205
1206 /* Encode via the codec registry */
1207 v = PyCodec_Encode(unicode, encoding, errors);
1208 if (v == NULL)
1209 goto onError;
1210 return v;
1211
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001212 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001213 return NULL;
1214}
1215
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1217 const char *encoding,
1218 const char *errors)
1219{
1220 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 goto onError;
1225 }
Fred Drakee4315f52000-05-09 19:53:39 +00001226
Tim Petersced69f82003-09-16 20:30:58 +00001227 if (encoding == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001228 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001229
1230 /* Shortcuts for common default encodings */
1231 if (errors == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001232 if (strcmp(encoding, "utf-8") == 0)
1233 return PyUnicode_AsUTF8String(unicode);
1234 else if (strcmp(encoding, "latin-1") == 0)
1235 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001236#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001237 else if (strcmp(encoding, "mbcs") == 0)
1238 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001239#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001240 else if (strcmp(encoding, "ascii") == 0)
1241 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 /* Encode via the codec registry */
1245 v = PyCodec_Encode(unicode, encoding, errors);
1246 if (v == NULL)
1247 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001248 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001250 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001251 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 Py_DECREF(v);
1253 goto onError;
1254 }
1255 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001256
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001257 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 return NULL;
1259}
1260
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001261PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001262 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001263{
1264 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1265
1266 if (v)
1267 return v;
1268 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1269 if (v && errors == NULL)
1270 ((PyUnicodeObject *)unicode)->defenc = v;
1271 return v;
1272}
1273
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1275{
1276 if (!PyUnicode_Check(unicode)) {
1277 PyErr_BadArgument();
1278 goto onError;
1279 }
1280 return PyUnicode_AS_UNICODE(unicode);
1281
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001282 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 return NULL;
1284}
1285
Martin v. Löwis18e16552006-02-15 17:27:45 +00001286Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 if (!PyUnicode_Check(unicode)) {
1289 PyErr_BadArgument();
1290 goto onError;
1291 }
1292 return PyUnicode_GET_SIZE(unicode);
1293
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 return -1;
1296}
1297
Thomas Wouters78890102000-07-22 19:25:51 +00001298const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001299{
1300 return unicode_default_encoding;
1301}
1302
1303int PyUnicode_SetDefaultEncoding(const char *encoding)
1304{
1305 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001306
Fred Drakee4315f52000-05-09 19:53:39 +00001307 /* Make sure the encoding is valid. As side effect, this also
1308 loads the encoding into the codec registry cache. */
1309 v = _PyCodec_Lookup(encoding);
1310 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001311 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001312 Py_DECREF(v);
1313 strncpy(unicode_default_encoding,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001314 encoding,
1315 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001316 return 0;
1317
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001318 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001319 return -1;
1320}
1321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001322/* error handling callback helper:
1323 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001324 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001325 and adjust various state variables.
1326 return 0 on success, -1 on error
1327*/
1328
1329static
1330int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001331 const char *encoding, const char *reason,
1332 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1333 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1334 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337
1338 PyObject *restuple = NULL;
1339 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001340 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1341 Py_ssize_t requiredsize;
1342 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001344 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345 int res = -1;
1346
1347 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001348 *errorHandler = PyCodec_LookupError(errors);
1349 if (*errorHandler == NULL)
1350 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 }
1352
1353 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001354 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001355 encoding, input, insize, *startinpos, *endinpos, reason);
1356 if (*exceptionObject == NULL)
1357 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 }
1359 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001360 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1361 goto onError;
1362 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1363 goto onError;
1364 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1365 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 }
1367
1368 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1369 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001370 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00001372 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 }
1375 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001376 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 if (newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001378 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001379 if (newpos<0 || newpos>insize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001380 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1381 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383
1384 /* need more space? (at least enough for what we
1385 have+the replacement+the rest of the string (starting
1386 at the new input position), so we won't have to check space
1387 when there are no errors in the rest of the string) */
1388 repptr = PyUnicode_AS_UNICODE(repunicode);
1389 repsize = PyUnicode_GET_SIZE(repunicode);
1390 requiredsize = *outpos + repsize + insize-newpos;
1391 if (requiredsize > outsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001392 if (requiredsize<2*outsize)
1393 requiredsize = 2*outsize;
1394 if (_PyUnicode_Resize(output, requiredsize) < 0)
1395 goto onError;
1396 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 }
1398 *endinpos = newpos;
1399 *inptr = input + newpos;
1400 Py_UNICODE_COPY(*outptr, repptr, repsize);
1401 *outptr += repsize;
1402 *outpos += repsize;
1403 /* we made it! */
1404 res = 0;
1405
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407 Py_XDECREF(restuple);
1408 return res;
1409}
1410
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001411/* --- UTF-7 Codec -------------------------------------------------------- */
1412
1413/* see RFC2152 for details */
1414
/* Classification of the 128 ASCII code points for UTF-7: tells whether a
   character is special, i.e. cannot be directly encoded.

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive code points. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1433
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001434/* Note: The comparison (c) <= 0 is a trick to work-around gcc
1435 warnings about the comparison always being false; since
1436 utf7_special[0] is 1, we can safely make that one comparison
1437 true */
1438
/* SPECIAL(c, encodeO, encodeWS): true when `c` must not be emitted
   directly.  That is the case for anything outside 1..127, for table class
   1, for class 2 (whitespace) when encodeWS is set and for class 3 (Set O)
   when encodeO is set.  The range tests come first so that utf7_special[]
   is only indexed with values inside the table. */
#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of `n`. */
#define B64(n)                                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when `c` is alphanumeric (per isalnum()), '+' or '/'. */
#define B64CHAR(c)                              \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): inverse of B64() -- numeric value of a base64 character.
   Only meaningful for characters accepted by B64CHAR(). */
#define UB64(c)                                 \
    ((c) == '+' ? 62 :                          \
     (c) == '/' ? 63 :                          \
     (c) >= 'a' ? (c) - 71 :                    \
     (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   write the top six of them to `out` as one base64 character and reduce
   `bits` accordingly.  `out` and `bits` are modified in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   `ch`, extract one 16-bit unit.

   - If `surrogate` is set, an error has already been generated for the
     high surrogate, so the unit is dropped without checking whether the
     low surrogate is correct, and the flag is cleared.
   - A unit in the range 0xDC00..0xDFFF belongs to a surrogate pair, which
     cannot be represented in a 16-bit character: `surrogate` is set,
     `errmsg` is assigned and control jumps to the `utf7Error` label.
   - Any other unit is appended to `out`.

   The expanding scope must provide an `errmsg` variable and a `utf7Error`
   label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001478 Py_ssize_t size,
1479 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001481 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1482}
1483
1484PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001485 Py_ssize_t size,
1486 const char *errors,
1487 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001490 Py_ssize_t startinpos;
1491 Py_ssize_t endinpos;
1492 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493 const char *e;
1494 PyUnicodeObject *unicode;
1495 Py_UNICODE *p;
1496 const char *errmsg = "";
1497 int inShift = 0;
1498 unsigned int bitsleft = 0;
1499 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 int surrogate = 0;
1501 PyObject *errorHandler = NULL;
1502 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503
1504 unicode = _PyUnicode_New(size);
1505 if (!unicode)
1506 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001507 if (size == 0) {
1508 if (consumed)
1509 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001510 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001511 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512
1513 p = unicode->str;
1514 e = s + size;
1515
1516 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 Py_UNICODE ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001518 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001519 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001520
1521 if (inShift) {
1522 if ((ch == '-') || !B64CHAR(ch)) {
1523 inShift = 0;
1524 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001525
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1527 if (bitsleft >= 6) {
1528 /* The shift sequence has a partial character in it. If
1529 bitsleft < 6 then we could just classify it as padding
1530 but that is not the case here */
1531
1532 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001533 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 }
1535 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001536 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 here so indicate the potential of a misencoded character. */
1538
1539 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1540 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1541 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001542 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 }
1544
1545 if (ch == '-') {
1546 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001547 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 inShift = 1;
1549 }
1550 } else if (SPECIAL(ch,0,0)) {
1551 errmsg = "unexpected special character";
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001552 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 } else {
1554 *p++ = ch;
1555 }
1556 } else {
1557 charsleft = (charsleft << 6) | UB64(ch);
1558 bitsleft += 6;
1559 s++;
1560 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1561 }
1562 }
1563 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 s++;
1566 if (s < e && *s == '-') {
1567 s++;
1568 *p++ = '+';
1569 } else
1570 {
1571 inShift = 1;
1572 bitsleft = 0;
1573 }
1574 }
1575 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001576 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 errmsg = "unexpected special character";
1578 s++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001579 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 }
1581 else {
1582 *p++ = ch;
1583 s++;
1584 }
1585 continue;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001586 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001587 outpos = p-PyUnicode_AS_UNICODE(unicode);
1588 endinpos = s-starts;
1589 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001590 errors, &errorHandler,
1591 "utf7", errmsg,
1592 starts, size, &startinpos, &endinpos, &exc, &s,
1593 &unicode, &outpos, &p))
1594 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 }
1596
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001597 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 outpos = p-PyUnicode_AS_UNICODE(unicode);
1599 endinpos = size;
1600 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001601 errors, &errorHandler,
1602 "utf7", "unterminated shift sequence",
1603 starts, size, &startinpos, &endinpos, &exc, &s,
1604 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 if (s < e)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001607 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001609 if (consumed) {
1610 if(inShift)
1611 *consumed = startinpos;
1612 else
1613 *consumed = s-starts;
1614 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001616 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 goto onError;
1618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619 Py_XDECREF(errorHandler);
1620 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 return (PyObject *)unicode;
1622
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001623 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 Py_XDECREF(errorHandler);
1625 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 Py_DECREF(unicode);
1627 return NULL;
1628}
1629
1630
1631PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001632 Py_ssize_t size,
1633 int encodeSetO,
1634 int encodeWhiteSpace,
1635 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636{
1637 PyObject *v;
1638 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001639 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 unsigned int bitsleft = 0;
1643 unsigned long charsleft = 0;
1644 char * out;
1645 char * start;
1646
Neal Norwitze7d8be82008-07-31 17:17:14 +00001647 if (cbAllocated / 5 != size)
1648 return PyErr_NoMemory();
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 if (size == 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001651 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001653 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 if (v == NULL)
1655 return NULL;
1656
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001657 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 for (;i < size; ++i) {
1659 Py_UNICODE ch = s[i];
1660
1661 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001662 if (ch == '+') {
1663 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 *out++ = '-';
1665 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1666 charsleft = ch;
1667 bitsleft = 16;
1668 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001669 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001671 } else {
1672 *out++ = (char) ch;
1673 }
1674 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1676 *out++ = B64(charsleft << (6-bitsleft));
1677 charsleft = 0;
1678 bitsleft = 0;
1679 /* Characters not in the BASE64 set implicitly unshift the sequence
1680 so no '-' is required, except if the character is itself a '-' */
1681 if (B64CHAR(ch) || ch == '-') {
1682 *out++ = '-';
1683 }
1684 inShift = 0;
1685 *out++ = (char) ch;
1686 } else {
1687 bitsleft += 16;
1688 charsleft = (charsleft << 16) | ch;
1689 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1690
Jesus Cea585ad8a2009-07-02 15:37:21 +00001691 /* If the next character is special then we don't need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001692 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001693 or '-' then the shift sequence will be terminated implicitly and we
1694 don't have to insert a '-'. */
1695
1696 if (bitsleft == 0) {
1697 if (i + 1 < size) {
1698 Py_UNICODE ch2 = s[i+1];
1699
1700 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001701
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 } else if (B64CHAR(ch2) || ch2 == '-') {
1703 *out++ = '-';
1704 inShift = 0;
1705 } else {
1706 inShift = 0;
1707 }
1708
1709 }
1710 else {
1711 *out++ = '-';
1712 inShift = 0;
1713 }
1714 }
Tim Petersced69f82003-09-16 20:30:58 +00001715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001717 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 if (bitsleft) {
1719 *out++= B64(charsleft << (6-bitsleft) );
1720 *out++ = '-';
1721 }
1722
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001723 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 return v;
1725}
1726
/* The UTF-7 helper macros are local to the codec above; drop them so they
   cannot leak into the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1733
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734/* --- UTF-8 Codec -------------------------------------------------------- */
1735
/* Length of the UTF-8 sequence introduced by each possible lead byte,
   indexed by the byte value.  A zero entry marks a byte that is not a legal
   lead byte.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */

    /* 80-BF: not lead bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */

    /* C0-C1: illegal */
    0, 0,
    /* C2-CF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* F0-F4: four-byte sequences */
    4, 4, 4, 4, 4,
    /* F5-FF: illegal */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
1757
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001759 Py_ssize_t size,
1760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761{
Walter Dörwald69652032004-09-07 20:24:22 +00001762 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1763}
1764
1765PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001766 Py_ssize_t size,
1767 const char *errors,
1768 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 int n;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001772 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001773 Py_ssize_t startinpos;
1774 Py_ssize_t endinpos;
1775 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 const char *e;
1777 PyUnicodeObject *unicode;
1778 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001779 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 PyObject *errorHandler = NULL;
1781 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782
1783 /* Note: size will always be longer than the resulting Unicode
1784 character count */
1785 unicode = _PyUnicode_New(size);
1786 if (!unicode)
1787 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001788 if (size == 0) {
1789 if (consumed)
1790 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793
1794 /* Unpack UTF-8 encoded data */
1795 p = unicode->str;
1796 e = s + size;
1797
1798 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001799 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800
1801 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001802 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803 s++;
1804 continue;
1805 }
1806
1807 n = utf8_code_length[ch];
1808
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001809 if (s + n > e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001810 if (consumed)
1811 break;
1812 else {
1813 errmsg = "unexpected end of data";
1814 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001815 endinpos = startinpos+1;
1816 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1817 endinpos++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001818 goto utf8Error;
1819 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00001820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821
1822 switch (n) {
1823
1824 case 0:
Ezio Melotti86e5e172010-07-03 05:34:39 +00001825 errmsg = "invalid start byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001826 startinpos = s-starts;
1827 endinpos = startinpos+1;
1828 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829
1830 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "internal error";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001832 startinpos = s-starts;
1833 endinpos = startinpos+1;
1834 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835
1836 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti86e5e172010-07-03 05:34:39 +00001838 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001839 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001840 endinpos = startinpos + 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001841 goto utf8Error;
1842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti86e5e172010-07-03 05:34:39 +00001844 assert ((ch > 0x007F) && (ch <= 0x07FF));
1845 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
1848 case 3:
Ezio Melotti86e5e172010-07-03 05:34:39 +00001849 /* XXX: surrogates shouldn't be valid UTF-8!
1850 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1851 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1852 Uncomment the 2 lines below to make them invalid,
1853 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001854 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti86e5e172010-07-03 05:34:39 +00001855 (s[2] & 0xc0) != 0x80 ||
1856 ((unsigned char)s[0] == 0xE0 &&
1857 (unsigned char)s[1] < 0xA0)/* ||
1858 ((unsigned char)s[0] == 0xED &&
1859 (unsigned char)s[1] > 0x9F)*/) {
1860 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001861 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001862 endinpos = startinpos + 1;
1863
1864 /* if s[1] first two bits are 1 and 0, then the invalid
1865 continuation byte is s[2], so increment endinpos by 1,
1866 if not, s[1] is invalid and endinpos doesn't need to
1867 be incremented. */
1868 if ((s[1] & 0xC0) == 0x80)
1869 endinpos++;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001870 goto utf8Error;
1871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti86e5e172010-07-03 05:34:39 +00001873 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1874 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001875 break;
1876
1877 case 4:
1878 if ((s[1] & 0xc0) != 0x80 ||
1879 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti86e5e172010-07-03 05:34:39 +00001880 (s[3] & 0xc0) != 0x80 ||
1881 ((unsigned char)s[0] == 0xF0 &&
1882 (unsigned char)s[1] < 0x90) ||
1883 ((unsigned char)s[0] == 0xF4 &&
1884 (unsigned char)s[1] > 0x8F)) {
1885 errmsg = "invalid continuation byte";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001886 startinpos = s-starts;
Ezio Melotti86e5e172010-07-03 05:34:39 +00001887 endinpos = startinpos + 1;
1888 if ((s[1] & 0xC0) == 0x80) {
1889 endinpos++;
1890 if ((s[2] & 0xC0) == 0x80)
1891 endinpos++;
1892 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001893 goto utf8Error;
1894 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001895 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti86e5e172010-07-03 05:34:39 +00001896 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1897 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
1898
Fredrik Lundh8f455852001-06-27 18:59:43 +00001899#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001900 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001903
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001904 /* translate from 10000..10FFFF to 0..FFFF */
1905 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001906
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001907 /* high surrogate = top 10 bits added to D800 */
1908 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001909
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001910 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914 }
1915 s += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001916 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001917
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001918 utf8Error:
1919 outpos = p-PyUnicode_AS_UNICODE(unicode);
1920 if (unicode_decode_call_errorhandler(
1921 errors, &errorHandler,
1922 "utf8", errmsg,
1923 starts, size, &startinpos, &endinpos, &exc, &s,
1924 &unicode, &outpos, &p))
1925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 }
Walter Dörwald69652032004-09-07 20:24:22 +00001927 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001928 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001931 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 goto onError;
1933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 Py_XDECREF(errorHandler);
1935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 return (PyObject *)unicode;
1937
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001938 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 Py_XDECREF(errorHandler);
1940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 Py_DECREF(unicode);
1942 return NULL;
1943}
1944
Tim Peters602f7402002-04-27 18:03:26 +00001945/* Allocation strategy: if the string is short, convert into a stack buffer
1946 and allocate exactly as much space needed at the end. Else allocate the
1947 maximum possible needed (4 result bytes per Unicode character), and return
1948 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001949*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001950PyObject *
1951PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00001952 Py_ssize_t size,
1953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954{
Tim Peters602f7402002-04-27 18:03:26 +00001955#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001956
Martin v. Löwis18e16552006-02-15 17:27:45 +00001957 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001958 PyObject *v; /* result string object */
1959 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001960 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001961 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001962 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001963
Tim Peters602f7402002-04-27 18:03:26 +00001964 assert(s != NULL);
1965 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
Tim Peters602f7402002-04-27 18:03:26 +00001967 if (size <= MAX_SHORT_UNICHARS) {
1968 /* Write into the stack buffer; nallocated can't overflow.
1969 * At the end, we'll allocate exactly as much heap space as it
1970 * turns out we need.
1971 */
1972 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1973 v = NULL; /* will allocate after we're done */
1974 p = stackbuf;
1975 }
1976 else {
1977 /* Overallocate on the heap, and give the excess back at the end. */
1978 nallocated = size * 4;
1979 if (nallocated / 4 != size) /* overflow! */
1980 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001981 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001982 if (v == NULL)
1983 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001984 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001985 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001986
Tim Peters602f7402002-04-27 18:03:26 +00001987 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001989
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001990 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001991 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001993
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001995 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001996 *p++ = (char)(0xc0 | (ch >> 6));
1997 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001998 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001999 else {
Tim Peters602f7402002-04-27 18:03:26 +00002000 /* Encode UCS2 Unicode ordinals */
2001 if (ch < 0x10000) {
2002 /* Special case: check for high surrogate */
2003 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2004 Py_UCS4 ch2 = s[i];
2005 /* Check for low surrogate and combine the two to
2006 form a UCS4 value */
2007 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002008 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002009 i++;
2010 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 }
Tim Peters602f7402002-04-27 18:03:26 +00002012 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002013 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002015 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2016 *p++ = (char)(0x80 | (ch & 0x3f));
2017 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002018 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002019 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002020 /* Encode UCS4 Unicode ordinals */
2021 *p++ = (char)(0xf0 | (ch >> 18));
2022 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2023 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2024 *p++ = (char)(0x80 | (ch & 0x3f));
2025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002027
Tim Peters602f7402002-04-27 18:03:26 +00002028 if (v == NULL) {
2029 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002030 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002031 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002032 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002033 }
2034 else {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002035 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002036 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002037 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002038 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002041
Tim Peters602f7402002-04-27 18:03:26 +00002042#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043}
2044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 if (!PyUnicode_Check(unicode)) {
2048 PyErr_BadArgument();
2049 return NULL;
2050 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002051 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002052 PyUnicode_GET_SIZE(unicode),
2053 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054}
2055
Walter Dörwald6e390802007-08-17 16:41:28 +00002056/* --- UTF-32 Codec ------------------------------------------------------- */
2057
2058PyObject *
2059PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002060 Py_ssize_t size,
2061 const char *errors,
2062 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002063{
2064 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2065}
2066
2067PyObject *
2068PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002069 Py_ssize_t size,
2070 const char *errors,
2071 int *byteorder,
2072 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002073{
2074 const char *starts = s;
2075 Py_ssize_t startinpos;
2076 Py_ssize_t endinpos;
2077 Py_ssize_t outpos;
2078 PyUnicodeObject *unicode;
2079 Py_UNICODE *p;
2080#ifndef Py_UNICODE_WIDE
Antoine Pitrou4595e512010-06-11 21:48:02 +00002081 int pairs = 0;
Walter Dörwald6e390802007-08-17 16:41:28 +00002082#else
2083 const int pairs = 0;
2084#endif
Antoine Pitrou4595e512010-06-11 21:48:02 +00002085 const unsigned char *q, *e, *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002086 int bo = 0; /* assume native ordering by default */
2087 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002088 /* Offsets from q for retrieving bytes in the right order. */
2089#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2090 int iorder[] = {0, 1, 2, 3};
2091#else
2092 int iorder[] = {3, 2, 1, 0};
2093#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002094 PyObject *errorHandler = NULL;
2095 PyObject *exc = NULL;
Antoine Pitrou4595e512010-06-11 21:48:02 +00002096
Walter Dörwald6e390802007-08-17 16:41:28 +00002097 q = (unsigned char *)s;
2098 e = q + size;
2099
2100 if (byteorder)
2101 bo = *byteorder;
2102
2103 /* Check for BOM marks (U+FEFF) in the input and adjust current
2104 byte order setting accordingly. In native mode, the leading BOM
2105 mark is skipped, in all other modes, it is copied to the output
2106 stream as-is (giving a ZWNBSP character). */
2107 if (bo == 0) {
2108 if (size >= 4) {
2109 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002110 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002111#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002112 if (bom == 0x0000FEFF) {
2113 q += 4;
2114 bo = -1;
2115 }
2116 else if (bom == 0xFFFE0000) {
2117 q += 4;
2118 bo = 1;
2119 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002120#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002121 if (bom == 0x0000FEFF) {
2122 q += 4;
2123 bo = 1;
2124 }
2125 else if (bom == 0xFFFE0000) {
2126 q += 4;
2127 bo = -1;
2128 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002129#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002130 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002131 }
2132
2133 if (bo == -1) {
2134 /* force LE */
2135 iorder[0] = 0;
2136 iorder[1] = 1;
2137 iorder[2] = 2;
2138 iorder[3] = 3;
2139 }
2140 else if (bo == 1) {
2141 /* force BE */
2142 iorder[0] = 3;
2143 iorder[1] = 2;
2144 iorder[2] = 1;
2145 iorder[3] = 0;
2146 }
2147
Antoine Pitrou4595e512010-06-11 21:48:02 +00002148 /* On narrow builds we split characters outside the BMP into two
2149 codepoints => count how much extra space we need. */
2150#ifndef Py_UNICODE_WIDE
2151 for (qq = q; qq < e; qq += 4)
2152 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2153 pairs++;
2154#endif
2155
2156 /* This might be one to much, because of a BOM */
2157 unicode = _PyUnicode_New((size+3)/4+pairs);
2158 if (!unicode)
2159 return NULL;
2160 if (size == 0)
2161 return (PyObject *)unicode;
2162
2163 /* Unpack UTF-32 encoded data */
2164 p = unicode->str;
2165
Walter Dörwald6e390802007-08-17 16:41:28 +00002166 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002167 Py_UCS4 ch;
2168 /* remaining bytes at the end? (size should be divisible by 4) */
2169 if (e-q<4) {
2170 if (consumed)
2171 break;
2172 errmsg = "truncated data";
2173 startinpos = ((const char *)q)-starts;
2174 endinpos = ((const char *)e)-starts;
2175 goto utf32Error;
2176 /* The remaining input chars are ignored if the callback
2177 chooses to skip the input */
2178 }
2179 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2180 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002181
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002182 if (ch >= 0x110000)
2183 {
2184 errmsg = "codepoint not in range(0x110000)";
2185 startinpos = ((const char *)q)-starts;
2186 endinpos = startinpos+4;
2187 goto utf32Error;
2188 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002189#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002190 if (ch >= 0x10000)
2191 {
2192 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2193 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2194 }
2195 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002196#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002197 *p++ = ch;
2198 q += 4;
2199 continue;
2200 utf32Error:
2201 outpos = p-PyUnicode_AS_UNICODE(unicode);
2202 if (unicode_decode_call_errorhandler(
2203 errors, &errorHandler,
2204 "utf32", errmsg,
Georg Brandlf7a09be2009-09-17 11:33:31 +00002205 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002206 &unicode, &outpos, &p))
2207 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002208 }
2209
2210 if (byteorder)
2211 *byteorder = bo;
2212
2213 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002214 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002215
2216 /* Adjust length */
2217 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2218 goto onError;
2219
2220 Py_XDECREF(errorHandler);
2221 Py_XDECREF(exc);
2222 return (PyObject *)unicode;
2223
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002224 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002225 Py_DECREF(unicode);
2226 Py_XDECREF(errorHandler);
2227 Py_XDECREF(exc);
2228 return NULL;
2229}
2230
2231PyObject *
2232PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002233 Py_ssize_t size,
2234 const char *errors,
2235 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002236{
2237 PyObject *v;
2238 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002239 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002240#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002241 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002242#else
2243 const int pairs = 0;
2244#endif
2245 /* Offsets from p for storing byte pairs in the right order. */
2246#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2247 int iorder[] = {0, 1, 2, 3};
2248#else
2249 int iorder[] = {3, 2, 1, 0};
2250#endif
2251
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002252#define STORECHAR(CH) \
2253 do { \
2254 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2255 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2256 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2257 p[iorder[0]] = (CH) & 0xff; \
2258 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002259 } while(0)
2260
2261 /* In narrow builds we can output surrogate pairs as one codepoint,
2262 so we need less space. */
2263#ifndef Py_UNICODE_WIDE
2264 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002265 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2266 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2267 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002268#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002269 nsize = (size - pairs + (byteorder == 0));
2270 bytesize = nsize * 4;
2271 if (bytesize / 4 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002272 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002273 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002274 if (v == NULL)
2275 return NULL;
2276
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002277 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002278 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002279 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002280 if (size == 0)
2281 return v;
2282
2283 if (byteorder == -1) {
2284 /* force LE */
2285 iorder[0] = 0;
2286 iorder[1] = 1;
2287 iorder[2] = 2;
2288 iorder[3] = 3;
2289 }
2290 else if (byteorder == 1) {
2291 /* force BE */
2292 iorder[0] = 3;
2293 iorder[1] = 2;
2294 iorder[2] = 1;
2295 iorder[3] = 0;
2296 }
2297
2298 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002299 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002300#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002301 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2302 Py_UCS4 ch2 = *s;
2303 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2304 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2305 s++;
2306 size--;
2307 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002308 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002309#endif
2310 STORECHAR(ch);
2311 }
2312 return v;
2313#undef STORECHAR
2314}
2315
2316PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2317{
2318 if (!PyUnicode_Check(unicode)) {
2319 PyErr_BadArgument();
2320 return NULL;
2321 }
2322 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002323 PyUnicode_GET_SIZE(unicode),
2324 NULL,
2325 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002326}
2327
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328/* --- UTF-16 Codec ------------------------------------------------------- */
2329
Tim Peters772747b2001-08-09 22:21:55 +00002330PyObject *
2331PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002332 Py_ssize_t size,
2333 const char *errors,
2334 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335{
Walter Dörwald69652032004-09-07 20:24:22 +00002336 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2337}
2338
2339PyObject *
2340PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002341 Py_ssize_t size,
2342 const char *errors,
2343 int *byteorder,
2344 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002345{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002347 Py_ssize_t startinpos;
2348 Py_ssize_t endinpos;
2349 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 PyUnicodeObject *unicode;
2351 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002352 const unsigned char *q, *e;
2353 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002354 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002355 /* Offsets from q for retrieving byte pairs in the right order. */
2356#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2357 int ihi = 1, ilo = 0;
2358#else
2359 int ihi = 0, ilo = 1;
2360#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 PyObject *errorHandler = NULL;
2362 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363
2364 /* Note: size will always be longer than the resulting Unicode
2365 character count */
2366 unicode = _PyUnicode_New(size);
2367 if (!unicode)
2368 return NULL;
2369 if (size == 0)
2370 return (PyObject *)unicode;
2371
2372 /* Unpack UTF-16 encoded data */
2373 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002374 q = (unsigned char *)s;
2375 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376
2377 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002378 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002380 /* Check for BOM marks (U+FEFF) in the input and adjust current
2381 byte order setting accordingly. In native mode, the leading BOM
2382 mark is skipped, in all other modes, it is copied to the output
2383 stream as-is (giving a ZWNBSP character). */
2384 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002385 if (size >= 2) {
2386 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002387#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002388 if (bom == 0xFEFF) {
2389 q += 2;
2390 bo = -1;
2391 }
2392 else if (bom == 0xFFFE) {
2393 q += 2;
2394 bo = 1;
2395 }
Tim Petersced69f82003-09-16 20:30:58 +00002396#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002397 if (bom == 0xFEFF) {
2398 q += 2;
2399 bo = 1;
2400 }
2401 else if (bom == 0xFFFE) {
2402 q += 2;
2403 bo = -1;
2404 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002405#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002406 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408
Tim Peters772747b2001-08-09 22:21:55 +00002409 if (bo == -1) {
2410 /* force LE */
2411 ihi = 1;
2412 ilo = 0;
2413 }
2414 else if (bo == 1) {
2415 /* force BE */
2416 ihi = 0;
2417 ilo = 1;
2418 }
2419
2420 while (q < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002421 Py_UNICODE ch;
2422 /* remaining bytes at the end? (size should be even) */
2423 if (e-q<2) {
2424 if (consumed)
2425 break;
2426 errmsg = "truncated data";
2427 startinpos = ((const char *)q)-starts;
2428 endinpos = ((const char *)e)-starts;
2429 goto utf16Error;
2430 /* The remaining input chars are ignored if the callback
2431 chooses to skip the input */
2432 }
2433 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002434
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002435 q += 2;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002436
2437 if (ch < 0xD800 || ch > 0xDFFF) {
2438 *p++ = ch;
2439 continue;
2440 }
2441
2442 /* UTF-16 code pair: */
2443 if (q >= e) {
2444 errmsg = "unexpected end of data";
2445 startinpos = (((const char *)q)-2)-starts;
2446 endinpos = ((const char *)e)-starts;
2447 goto utf16Error;
2448 }
2449 if (0xD800 <= ch && ch <= 0xDBFF) {
2450 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2451 q += 2;
2452 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002453#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002454 *p++ = ch;
2455 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002456#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002457 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002458#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002459 continue;
2460 }
2461 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002462 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002463 startinpos = (((const char *)q)-4)-starts;
2464 endinpos = startinpos+2;
2465 goto utf16Error;
2466 }
2467
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002468 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002469 errmsg = "illegal encoding";
2470 startinpos = (((const char *)q)-2)-starts;
2471 endinpos = startinpos+2;
2472 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002473
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002474 utf16Error:
2475 outpos = p-PyUnicode_AS_UNICODE(unicode);
2476 if (unicode_decode_call_errorhandler(
2477 errors, &errorHandler,
2478 "utf16", errmsg,
2479 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2480 &unicode, &outpos, &p))
2481 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
2483
2484 if (byteorder)
2485 *byteorder = bo;
2486
Walter Dörwald69652032004-09-07 20:24:22 +00002487 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002488 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002489
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002491 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 goto onError;
2493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 Py_XDECREF(errorHandler);
2495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 return (PyObject *)unicode;
2497
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002498 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 Py_XDECREF(errorHandler);
2501 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 return NULL;
2503}
2504
Tim Peters772747b2001-08-09 22:21:55 +00002505PyObject *
2506PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002507 Py_ssize_t size,
2508 const char *errors,
2509 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510{
2511 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002512 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002513 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002514#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002515 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002516#else
2517 const int pairs = 0;
2518#endif
Tim Peters772747b2001-08-09 22:21:55 +00002519 /* Offsets from p for storing byte pairs in the right order. */
2520#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2521 int ihi = 1, ilo = 0;
2522#else
2523 int ihi = 0, ilo = 1;
2524#endif
2525
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002526#define STORECHAR(CH) \
2527 do { \
2528 p[ihi] = ((CH) >> 8) & 0xff; \
2529 p[ilo] = (CH) & 0xff; \
2530 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002531 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002533#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002534 for (i = pairs = 0; i < size; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002535 if (s[i] >= 0x10000)
2536 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002537#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002538 /* 2 * (size + pairs + (byteorder == 0)) */
2539 if (size > PY_SSIZE_T_MAX ||
2540 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002541 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002542 nsize = size + pairs + (byteorder == 0);
2543 bytesize = nsize * 2;
2544 if (bytesize / 2 != nsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002545 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002546 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 if (v == NULL)
2548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002550 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 if (byteorder == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002552 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002553 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002554 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002555
2556 if (byteorder == -1) {
2557 /* force LE */
2558 ihi = 1;
2559 ilo = 0;
2560 }
2561 else if (byteorder == 1) {
2562 /* force BE */
2563 ihi = 0;
2564 ilo = 1;
2565 }
2566
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002567 while (size-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002568 Py_UNICODE ch = *s++;
2569 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002570#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002571 if (ch >= 0x10000) {
2572 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2573 ch = 0xD800 | ((ch-0x10000) >> 10);
2574 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002575#endif
Tim Peters772747b2001-08-09 22:21:55 +00002576 STORECHAR(ch);
2577 if (ch2)
2578 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002581#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582}
2583
2584PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2585{
2586 if (!PyUnicode_Check(unicode)) {
2587 PyErr_BadArgument();
2588 return NULL;
2589 }
2590 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002591 PyUnicode_GET_SIZE(unicode),
2592 NULL,
2593 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594}
2595
2596/* --- Unicode Escape Codec ----------------------------------------------- */
2597
Fredrik Lundh06d12682001-01-24 07:59:11 +00002598static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002599
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002601 Py_ssize_t size,
2602 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002604 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002605 Py_ssize_t startinpos;
2606 Py_ssize_t endinpos;
2607 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002612 char* message;
2613 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 PyObject *errorHandler = NULL;
2615 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 /* Escaped strings will always be longer than the resulting
2618 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 length after conversion to the true value.
2620 (but if the error callback returns a long replacement string
2621 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 v = _PyUnicode_New(size);
2623 if (v == NULL)
2624 goto onError;
2625 if (size == 0)
2626 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002630
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 while (s < end) {
2632 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002633 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635
2636 /* Non-escape characters are interpreted as Unicode ordinals */
2637 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002638 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 continue;
2640 }
2641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 /* \ - Escapes */
2644 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002645 c = *s++;
2646 if (s > end)
2647 c = '\0'; /* Invalid after \ */
2648 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002650 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 case '\n': break;
2652 case '\\': *p++ = '\\'; break;
2653 case '\'': *p++ = '\''; break;
2654 case '\"': *p++ = '\"'; break;
2655 case 'b': *p++ = '\b'; break;
2656 case 'f': *p++ = '\014'; break; /* FF */
2657 case 't': *p++ = '\t'; break;
2658 case 'n': *p++ = '\n'; break;
2659 case 'r': *p++ = '\r'; break;
2660 case 'v': *p++ = '\013'; break; /* VT */
2661 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2662
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002663 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 case '0': case '1': case '2': case '3':
2665 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002666 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002667 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002668 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002669 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002670 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002672 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 break;
2674
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002675 /* hex escapes */
2676 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002678 digits = 2;
2679 message = "truncated \\xXX escape";
2680 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002682 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002684 digits = 4;
2685 message = "truncated \\uXXXX escape";
2686 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002688 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002689 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690 digits = 8;
2691 message = "truncated \\UXXXXXXXX escape";
2692 hexescape:
2693 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 outpos = p-PyUnicode_AS_UNICODE(v);
2695 if (s+digits>end) {
2696 endinpos = size;
2697 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002698 errors, &errorHandler,
2699 "unicodeescape", "end of string in escape sequence",
2700 starts, size, &startinpos, &endinpos, &exc, &s,
2701 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002702 goto onError;
2703 goto nextByte;
2704 }
2705 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002706 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002707 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 endinpos = (s+i+1)-starts;
2709 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002710 errors, &errorHandler,
2711 "unicodeescape", message,
2712 starts, size, &startinpos, &endinpos, &exc, &s,
2713 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002714 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002716 }
2717 chr = (chr<<4) & ~0xF;
2718 if (c >= '0' && c <= '9')
2719 chr += c - '0';
2720 else if (c >= 'a' && c <= 'f')
2721 chr += 10 + c - 'a';
2722 else
2723 chr += 10 + c - 'A';
2724 }
2725 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002726 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 /* _decoding_error will have already written into the
2728 target buffer. */
2729 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002730 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002731 /* when we get here, chr is a 32-bit unicode character */
2732 if (chr <= 0xffff)
2733 /* UCS-2 character */
2734 *p++ = (Py_UNICODE) chr;
2735 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002736 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002737 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002738#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002739 *p++ = chr;
2740#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002741 chr -= 0x10000L;
2742 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002743 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002744#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002745 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 endinpos = s-starts;
2747 outpos = p-PyUnicode_AS_UNICODE(v);
2748 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002749 errors, &errorHandler,
2750 "unicodeescape", "illegal Unicode character",
2751 starts, size, &startinpos, &endinpos, &exc, &s,
2752 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002753 goto onError;
2754 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002755 break;
2756
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002757 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758 case 'N':
2759 message = "malformed \\N character escape";
2760 if (ucnhash_CAPI == NULL) {
2761 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002762 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002763 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002764 if (m == NULL)
2765 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002766 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002768 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002769 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002770 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002771 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002772 if (ucnhash_CAPI == NULL)
2773 goto ucnhashError;
2774 }
2775 if (*s == '{') {
2776 const char *start = s+1;
2777 /* look for the closing brace */
2778 while (*s != '}' && s < end)
2779 s++;
2780 if (s > start && s < end && *s == '}') {
2781 /* found a name. look it up in the unicode database */
2782 message = "unknown Unicode character name";
2783 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002784 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 goto store;
2786 }
2787 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 endinpos = s-starts;
2789 outpos = p-PyUnicode_AS_UNICODE(v);
2790 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002791 errors, &errorHandler,
2792 "unicodeescape", message,
2793 starts, size, &startinpos, &endinpos, &exc, &s,
2794 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002796 break;
2797
2798 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002799 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 message = "\\ at end of string";
2801 s--;
2802 endinpos = s-starts;
2803 outpos = p-PyUnicode_AS_UNICODE(v);
2804 if (unicode_decode_call_errorhandler(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002805 errors, &errorHandler,
2806 "unicodeescape", message,
2807 starts, size, &startinpos, &endinpos, &exc, &s,
2808 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002809 goto onError;
2810 }
2811 else {
2812 *p++ = '\\';
2813 *p++ = (unsigned char)s[-1];
2814 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002815 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002817 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002820 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002822 Py_XDECREF(errorHandler);
2823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002825
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002826 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002827 PyErr_SetString(
2828 PyExc_UnicodeError,
2829 "\\N escapes not supported (can't load unicodedata module)"
2830 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002831 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 Py_XDECREF(errorHandler);
2833 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002834 return NULL;
2835
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002836 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 Py_XDECREF(errorHandler);
2839 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 return NULL;
2841}
2842
2843/* Return a Unicode-Escape string version of the Unicode object.
2844
2845 If quotes is true, the string is enclosed in u"" or u'' quotes as
2846 appropriate.
2847
2848*/
2849
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002850Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002851 Py_ssize_t size,
2852 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002853{
2854 /* like wcschr, but doesn't stop at NULL characters */
2855
2856 while (size-- > 0) {
2857 if (*s == ch)
2858 return s;
2859 s++;
2860 }
2861
2862 return NULL;
2863}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865static
2866PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002867 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 int quotes)
2869{
2870 PyObject *repr;
2871 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002873 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002874#ifdef Py_UNICODE_WIDE
2875 const Py_ssize_t expandsize = 10;
2876#else
2877 const Py_ssize_t expandsize = 6;
2878#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879
Neal Norwitz17753ec2006-08-21 22:21:19 +00002880 /* XXX(nnorwitz): rather than over-allocating, it would be
2881 better to choose a different scheme. Perhaps scan the
2882 first N-chars of the string and allocate based on that size.
2883 */
2884 /* Initial allocation is based on the longest-possible unichr
2885 escape.
2886
2887 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2888 unichr, so in this case it's the longest unichr escape. In
2889 narrow (UTF-16) builds this is five chars per source unichr
2890 since there are two unichrs in the surrogate pair, so in narrow
2891 (UTF-16) builds it's not the longest unichr escape.
2892
2893 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2894 so in the narrow (UTF-16) build case it's the longest unichr
2895 escape.
2896 */
2897
Neal Norwitze7d8be82008-07-31 17:17:14 +00002898 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002899 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002900
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002901 repr = PyString_FromStringAndSize(NULL,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002902 2
2903 + expandsize*size
2904 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 if (repr == NULL)
2906 return NULL;
2907
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002908 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909
2910 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002912 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 !findchar(s, size, '"')) ? '"' : '\'';
2914 }
2915 while (size-- > 0) {
2916 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002917
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002918 /* Escape quotes and backslashes */
2919 if ((quotes &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002920 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 *p++ = '\\';
2922 *p++ = (char) ch;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002923 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002924 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002925
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002926#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002927 /* Map 21-bit characters to '\U00xxxxxx' */
2928 else if (ch >= 0x10000) {
2929 *p++ = '\\';
2930 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002931 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2932 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2933 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2934 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2935 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2936 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2937 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002938 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002939 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002940 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002941#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002942 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2943 else if (ch >= 0xD800 && ch < 0xDC00) {
2944 Py_UNICODE ch2;
2945 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002946
Benjamin Peterson339f8c62009-01-31 22:25:08 +00002947 ch2 = *s++;
2948 size--;
2949 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2950 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2951 *p++ = '\\';
2952 *p++ = 'U';
2953 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2954 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2955 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2956 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2957 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2958 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2959 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2960 *p++ = hexdigit[ucs & 0x0000000F];
2961 continue;
2962 }
2963 /* Fall through: isolated surrogates are copied as-is */
2964 s--;
2965 size++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00002966 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002967#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002968
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002970 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 *p++ = '\\';
2972 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002973 *p++ = hexdigit[(ch >> 12) & 0x000F];
2974 *p++ = hexdigit[(ch >> 8) & 0x000F];
2975 *p++ = hexdigit[(ch >> 4) & 0x000F];
2976 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002978
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002979 /* Map special whitespace to '\t', \n', '\r' */
2980 else if (ch == '\t') {
2981 *p++ = '\\';
2982 *p++ = 't';
2983 }
2984 else if (ch == '\n') {
2985 *p++ = '\\';
2986 *p++ = 'n';
2987 }
2988 else if (ch == '\r') {
2989 *p++ = '\\';
2990 *p++ = 'r';
2991 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002993 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002994 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002996 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002997 *p++ = hexdigit[(ch >> 4) & 0x000F];
2998 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002999 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003000
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 /* Copy everything else as-is */
3002 else
3003 *p++ = (char) ch;
3004 }
3005 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003006 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007
3008 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003009 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010 return repr;
3011}
3012
3013PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003014 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015{
3016 return unicodeescape_string(s, size, 0);
3017}
3018
3019PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3020{
3021 if (!PyUnicode_Check(unicode)) {
3022 PyErr_BadArgument();
3023 return NULL;
3024 }
3025 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003026 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027}
3028
3029/* --- Raw Unicode Escape Codec ------------------------------------------- */
3030
3031PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003032 Py_ssize_t size,
3033 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003036 Py_ssize_t startinpos;
3037 Py_ssize_t endinpos;
3038 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 const char *end;
3042 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 PyObject *errorHandler = NULL;
3044 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003045
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 /* Escaped strings will always be longer than the resulting
3047 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 length after conversion to the true value. (But decoding error
3049 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 v = _PyUnicode_New(size);
3051 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003054 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 end = s + size;
3057 while (s < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003058 unsigned char c;
3059 Py_UCS4 x;
3060 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003061 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003063 /* Non-escape characters are interpreted as Unicode ordinals */
3064 if (*s != '\\') {
3065 *p++ = (unsigned char)*s++;
3066 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003067 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003068 startinpos = s-starts;
3069
3070 /* \u-escapes are only interpreted iff the number of leading
3071 backslashes if odd */
3072 bs = s;
3073 for (;s < end;) {
3074 if (*s != '\\')
3075 break;
3076 *p++ = (unsigned char)*s++;
3077 }
3078 if (((s - bs) & 1) == 0 ||
3079 s >= end ||
3080 (*s != 'u' && *s != 'U')) {
3081 continue;
3082 }
3083 p--;
3084 count = *s=='u' ? 4 : 8;
3085 s++;
3086
3087 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3088 outpos = p-PyUnicode_AS_UNICODE(v);
3089 for (x = 0, i = 0; i < count; ++i, ++s) {
3090 c = (unsigned char)*s;
3091 if (!isxdigit(c)) {
3092 endinpos = s-starts;
3093 if (unicode_decode_call_errorhandler(
3094 errors, &errorHandler,
3095 "rawunicodeescape", "truncated \\uXXXX",
3096 starts, size, &startinpos, &endinpos, &exc, &s,
3097 &v, &outpos, &p))
3098 goto onError;
3099 goto nextByte;
3100 }
3101 x = (x<<4) & ~0xF;
3102 if (c >= '0' && c <= '9')
3103 x += c - '0';
3104 else if (c >= 'a' && c <= 'f')
3105 x += 10 + c - 'a';
3106 else
3107 x += 10 + c - 'A';
3108 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003109 if (x <= 0xffff)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003110 /* UCS-2 character */
3111 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003112 else if (x <= 0x10ffff) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003113 /* UCS-4 character. Either store directly, or as
3114 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003115#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003116 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003117#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003118 x -= 0x10000L;
3119 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3120 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003121#endif
3122 } else {
3123 endinpos = s-starts;
3124 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003125 if (unicode_decode_call_errorhandler(
3126 errors, &errorHandler,
3127 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003128 starts, size, &startinpos, &endinpos, &exc, &s,
3129 &v, &outpos, &p))
3130 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003131 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003132 nextByte:
3133 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003135 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003136 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003137 Py_XDECREF(errorHandler);
3138 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003140
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003141 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 Py_XDECREF(errorHandler);
3144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 return NULL;
3146}
3147
3148PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003149 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150{
3151 PyObject *repr;
3152 char *p;
3153 char *q;
3154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003155 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003156#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003157 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003159 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003160#endif
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003161
Neal Norwitze7d8be82008-07-31 17:17:14 +00003162 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003163 return PyErr_NoMemory();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003164
Neal Norwitze7d8be82008-07-31 17:17:14 +00003165 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 if (repr == NULL)
3167 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003168 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003169 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003171 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 while (size-- > 0) {
3173 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003174#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003175 /* Map 32-bit characters to '\Uxxxxxxxx' */
3176 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177 *p++ = '\\';
3178 *p++ = 'U';
3179 *p++ = hexdigit[(ch >> 28) & 0xf];
3180 *p++ = hexdigit[(ch >> 24) & 0xf];
3181 *p++ = hexdigit[(ch >> 20) & 0xf];
3182 *p++ = hexdigit[(ch >> 16) & 0xf];
3183 *p++ = hexdigit[(ch >> 12) & 0xf];
3184 *p++ = hexdigit[(ch >> 8) & 0xf];
3185 *p++ = hexdigit[(ch >> 4) & 0xf];
3186 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003187 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003188 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003189#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003190 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3191 if (ch >= 0xD800 && ch < 0xDC00) {
3192 Py_UNICODE ch2;
3193 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003194
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003195 ch2 = *s++;
3196 size--;
3197 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3198 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3199 *p++ = '\\';
3200 *p++ = 'U';
3201 *p++ = hexdigit[(ucs >> 28) & 0xf];
3202 *p++ = hexdigit[(ucs >> 24) & 0xf];
3203 *p++ = hexdigit[(ucs >> 20) & 0xf];
3204 *p++ = hexdigit[(ucs >> 16) & 0xf];
3205 *p++ = hexdigit[(ucs >> 12) & 0xf];
3206 *p++ = hexdigit[(ucs >> 8) & 0xf];
3207 *p++ = hexdigit[(ucs >> 4) & 0xf];
3208 *p++ = hexdigit[ucs & 0xf];
3209 continue;
3210 }
3211 /* Fall through: isolated surrogates are copied as-is */
3212 s--;
3213 size++;
3214 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003215#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003216 /* Map 16-bit characters to '\uxxxx' */
3217 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 *p++ = '\\';
3219 *p++ = 'u';
3220 *p++ = hexdigit[(ch >> 12) & 0xf];
3221 *p++ = hexdigit[(ch >> 8) & 0xf];
3222 *p++ = hexdigit[(ch >> 4) & 0xf];
3223 *p++ = hexdigit[ch & 15];
3224 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003225 /* Copy everything else as-is */
3226 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 *p++ = (char) ch;
3228 }
3229 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003230 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 return repr;
3232}
3233
3234PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3235{
3236 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003237 PyErr_BadArgument();
3238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 }
3240 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003241 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242}
3243
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003244/* --- Unicode Internal Codec ------------------------------------------- */
3245
3246PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003247 Py_ssize_t size,
3248 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003249{
3250 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003251 Py_ssize_t startinpos;
3252 Py_ssize_t endinpos;
3253 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003254 PyUnicodeObject *v;
3255 Py_UNICODE *p;
3256 const char *end;
3257 const char *reason;
3258 PyObject *errorHandler = NULL;
3259 PyObject *exc = NULL;
3260
Neal Norwitzd43069c2006-01-08 01:12:10 +00003261#ifdef Py_UNICODE_WIDE
3262 Py_UNICODE unimax = PyUnicode_GetMax();
3263#endif
3264
Armin Rigo7ccbca92006-10-04 12:17:45 +00003265 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003266 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3267 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003268 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003269 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003270 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003271 p = PyUnicode_AS_UNICODE(v);
3272 end = s + size;
3273
3274 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003275 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003276 /* We have to sanity check the raw data, otherwise doom looms for
3277 some malformed UCS-4 data. */
3278 if (
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003279#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003280 *p > unimax || *p < 0 ||
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003281#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003282 end-s < Py_UNICODE_SIZE
3283 )
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003284 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003285 startinpos = s - starts;
3286 if (end-s < Py_UNICODE_SIZE) {
3287 endinpos = end-starts;
3288 reason = "truncated input";
3289 }
3290 else {
3291 endinpos = s - starts + Py_UNICODE_SIZE;
3292 reason = "illegal code point (> 0x10FFFF)";
3293 }
3294 outpos = p - PyUnicode_AS_UNICODE(v);
3295 if (unicode_decode_call_errorhandler(
3296 errors, &errorHandler,
3297 "unicode_internal", reason,
3298 starts, size, &startinpos, &endinpos, &exc, &s,
Benjamin Peterson828a7062008-12-27 17:05:29 +00003299 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003300 goto onError;
3301 }
3302 }
3303 else {
3304 p++;
3305 s += Py_UNICODE_SIZE;
3306 }
3307 }
3308
Martin v. Löwis412fb672006-04-13 06:34:32 +00003309 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003310 goto onError;
3311 Py_XDECREF(errorHandler);
3312 Py_XDECREF(exc);
3313 return (PyObject *)v;
3314
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003315 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003316 Py_XDECREF(v);
3317 Py_XDECREF(errorHandler);
3318 Py_XDECREF(exc);
3319 return NULL;
3320}
3321
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322/* --- Latin-1 Codec ------------------------------------------------------ */
3323
3324PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003325 Py_ssize_t size,
3326 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327{
3328 PyUnicodeObject *v;
3329 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003330
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003332 if (size == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003333 Py_UNICODE r = *(unsigned char*)s;
3334 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003335 }
3336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 v = _PyUnicode_New(size);
3338 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003339 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003341 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 p = PyUnicode_AS_UNICODE(v);
3343 while (size-- > 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003344 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003346
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003347 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 Py_XDECREF(v);
3349 return NULL;
3350}
3351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352/* create or adjust a UnicodeEncodeError */
3353static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003354 const char *encoding,
3355 const Py_UNICODE *unicode, Py_ssize_t size,
3356 Py_ssize_t startpos, Py_ssize_t endpos,
3357 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (*exceptionObject == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003360 *exceptionObject = PyUnicodeEncodeError_Create(
3361 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 }
3363 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003364 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3365 goto onError;
3366 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3367 goto onError;
3368 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3369 goto onError;
3370 return;
3371 onError:
3372 Py_DECREF(*exceptionObject);
3373 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 }
3375}
3376
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377/* raises a UnicodeEncodeError */
3378static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003379 const char *encoding,
3380 const Py_UNICODE *unicode, Py_ssize_t size,
3381 Py_ssize_t startpos, Py_ssize_t endpos,
3382 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383{
3384 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003385 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003387 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388}
3389
3390/* error handling callback helper:
3391 build arguments, call the callback and check the arguments,
3392 put the result into newpos and return the replacement string, which
3393 has to be freed by the caller */
3394static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003395 PyObject **errorHandler,
3396 const char *encoding, const char *reason,
3397 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3398 Py_ssize_t startpos, Py_ssize_t endpos,
3399 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003401 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402
3403 PyObject *restuple;
3404 PyObject *resunicode;
3405
3406 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003407 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 }
3411
3412 make_encode_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003413 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416
3417 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003418 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00003422 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003423 Py_DECREF(restuple);
3424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 }
3426 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003427 &resunicode, newpos)) {
3428 Py_DECREF(restuple);
3429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 }
3431 if (*newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003432 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003433 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003434 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3435 Py_DECREF(restuple);
3436 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003437 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 Py_INCREF(resunicode);
3439 Py_DECREF(restuple);
3440 return resunicode;
3441}
3442
3443static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003444 Py_ssize_t size,
3445 const char *errors,
3446 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447{
3448 /* output object */
3449 PyObject *res;
3450 /* pointers to the beginning and end+1 of input */
3451 const Py_UNICODE *startp = p;
3452 const Py_UNICODE *endp = p + size;
3453 /* pointer to the beginning of the unencodable characters */
3454 /* const Py_UNICODE *badp = NULL; */
3455 /* pointer into the output */
3456 char *str;
3457 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003458 Py_ssize_t respos = 0;
3459 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003460 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3461 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 PyObject *errorHandler = NULL;
3463 PyObject *exc = NULL;
3464 /* the following variable is used for caching string comparisons
3465 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3466 int known_errorHandler = -1;
3467
3468 /* allocate enough for a simple encoding without
3469 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003470 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 if (res == NULL)
3472 goto onError;
3473 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003474 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003475 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 ressize = size;
3477
3478 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003479 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003481 /* can we encode this? */
3482 if (c<limit) {
3483 /* no overflow check, because we know that the space is enough */
3484 *str++ = (char)c;
3485 ++p;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003486 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003487 else {
3488 Py_ssize_t unicodepos = p-startp;
3489 Py_ssize_t requiredsize;
3490 PyObject *repunicode;
3491 Py_ssize_t repsize;
3492 Py_ssize_t newpos;
3493 Py_ssize_t respos;
3494 Py_UNICODE *uni2;
3495 /* startpos for collecting unencodable chars */
3496 const Py_UNICODE *collstart = p;
3497 const Py_UNICODE *collend = p;
3498 /* find all unecodable characters */
3499 while ((collend < endp) && ((*collend)>=limit))
3500 ++collend;
3501 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3502 if (known_errorHandler==-1) {
3503 if ((errors==NULL) || (!strcmp(errors, "strict")))
3504 known_errorHandler = 1;
3505 else if (!strcmp(errors, "replace"))
3506 known_errorHandler = 2;
3507 else if (!strcmp(errors, "ignore"))
3508 known_errorHandler = 3;
3509 else if (!strcmp(errors, "xmlcharrefreplace"))
3510 known_errorHandler = 4;
3511 else
3512 known_errorHandler = 0;
3513 }
3514 switch (known_errorHandler) {
3515 case 1: /* strict */
3516 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3517 goto onError;
3518 case 2: /* replace */
3519 while (collstart++<collend)
3520 *str++ = '?'; /* fall through */
3521 case 3: /* ignore */
3522 p = collend;
3523 break;
3524 case 4: /* xmlcharrefreplace */
3525 respos = str-PyString_AS_STRING(res);
3526 /* determine replacement size (temporarily (mis)uses p) */
3527 for (p = collstart, repsize = 0; p < collend; ++p) {
3528 if (*p<10)
3529 repsize += 2+1+1;
3530 else if (*p<100)
3531 repsize += 2+2+1;
3532 else if (*p<1000)
3533 repsize += 2+3+1;
3534 else if (*p<10000)
3535 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003536#ifndef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003537 else
3538 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003539#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003540 else if (*p<100000)
3541 repsize += 2+5+1;
3542 else if (*p<1000000)
3543 repsize += 2+6+1;
3544 else
3545 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003546#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003547 }
3548 requiredsize = respos+repsize+(endp-collend);
3549 if (requiredsize > ressize) {
3550 if (requiredsize<2*ressize)
3551 requiredsize = 2*ressize;
3552 if (_PyString_Resize(&res, requiredsize))
3553 goto onError;
3554 str = PyString_AS_STRING(res) + respos;
3555 ressize = requiredsize;
3556 }
3557 /* generate replacement (temporarily (mis)uses p) */
3558 for (p = collstart; p < collend; ++p) {
3559 str += sprintf(str, "&#%d;", (int)*p);
3560 }
3561 p = collend;
3562 break;
3563 default:
3564 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3565 encoding, reason, startp, size, &exc,
3566 collstart-startp, collend-startp, &newpos);
3567 if (repunicode == NULL)
3568 goto onError;
3569 /* need more space? (at least enough for what we
3570 have+the replacement+the rest of the string, so
3571 we won't have to check space for encodable characters) */
3572 respos = str-PyString_AS_STRING(res);
3573 repsize = PyUnicode_GET_SIZE(repunicode);
3574 requiredsize = respos+repsize+(endp-collend);
3575 if (requiredsize > ressize) {
3576 if (requiredsize<2*ressize)
3577 requiredsize = 2*ressize;
3578 if (_PyString_Resize(&res, requiredsize)) {
3579 Py_DECREF(repunicode);
3580 goto onError;
3581 }
3582 str = PyString_AS_STRING(res) + respos;
3583 ressize = requiredsize;
3584 }
3585 /* check if there is anything unencodable in the replacement
3586 and copy it to the output */
3587 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3588 c = *uni2;
3589 if (c >= limit) {
3590 raise_encode_exception(&exc, encoding, startp, size,
3591 unicodepos, unicodepos+1, reason);
3592 Py_DECREF(repunicode);
3593 goto onError;
3594 }
3595 *str = (char)c;
3596 }
3597 p = startp + newpos;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003598 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003599 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003600 }
3601 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003603 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 if (respos<ressize)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003605 /* If this falls res will be NULL */
3606 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 Py_XDECREF(errorHandler);
3608 Py_XDECREF(exc);
3609 return res;
3610
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003611 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 Py_XDECREF(res);
3613 Py_XDECREF(errorHandler);
3614 Py_XDECREF(exc);
3615 return NULL;
3616}
3617
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003619 Py_ssize_t size,
3620 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623}
3624
3625PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3626{
3627 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003628 PyErr_BadArgument();
3629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 }
3631 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003632 PyUnicode_GET_SIZE(unicode),
3633 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634}
3635
3636/* --- 7-bit ASCII Codec -------------------------------------------------- */
3637
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003639 Py_ssize_t size,
3640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 PyUnicodeObject *v;
3644 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003645 Py_ssize_t startinpos;
3646 Py_ssize_t endinpos;
3647 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 const char *e;
3649 PyObject *errorHandler = NULL;
3650 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003653 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003654 Py_UNICODE r = *(unsigned char*)s;
3655 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003656 }
Tim Petersced69f82003-09-16 20:30:58 +00003657
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 v = _PyUnicode_New(size);
3659 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003662 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 e = s + size;
3665 while (s < e) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003666 register unsigned char c = (unsigned char)*s;
3667 if (c < 128) {
3668 *p++ = c;
3669 ++s;
3670 }
3671 else {
3672 startinpos = s-starts;
3673 endinpos = startinpos + 1;
3674 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3675 if (unicode_decode_call_errorhandler(
3676 errors, &errorHandler,
3677 "ascii", "ordinal not in range(128)",
3678 starts, size, &startinpos, &endinpos, &exc, &s,
3679 &v, &outpos, &p))
3680 goto onError;
3681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003683 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003684 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3685 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_XDECREF(errorHandler);
3687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003689
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003690 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 Py_XDECREF(errorHandler);
3693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 return NULL;
3695}
3696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003698 Py_ssize_t size,
3699 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702}
3703
3704PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3705{
3706 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003707 PyErr_BadArgument();
3708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 }
3710 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003711 PyUnicode_GET_SIZE(unicode),
3712 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713}
3714
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003715#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003716
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003717/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003718
Hirokazu Yamamoto68e075e2009-03-21 13:04:41 +00003719#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003720#define NEED_RETRY
3721#endif
3722
3723/* XXX This code is limited to "true" double-byte encodings, as
3724 a) it assumes an incomplete character consists of a single byte, and
3725 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003726 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003727
/* Return 1 if the byte at s[offset] is reported as a lead byte by
   IsDBCSLeadByte() and at least one of the following holds:
     - CharPrev(s, curr) returns curr itself,
     - the byte at the previous character position is not a lead byte,
     - the previous character is two bytes wide.
   Return 0 in every other case. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3738
3739/*
3740 * Decode MBCS string into unicode object. If 'final' is set, converts
3741 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3742 */
3743static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003744 const char *s, /* MBCS string */
3745 int size, /* sizeof MBCS string */
3746 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003747{
3748 Py_UNICODE *p;
3749 Py_ssize_t n = 0;
3750 int usize = 0;
3751
3752 assert(size >= 0);
3753
3754 /* Skip trailing lead-byte unless 'final' is set */
3755 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003756 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003757
3758 /* First get the size of the result */
3759 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003760 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3761 if (usize == 0) {
3762 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3763 return -1;
3764 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003765 }
3766
3767 if (*v == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003768 /* Create unicode object */
3769 *v = _PyUnicode_New(usize);
3770 if (*v == NULL)
3771 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003772 }
3773 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003774 /* Extend unicode object */
3775 n = PyUnicode_GET_SIZE(*v);
3776 if (_PyUnicode_Resize(v, n + usize) < 0)
3777 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003778 }
3779
3780 /* Do the conversion */
3781 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003782 p = PyUnicode_AS_UNICODE(*v) + n;
3783 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3784 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3785 return -1;
3786 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003787 }
3788
3789 return size;
3790}
3791
3792PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003793 Py_ssize_t size,
3794 const char *errors,
3795 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003796{
3797 PyUnicodeObject *v = NULL;
3798 int done;
3799
3800 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003801 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003802
3803#ifdef NEED_RETRY
3804 retry:
3805 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003806 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003807 else
3808#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003809 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003810
3811 if (done < 0) {
3812 Py_XDECREF(v);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003813 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003814 }
3815
3816 if (consumed)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003817 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003818
3819#ifdef NEED_RETRY
3820 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003821 s += done;
3822 size -= done;
3823 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003824 }
3825#endif
3826
3827 return (PyObject *)v;
3828}
3829
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003830PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003831 Py_ssize_t size,
3832 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003833{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003834 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3835}
3836
3837/*
3838 * Convert unicode into string object (MBCS).
3839 * Returns 0 if succeed, -1 otherwise.
3840 */
3841static int encode_mbcs(PyObject **repr,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003842 const Py_UNICODE *p, /* unicode */
3843 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003844{
3845 int mbcssize = 0;
3846 Py_ssize_t n = 0;
3847
3848 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003849
3850 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003852 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3853 if (mbcssize == 0) {
3854 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3855 return -1;
3856 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003857 }
3858
Martin v. Löwisd8251432006-06-14 05:21:04 +00003859 if (*repr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003860 /* Create string object */
3861 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3862 if (*repr == NULL)
3863 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003864 }
3865 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003866 /* Extend string object */
3867 n = PyString_Size(*repr);
3868 if (_PyString_Resize(repr, n + mbcssize) < 0)
3869 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003870 }
3871
3872 /* Do the conversion */
3873 if (size > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003874 char *s = PyString_AS_STRING(*repr) + n;
3875 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3876 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3877 return -1;
3878 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003879 }
3880
3881 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003882}
3883
3884PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003885 Py_ssize_t size,
3886 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003887{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003888 PyObject *repr = NULL;
3889 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003890
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891#ifdef NEED_RETRY
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003892 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 if (size > INT_MAX)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003894 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895 else
3896#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003897 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003898
Martin v. Löwisd8251432006-06-14 05:21:04 +00003899 if (ret < 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003900 Py_XDECREF(repr);
3901 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003902 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003903
3904#ifdef NEED_RETRY
3905 if (size > INT_MAX) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003906 p += INT_MAX;
3907 size -= INT_MAX;
3908 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003909 }
3910#endif
3911
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003912 return repr;
3913}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003914
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003915PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3916{
3917 if (!PyUnicode_Check(unicode)) {
3918 PyErr_BadArgument();
3919 return NULL;
3920 }
3921 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003922 PyUnicode_GET_SIZE(unicode),
3923 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003924}
3925
Martin v. Löwisd8251432006-06-14 05:21:04 +00003926#undef NEED_RETRY
3927
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003928#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003929
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930/* --- Character Mapping Codec -------------------------------------------- */
3931
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003933 Py_ssize_t size,
3934 PyObject *mapping,
3935 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003938 Py_ssize_t startinpos;
3939 Py_ssize_t endinpos;
3940 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 PyUnicodeObject *v;
3943 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003944 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 PyObject *errorHandler = NULL;
3946 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003947 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003948 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003949
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 /* Default to Latin-1 */
3951 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003952 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953
3954 v = _PyUnicode_New(size);
3955 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003958 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003961 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003962 mapstring = PyUnicode_AS_UNICODE(mapping);
3963 maplen = PyUnicode_GET_SIZE(mapping);
3964 while (s < e) {
3965 unsigned char ch = *s;
3966 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003968 if (ch < maplen)
3969 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003971 if (x == 0xfffe) {
3972 /* undefined mapping */
3973 outpos = p-PyUnicode_AS_UNICODE(v);
3974 startinpos = s-starts;
3975 endinpos = startinpos+1;
3976 if (unicode_decode_call_errorhandler(
3977 errors, &errorHandler,
3978 "charmap", "character maps to <undefined>",
3979 starts, size, &startinpos, &endinpos, &exc, &s,
3980 &v, &outpos, &p)) {
3981 goto onError;
3982 }
3983 continue;
3984 }
3985 *p++ = x;
3986 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00003987 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003988 }
3989 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003990 while (s < e) {
3991 unsigned char ch = *s;
3992 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003993
Benjamin Peterson339f8c62009-01-31 22:25:08 +00003994 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3995 w = PyInt_FromLong((long)ch);
3996 if (w == NULL)
3997 goto onError;
3998 x = PyObject_GetItem(mapping, w);
3999 Py_DECREF(w);
4000 if (x == NULL) {
4001 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4002 /* No mapping found means: mapping is undefined. */
4003 PyErr_Clear();
4004 x = Py_None;
4005 Py_INCREF(x);
4006 } else
4007 goto onError;
4008 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004009
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004010 /* Apply mapping */
4011 if (PyInt_Check(x)) {
4012 long value = PyInt_AS_LONG(x);
4013 if (value < 0 || value > 65535) {
4014 PyErr_SetString(PyExc_TypeError,
4015 "character mapping must be in range(65536)");
4016 Py_DECREF(x);
4017 goto onError;
4018 }
4019 *p++ = (Py_UNICODE)value;
4020 }
4021 else if (x == Py_None) {
4022 /* undefined mapping */
4023 outpos = p-PyUnicode_AS_UNICODE(v);
4024 startinpos = s-starts;
4025 endinpos = startinpos+1;
4026 if (unicode_decode_call_errorhandler(
4027 errors, &errorHandler,
4028 "charmap", "character maps to <undefined>",
4029 starts, size, &startinpos, &endinpos, &exc, &s,
4030 &v, &outpos, &p)) {
4031 Py_DECREF(x);
4032 goto onError;
4033 }
4034 Py_DECREF(x);
4035 continue;
4036 }
4037 else if (PyUnicode_Check(x)) {
4038 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004039
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004040 if (targetsize == 1)
4041 /* 1-1 mapping */
4042 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004043
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004044 else if (targetsize > 1) {
4045 /* 1-n mapping */
4046 if (targetsize > extrachars) {
4047 /* resize first */
4048 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4049 Py_ssize_t needed = (targetsize - extrachars) + \
4050 (targetsize << 2);
4051 extrachars += needed;
4052 /* XXX overflow detection missing */
4053 if (_PyUnicode_Resize(&v,
4054 PyUnicode_GET_SIZE(v) + needed) < 0) {
4055 Py_DECREF(x);
4056 goto onError;
4057 }
4058 p = PyUnicode_AS_UNICODE(v) + oldpos;
4059 }
4060 Py_UNICODE_COPY(p,
4061 PyUnicode_AS_UNICODE(x),
4062 targetsize);
4063 p += targetsize;
4064 extrachars -= targetsize;
4065 }
4066 /* 1-0 mapping: skip the character */
4067 }
4068 else {
4069 /* wrong return value */
4070 PyErr_SetString(PyExc_TypeError,
4071 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004072 Py_DECREF(x);
4073 goto onError;
4074 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004075 Py_DECREF(x);
4076 ++s;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 }
4079 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004080 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 Py_XDECREF(errorHandler);
4083 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004085
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004086 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 Py_XDECREF(errorHandler);
4088 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 Py_XDECREF(v);
4090 return NULL;
4091}
4092
Martin v. Löwis3f767792006-06-04 19:36:28 +00004093/* Charmap encoding: the lookup table */
4094
4095struct encoding_map{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004096 PyObject_HEAD
4097 unsigned char level1[32];
4098 int count2, count3;
4099 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004100};
4101
4102static PyObject*
4103encoding_map_size(PyObject *obj, PyObject* args)
4104{
4105 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004106 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004107 128*map->count3);
4108}
4109
4110static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004111 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004112 PyDoc_STR("Return the size (in bytes) of this object") },
4113 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004114};
4115
4116static void
4117encoding_map_dealloc(PyObject* o)
4118{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004119 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004120}
4121
4122static PyTypeObject EncodingMapType = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004123 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004124 "EncodingMap", /*tp_name*/
4125 sizeof(struct encoding_map), /*tp_basicsize*/
4126 0, /*tp_itemsize*/
4127 /* methods */
4128 encoding_map_dealloc, /*tp_dealloc*/
4129 0, /*tp_print*/
4130 0, /*tp_getattr*/
4131 0, /*tp_setattr*/
4132 0, /*tp_compare*/
4133 0, /*tp_repr*/
4134 0, /*tp_as_number*/
4135 0, /*tp_as_sequence*/
4136 0, /*tp_as_mapping*/
4137 0, /*tp_hash*/
4138 0, /*tp_call*/
4139 0, /*tp_str*/
4140 0, /*tp_getattro*/
4141 0, /*tp_setattro*/
4142 0, /*tp_as_buffer*/
4143 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4144 0, /*tp_doc*/
4145 0, /*tp_traverse*/
4146 0, /*tp_clear*/
4147 0, /*tp_richcompare*/
4148 0, /*tp_weaklistoffset*/
4149 0, /*tp_iter*/
4150 0, /*tp_iternext*/
4151 encoding_map_methods, /*tp_methods*/
4152 0, /*tp_members*/
4153 0, /*tp_getset*/
4154 0, /*tp_base*/
4155 0, /*tp_dict*/
4156 0, /*tp_descr_get*/
4157 0, /*tp_descr_set*/
4158 0, /*tp_dictoffset*/
4159 0, /*tp_init*/
4160 0, /*tp_alloc*/
4161 0, /*tp_new*/
4162 0, /*tp_free*/
4163 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004164};
4165
4166PyObject*
4167PyUnicode_BuildEncodingMap(PyObject* string)
4168{
4169 Py_UNICODE *decode;
4170 PyObject *result;
4171 struct encoding_map *mresult;
4172 int i;
4173 int need_dict = 0;
4174 unsigned char level1[32];
4175 unsigned char level2[512];
4176 unsigned char *mlevel1, *mlevel2, *mlevel3;
4177 int count2 = 0, count3 = 0;
4178
4179 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4180 PyErr_BadArgument();
4181 return NULL;
4182 }
4183 decode = PyUnicode_AS_UNICODE(string);
4184 memset(level1, 0xFF, sizeof level1);
4185 memset(level2, 0xFF, sizeof level2);
4186
4187 /* If there isn't a one-to-one mapping of NULL to \0,
4188 or if there are non-BMP characters, we need to use
4189 a mapping dictionary. */
4190 if (decode[0] != 0)
4191 need_dict = 1;
4192 for (i = 1; i < 256; i++) {
4193 int l1, l2;
4194 if (decode[i] == 0
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004195#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004196 || decode[i] > 0xFFFF
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004197#endif
4198 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004199 need_dict = 1;
4200 break;
4201 }
4202 if (decode[i] == 0xFFFE)
4203 /* unmapped character */
4204 continue;
4205 l1 = decode[i] >> 11;
4206 l2 = decode[i] >> 7;
4207 if (level1[l1] == 0xFF)
4208 level1[l1] = count2++;
4209 if (level2[l2] == 0xFF)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004210 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004211 }
4212
4213 if (count2 >= 0xFF || count3 >= 0xFF)
4214 need_dict = 1;
4215
4216 if (need_dict) {
4217 PyObject *result = PyDict_New();
4218 PyObject *key, *value;
4219 if (!result)
4220 return NULL;
4221 for (i = 0; i < 256; i++) {
4222 key = value = NULL;
4223 key = PyInt_FromLong(decode[i]);
4224 value = PyInt_FromLong(i);
4225 if (!key || !value)
4226 goto failed1;
4227 if (PyDict_SetItem(result, key, value) == -1)
4228 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004229 Py_DECREF(key);
4230 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004231 }
4232 return result;
4233 failed1:
4234 Py_XDECREF(key);
4235 Py_XDECREF(value);
4236 Py_DECREF(result);
4237 return NULL;
4238 }
4239
4240 /* Create a three-level trie */
4241 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4242 16*count2 + 128*count3 - 1);
4243 if (!result)
4244 return PyErr_NoMemory();
4245 PyObject_Init(result, &EncodingMapType);
4246 mresult = (struct encoding_map*)result;
4247 mresult->count2 = count2;
4248 mresult->count3 = count3;
4249 mlevel1 = mresult->level1;
4250 mlevel2 = mresult->level23;
4251 mlevel3 = mresult->level23 + 16*count2;
4252 memcpy(mlevel1, level1, 32);
4253 memset(mlevel2, 0xFF, 16*count2);
4254 memset(mlevel3, 0, 128*count3);
4255 count3 = 0;
4256 for (i = 1; i < 256; i++) {
4257 int o1, o2, o3, i2, i3;
4258 if (decode[i] == 0xFFFE)
4259 /* unmapped character */
4260 continue;
4261 o1 = decode[i]>>11;
4262 o2 = (decode[i]>>7) & 0xF;
4263 i2 = 16*mlevel1[o1] + o2;
4264 if (mlevel2[i2] == 0xFF)
4265 mlevel2[i2] = count3++;
4266 o3 = decode[i] & 0x7F;
4267 i3 = 128*mlevel2[i2] + o3;
4268 mlevel3[i3] = i;
4269 }
4270 return result;
4271}
4272
4273static int
4274encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4275{
4276 struct encoding_map *map = (struct encoding_map*)mapping;
4277 int l1 = c>>11;
4278 int l2 = (c>>7) & 0xF;
4279 int l3 = c & 0x7F;
4280 int i;
4281
4282#ifdef Py_UNICODE_WIDE
4283 if (c > 0xFFFF) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004284 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004285 }
4286#endif
4287 if (c == 0)
4288 return 0;
4289 /* level 1*/
4290 i = map->level1[l1];
4291 if (i == 0xFF) {
4292 return -1;
4293 }
4294 /* level 2*/
4295 i = map->level23[16*i+l2];
4296 if (i == 0xFF) {
4297 return -1;
4298 }
4299 /* level 3 */
4300 i = map->level23[16*map->count2 + 128*i + l3];
4301 if (i == 0) {
4302 return -1;
4303 }
4304 return i;
4305}
4306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307/* Lookup the character ch in the mapping. If the character
4308 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004309 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 PyObject *w = PyInt_FromLong((long)c);
4313 PyObject *x;
4314
4315 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004316 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 x = PyObject_GetItem(mapping, w);
4318 Py_DECREF(w);
4319 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004320 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4321 /* No mapping found means: mapping is undefined. */
4322 PyErr_Clear();
4323 x = Py_None;
4324 Py_INCREF(x);
4325 return x;
4326 } else
4327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004329 else if (x == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004330 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004332 long value = PyInt_AS_LONG(x);
4333 if (value < 0 || value > 255) {
4334 PyErr_SetString(PyExc_TypeError,
4335 "character mapping must be in range(256)");
4336 Py_DECREF(x);
4337 return NULL;
4338 }
4339 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004341 else if (PyString_Check(x))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004342 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004344 /* wrong return value */
4345 PyErr_SetString(PyExc_TypeError,
4346 "character mapping must return integer, None or str");
4347 Py_DECREF(x);
4348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
4350}
4351
Martin v. Löwis3f767792006-06-04 19:36:28 +00004352static int
4353charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4354{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004355 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4356 /* exponentially overallocate to minimize reallocations */
4357 if (requiredsize < 2*outsize)
4358 requiredsize = 2*outsize;
4359 if (_PyString_Resize(outobj, requiredsize)) {
4360 return 0;
4361 }
4362 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004363}
4364
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the encoded byte(s) were written */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* a lookup or resize raised an exception */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368/* lookup the character, put the result in the output string and adjust
4369 various state variables. Reallocate the output string if not enough
4370 space is available. Return a new reference to the object that
4371 was put in the output buffer, or Py_None, if the mapping was undefined
4372 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004373 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004375charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004376 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004378 PyObject *rep;
4379 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004380 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381
Christian Heimese93237d2007-12-19 02:37:44 +00004382 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004383 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004384 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004385 if (res == -1)
4386 return enc_FAILED;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004387 if (outsize<requiredsize)
4388 if (!charmapencode_resize(outobj, outpos, requiredsize))
4389 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004390 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004391 outstart[(*outpos)++] = (char)res;
4392 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004393 }
4394
4395 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 if (rep==NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004397 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004398 else if (rep==Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004399 Py_DECREF(rep);
4400 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004401 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004402 if (PyInt_Check(rep)) {
4403 Py_ssize_t requiredsize = *outpos+1;
4404 if (outsize<requiredsize)
4405 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4406 Py_DECREF(rep);
4407 return enc_EXCEPTION;
4408 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004409 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004410 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004411 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004412 else {
4413 const char *repchars = PyString_AS_STRING(rep);
4414 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4415 Py_ssize_t requiredsize = *outpos+repsize;
4416 if (outsize<requiredsize)
4417 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4418 Py_DECREF(rep);
4419 return enc_EXCEPTION;
4420 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004421 outstart = PyString_AS_STRING(*outobj);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004422 memcpy(outstart + *outpos, repchars, repsize);
4423 *outpos += repsize;
4424 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
Georg Brandl9f167602006-06-04 21:46:16 +00004426 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004427 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428}
4429
4430/* handle an error in PyUnicode_EncodeCharmap
4431 Return 0 on success, -1 on error */
4432static
4433int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004436 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438{
4439 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t repsize;
4441 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 Py_UNICODE *uni2;
4443 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t collstartpos = *inpos;
4445 Py_ssize_t collendpos = *inpos+1;
4446 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 char *encoding = "charmap";
4448 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004449 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 /* find all unencodable characters */
4452 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004453 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004454 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004455 int res = encoding_map_lookup(p[collendpos], mapping);
4456 if (res != -1)
4457 break;
4458 ++collendpos;
4459 continue;
4460 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004461
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004462 rep = charmapencode_lookup(p[collendpos], mapping);
4463 if (rep==NULL)
4464 return -1;
4465 else if (rep!=Py_None) {
4466 Py_DECREF(rep);
4467 break;
4468 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004469 Py_DECREF(rep);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004470 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 }
4472 /* cache callback name lookup
4473 * (if not done yet, i.e. it's the first error) */
4474 if (*known_errorHandler==-1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004475 if ((errors==NULL) || (!strcmp(errors, "strict")))
4476 *known_errorHandler = 1;
4477 else if (!strcmp(errors, "replace"))
4478 *known_errorHandler = 2;
4479 else if (!strcmp(errors, "ignore"))
4480 *known_errorHandler = 3;
4481 else if (!strcmp(errors, "xmlcharrefreplace"))
4482 *known_errorHandler = 4;
4483 else
4484 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 }
4486 switch (*known_errorHandler) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004487 case 1: /* strict */
4488 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4489 return -1;
4490 case 2: /* replace */
4491 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004492 x = charmapencode_output('?', mapping, res, respos);
4493 if (x==enc_EXCEPTION) {
4494 return -1;
4495 }
4496 else if (x==enc_FAILED) {
4497 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4498 return -1;
4499 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004500 }
4501 /* fall through */
4502 case 3: /* ignore */
4503 *inpos = collendpos;
4504 break;
4505 case 4: /* xmlcharrefreplace */
4506 /* generate replacement (temporarily (mis)uses p) */
4507 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004508 char buffer[2+29+1+1];
4509 char *cp;
4510 sprintf(buffer, "&#%d;", (int)p[collpos]);
4511 for (cp = buffer; *cp; ++cp) {
4512 x = charmapencode_output(*cp, mapping, res, respos);
4513 if (x==enc_EXCEPTION)
4514 return -1;
4515 else if (x==enc_FAILED) {
4516 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4517 return -1;
4518 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004519 }
4520 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004521 *inpos = collendpos;
4522 break;
4523 default:
4524 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004525 encoding, reason, p, size, exceptionObject,
4526 collstartpos, collendpos, &newpos);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004527 if (repunicode == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004528 return -1;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004529 /* generate replacement */
4530 repsize = PyUnicode_GET_SIZE(repunicode);
4531 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004532 x = charmapencode_output(*uni2, mapping, res, respos);
4533 if (x==enc_EXCEPTION) {
4534 return -1;
4535 }
4536 else if (x==enc_FAILED) {
4537 Py_DECREF(repunicode);
4538 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4539 return -1;
4540 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004541 }
4542 *inpos = newpos;
4543 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 }
4545 return 0;
4546}
4547
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004549 Py_ssize_t size,
4550 PyObject *mapping,
4551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 /* output object */
4554 PyObject *res = NULL;
4555 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004558 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 PyObject *errorHandler = NULL;
4560 PyObject *exc = NULL;
4561 /* the following variable is used for caching string comparisons
4562 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4563 * 3=ignore, 4=xmlcharrefreplace */
4564 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565
4566 /* Default to Latin-1 */
4567 if (mapping == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004568 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 /* allocate enough for a simple encoding without
4571 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004572 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 if (res == NULL)
4574 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004575 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004576 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 while (inpos<size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004579 /* try to encode it */
4580 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4581 if (x==enc_EXCEPTION) /* error */
4582 goto onError;
4583 if (x==enc_FAILED) { /* unencodable character */
4584 if (charmap_encoding_error(p, size, &inpos, mapping,
4585 &exc,
4586 &known_errorHandler, &errorHandler, errors,
4587 &res, &respos)) {
4588 goto onError;
4589 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004590 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004591 else
4592 /* done with this character => adjust input position */
4593 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004597 if (respos<PyString_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004598 if (_PyString_Resize(&res, respos))
4599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 }
4601 Py_XDECREF(exc);
4602 Py_XDECREF(errorHandler);
4603 return res;
4604
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004605 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 Py_XDECREF(res);
4607 Py_XDECREF(exc);
4608 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 return NULL;
4610}
4611
4612PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004613 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614{
4615 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004616 PyErr_BadArgument();
4617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618 }
4619 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004620 PyUnicode_GET_SIZE(unicode),
4621 mapping,
4622 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623}
4624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625/* create or adjust a UnicodeTranslateError */
4626static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004627 const Py_UNICODE *unicode, Py_ssize_t size,
4628 Py_ssize_t startpos, Py_ssize_t endpos,
4629 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 if (*exceptionObject == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004632 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004633 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 }
4635 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004636 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4637 goto onError;
4638 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4639 goto onError;
4640 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4641 goto onError;
4642 return;
4643 onError:
4644 Py_DECREF(*exceptionObject);
4645 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 }
4647}
4648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649/* raises a UnicodeTranslateError */
4650static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004651 const Py_UNICODE *unicode, Py_ssize_t size,
4652 Py_ssize_t startpos, Py_ssize_t endpos,
4653 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654{
4655 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004656 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 if (*exceptionObject != NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004658 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659}
4660
4661/* error handling callback helper:
4662 build arguments, call the callback and check the arguments,
4663 put the result into newpos and return the replacement string, which
4664 has to be freed by the caller */
4665static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004666 PyObject **errorHandler,
4667 const char *reason,
4668 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4669 Py_ssize_t startpos, Py_ssize_t endpos,
4670 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004672 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673
Martin v. Löwis412fb672006-04-13 06:34:32 +00004674 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 PyObject *restuple;
4676 PyObject *resunicode;
4677
4678 if (*errorHandler == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004679 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 if (*errorHandler == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 }
4683
4684 make_translate_exception(exceptionObject,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004685 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 if (*exceptionObject == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688
4689 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004690 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 if (restuple == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 if (!PyTuple_Check(restuple)) {
Georg Brandl40e15ed2009-04-05 21:48:06 +00004694 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004695 Py_DECREF(restuple);
4696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 }
4698 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004699 &resunicode, &i_newpos)) {
4700 Py_DECREF(restuple);
4701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004703 if (i_newpos<0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004704 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004705 else
4706 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004707 if (*newpos<0 || *newpos>size) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4709 Py_DECREF(restuple);
4710 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 Py_INCREF(resunicode);
4713 Py_DECREF(restuple);
4714 return resunicode;
4715}
4716
4717/* Lookup the character ch in the mapping and put the result in result,
4718 which must be decrefed by the caller.
4719 Return 0 on success, -1 on error */
4720static
4721int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4722{
4723 PyObject *w = PyInt_FromLong((long)c);
4724 PyObject *x;
4725
4726 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004727 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 x = PyObject_GetItem(mapping, w);
4729 Py_DECREF(w);
4730 if (x == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004731 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4732 /* No mapping found means: use 1:1 mapping. */
4733 PyErr_Clear();
4734 *result = NULL;
4735 return 0;
4736 } else
4737 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 }
4739 else if (x == Py_None) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004740 *result = x;
4741 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 }
4743 else if (PyInt_Check(x)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004744 long value = PyInt_AS_LONG(x);
4745 long max = PyUnicode_GetMax();
4746 if (value < 0 || value > max) {
4747 PyErr_Format(PyExc_TypeError,
4748 "character mapping must be in range(0x%lx)", max+1);
4749 Py_DECREF(x);
4750 return -1;
4751 }
4752 *result = x;
4753 return 0;
4754 }
4755 else if (PyUnicode_Check(x)) {
4756 *result = x;
4757 return 0;
4758 }
4759 else {
4760 /* wrong return value */
4761 PyErr_SetString(PyExc_TypeError,
4762 "character mapping must return integer, None or unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004763 Py_DECREF(x);
4764 return -1;
4765 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766}
4767/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004768 if not reallocate and adjust various state variables.
4769 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770static
Walter Dörwald4894c302003-10-24 14:25:28 +00004771int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004772 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004774 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004775 if (requiredsize > oldsize) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004776 /* remember old output position */
4777 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4778 /* exponentially overallocate to minimize reallocations */
4779 if (requiredsize < 2 * oldsize)
4780 requiredsize = 2 * oldsize;
4781 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4782 return -1;
4783 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 }
4785 return 0;
4786}
4787/* lookup the character, put the result in the output string and adjust
4788 various state variables. Return a new reference to the object that
4789 was put in the output buffer in *result, or Py_None, if the mapping was
4790 undefined (in which case no character was written).
4791 The called must decref result.
4792 Return 0 on success, -1 on error. */
4793static
Walter Dörwald4894c302003-10-24 14:25:28 +00004794int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004795 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4796 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797{
Walter Dörwald4894c302003-10-24 14:25:28 +00004798 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004799 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 if (*res==NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004801 /* not found => default to 1:1 mapping */
4802 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 }
4804 else if (*res==Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004805 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 else if (PyInt_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004807 /* no overflow check, because we know that the space is enough */
4808 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 }
4810 else if (PyUnicode_Check(*res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004811 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4812 if (repsize==1) {
4813 /* no overflow check, because we know that the space is enough */
4814 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4815 }
4816 else if (repsize!=0) {
4817 /* more than one character */
4818 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4819 (insize - (curinp-startinp)) +
4820 repsize - 1;
4821 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4822 return -1;
4823 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4824 *outp += repsize;
4825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 }
4827 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004828 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 return 0;
4830}
4831
4832PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004833 Py_ssize_t size,
4834 PyObject *mapping,
4835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 /* output object */
4838 PyObject *res = NULL;
4839 /* pointers to the beginning and end+1 of input */
4840 const Py_UNICODE *startp = p;
4841 const Py_UNICODE *endp = p + size;
4842 /* pointer into the output */
4843 Py_UNICODE *str;
4844 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004845 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 char *reason = "character maps to <undefined>";
4847 PyObject *errorHandler = NULL;
4848 PyObject *exc = NULL;
4849 /* the following variable is used for caching string comparisons
4850 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4851 * 3=ignore, 4=xmlcharrefreplace */
4852 int known_errorHandler = -1;
4853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 if (mapping == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004855 PyErr_BadArgument();
4856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858
4859 /* allocate enough for a simple 1:1 translation without
4860 replacements, if we need more, we'll resize */
4861 res = PyUnicode_FromUnicode(NULL, size);
4862 if (res == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 if (size == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004865 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 while (p<endp) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004869 /* try to encode it */
4870 PyObject *x = NULL;
4871 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4872 Py_XDECREF(x);
4873 goto onError;
4874 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004875 Py_XDECREF(x);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004876 if (x!=Py_None) /* it worked => adjust input pointer */
4877 ++p;
4878 else { /* untranslatable character */
4879 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4880 Py_ssize_t repsize;
4881 Py_ssize_t newpos;
4882 Py_UNICODE *uni2;
4883 /* startpos for collecting untranslatable chars */
4884 const Py_UNICODE *collstart = p;
4885 const Py_UNICODE *collend = p+1;
4886 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004888 /* find all untranslatable characters */
4889 while (collend < endp) {
4890 if (charmaptranslate_lookup(*collend, mapping, &x))
4891 goto onError;
4892 Py_XDECREF(x);
4893 if (x!=Py_None)
4894 break;
4895 ++collend;
4896 }
4897 /* cache callback name lookup
4898 * (if not done yet, i.e. it's the first error) */
4899 if (known_errorHandler==-1) {
4900 if ((errors==NULL) || (!strcmp(errors, "strict")))
4901 known_errorHandler = 1;
4902 else if (!strcmp(errors, "replace"))
4903 known_errorHandler = 2;
4904 else if (!strcmp(errors, "ignore"))
4905 known_errorHandler = 3;
4906 else if (!strcmp(errors, "xmlcharrefreplace"))
4907 known_errorHandler = 4;
4908 else
4909 known_errorHandler = 0;
4910 }
4911 switch (known_errorHandler) {
4912 case 1: /* strict */
4913 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004914 goto onError;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004915 case 2: /* replace */
4916 /* No need to check for space, this is a 1:1 replacement */
4917 for (coll = collstart; coll<collend; ++coll)
4918 *str++ = '?';
4919 /* fall through */
4920 case 3: /* ignore */
4921 p = collend;
4922 break;
4923 case 4: /* xmlcharrefreplace */
4924 /* generate replacement (temporarily (mis)uses p) */
4925 for (p = collstart; p < collend; ++p) {
4926 char buffer[2+29+1+1];
4927 char *cp;
4928 sprintf(buffer, "&#%d;", (int)*p);
4929 if (charmaptranslate_makespace(&res, &str,
4930 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4931 goto onError;
4932 for (cp = buffer; *cp; ++cp)
4933 *str++ = *cp;
4934 }
4935 p = collend;
4936 break;
4937 default:
4938 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4939 reason, startp, size, &exc,
4940 collstart-startp, collend-startp, &newpos);
4941 if (repunicode == NULL)
4942 goto onError;
4943 /* generate replacement */
4944 repsize = PyUnicode_GET_SIZE(repunicode);
4945 if (charmaptranslate_makespace(&res, &str,
4946 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4947 Py_DECREF(repunicode);
4948 goto onError;
4949 }
4950 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4951 *str++ = *uni2;
4952 p = startp + newpos;
4953 Py_DECREF(repunicode);
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004954 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00004955 }
4956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 /* Resize if we allocated to much */
4958 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004959 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004960 if (PyUnicode_Resize(&res, respos) < 0)
4961 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962 }
4963 Py_XDECREF(exc);
4964 Py_XDECREF(errorHandler);
4965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004967 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004968 Py_XDECREF(res);
4969 Py_XDECREF(exc);
4970 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 return NULL;
4972}
4973
4974PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004975 PyObject *mapping,
4976 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977{
4978 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004979
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980 str = PyUnicode_FromObject(str);
4981 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004984 PyUnicode_GET_SIZE(str),
4985 mapping,
4986 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 Py_DECREF(str);
4988 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004989
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004990 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 Py_XDECREF(str);
4992 return NULL;
4993}
Tim Petersced69f82003-09-16 20:30:58 +00004994
Guido van Rossum9e896b32000-04-05 20:11:21 +00004995/* --- Decimal Encoder ---------------------------------------------------- */
4996
4997int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00004998 Py_ssize_t length,
4999 char *output,
5000 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005001{
5002 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 PyObject *errorHandler = NULL;
5004 PyObject *exc = NULL;
5005 const char *encoding = "decimal";
5006 const char *reason = "invalid decimal Unicode string";
5007 /* the following variable is used for caching string comparisons
5008 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5009 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005010
5011 if (output == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005012 PyErr_BadArgument();
5013 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005014 }
5015
5016 p = s;
5017 end = s + length;
5018 while (p < end) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005019 register Py_UNICODE ch = *p;
5020 int decimal;
5021 PyObject *repunicode;
5022 Py_ssize_t repsize;
5023 Py_ssize_t newpos;
5024 Py_UNICODE *uni2;
5025 Py_UNICODE *collstart;
5026 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005027
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005028 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005029 *output++ = ' ';
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005030 ++p;
5031 continue;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005032 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005033 decimal = Py_UNICODE_TODECIMAL(ch);
5034 if (decimal >= 0) {
5035 *output++ = '0' + decimal;
5036 ++p;
5037 continue;
5038 }
5039 if (0 < ch && ch < 256) {
5040 *output++ = (char)ch;
5041 ++p;
5042 continue;
5043 }
5044 /* All other characters are considered unencodable */
5045 collstart = p;
5046 collend = p+1;
5047 while (collend < end) {
5048 if ((0 < *collend && *collend < 256) ||
5049 !Py_UNICODE_ISSPACE(*collend) ||
5050 Py_UNICODE_TODECIMAL(*collend))
5051 break;
5052 }
5053 /* cache callback name lookup
5054 * (if not done yet, i.e. it's the first error) */
5055 if (known_errorHandler==-1) {
5056 if ((errors==NULL) || (!strcmp(errors, "strict")))
5057 known_errorHandler = 1;
5058 else if (!strcmp(errors, "replace"))
5059 known_errorHandler = 2;
5060 else if (!strcmp(errors, "ignore"))
5061 known_errorHandler = 3;
5062 else if (!strcmp(errors, "xmlcharrefreplace"))
5063 known_errorHandler = 4;
5064 else
5065 known_errorHandler = 0;
5066 }
5067 switch (known_errorHandler) {
5068 case 1: /* strict */
5069 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5070 goto onError;
5071 case 2: /* replace */
5072 for (p = collstart; p < collend; ++p)
5073 *output++ = '?';
5074 /* fall through */
5075 case 3: /* ignore */
5076 p = collend;
5077 break;
5078 case 4: /* xmlcharrefreplace */
5079 /* generate replacement (temporarily (mis)uses p) */
5080 for (p = collstart; p < collend; ++p)
5081 output += sprintf(output, "&#%d;", (int)*p);
5082 p = collend;
5083 break;
5084 default:
5085 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5086 encoding, reason, s, length, &exc,
5087 collstart-s, collend-s, &newpos);
5088 if (repunicode == NULL)
5089 goto onError;
5090 /* generate replacement */
5091 repsize = PyUnicode_GET_SIZE(repunicode);
5092 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5093 Py_UNICODE ch = *uni2;
5094 if (Py_UNICODE_ISSPACE(ch))
5095 *output++ = ' ';
5096 else {
5097 decimal = Py_UNICODE_TODECIMAL(ch);
5098 if (decimal >= 0)
5099 *output++ = '0' + decimal;
5100 else if (0 < ch && ch < 256)
5101 *output++ = (char)ch;
5102 else {
5103 Py_DECREF(repunicode);
5104 raise_encode_exception(&exc, encoding,
5105 s, length, collstart-s, collend-s, reason);
5106 goto onError;
5107 }
5108 }
5109 }
5110 p = s + newpos;
5111 Py_DECREF(repunicode);
5112 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005113 }
5114 /* 0-terminate the output string */
5115 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 Py_XDECREF(exc);
5117 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005118 return 0;
5119
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005120 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005121 Py_XDECREF(exc);
5122 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005123 return -1;
5124}
5125
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126/* --- Helpers ------------------------------------------------------------ */
5127
Eric Smitha9f7d622008-02-17 19:46:49 +00005128#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005129
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005130#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005131
Fredrik Lundha50d2012006-05-26 17:04:58 +00005132#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005133
5134#include "stringlib/count.h"
5135#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005136#include "stringlib/partition.h"
5137
/* Helper macro to fix up start/end slice values.
   Operates on the variables named "start" and "end" in the calling
   scope: negative values are taken relative to (obj)->length and then
   clamped to 0; an "end" beyond the length is clamped to the length.
   Expands to plain statements, so use it as a statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5150
Martin v. Löwis18e16552006-02-15 17:27:45 +00005151Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005152 PyObject *substr,
5153 Py_ssize_t start,
5154 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005156 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005157 PyUnicodeObject* str_obj;
5158 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005159
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005160 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5161 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005162 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005163 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5164 if (!sub_obj) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005165 Py_DECREF(str_obj);
5166 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 }
Tim Petersced69f82003-09-16 20:30:58 +00005168
Fredrik Lundhc8162812006-05-26 19:33:03 +00005169 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005170
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005171 result = stringlib_count(
5172 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5173 );
5174
5175 Py_DECREF(sub_obj);
5176 Py_DECREF(str_obj);
5177
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 return result;
5179}
5180
Martin v. Löwis18e16552006-02-15 17:27:45 +00005181Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005182 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005183 Py_ssize_t start,
5184 Py_ssize_t end,
5185 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005187 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005188
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005189 str = PyUnicode_FromObject(str);
5190 if (!str)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005191 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005192 sub = PyUnicode_FromObject(sub);
5193 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005194 Py_DECREF(str);
5195 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 }
Tim Petersced69f82003-09-16 20:30:58 +00005197
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005198 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005199 result = stringlib_find_slice(
5200 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5201 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5202 start, end
5203 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005204 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005205 result = stringlib_rfind_slice(
5206 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5207 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5208 start, end
5209 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005210
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005211 Py_DECREF(str);
5212 Py_DECREF(sub);
5213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 return result;
5215}
5216
Tim Petersced69f82003-09-16 20:30:58 +00005217static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218int tailmatch(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005219 PyUnicodeObject *substring,
5220 Py_ssize_t start,
5221 Py_ssize_t end,
5222 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 if (substring->length == 0)
5225 return 1;
5226
Fredrik Lundhc8162812006-05-26 19:33:03 +00005227 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
5229 end -= substring->length;
5230 if (end < start)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005231 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
5233 if (direction > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005234 if (Py_UNICODE_MATCH(self, end, substring))
5235 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 } else {
5237 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005238 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 }
5240
5241 return 0;
5242}
5243
Martin v. Löwis18e16552006-02-15 17:27:45 +00005244Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005245 PyObject *substr,
5246 Py_ssize_t start,
5247 Py_ssize_t end,
5248 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005251
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 str = PyUnicode_FromObject(str);
5253 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005254 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 substr = PyUnicode_FromObject(substr);
5256 if (substr == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005257 Py_DECREF(str);
5258 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 }
Tim Petersced69f82003-09-16 20:30:58 +00005260
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005262 (PyUnicodeObject *)substr,
5263 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 Py_DECREF(str);
5265 Py_DECREF(substr);
5266 return result;
5267}
5268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269/* Apply fixfct filter to the Unicode object self and return a
5270 reference to the modified object */
5271
Tim Petersced69f82003-09-16 20:30:58 +00005272static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005274 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
5276
5277 PyUnicodeObject *u;
5278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005279 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005281 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005282
5283 Py_UNICODE_COPY(u->str, self->str, self->length);
5284
Tim Peters7a29bd52001-09-12 03:03:31 +00005285 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005286 /* fixfct should return TRUE if it modified the buffer. If
5287 FALSE, return a reference to the original buffer instead
5288 (to save space, not time) */
5289 Py_INCREF(self);
5290 Py_DECREF(u);
5291 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 }
5293 return (PyObject*) u;
5294}
5295
Tim Petersced69f82003-09-16 20:30:58 +00005296static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297int fixupper(PyUnicodeObject *self)
5298{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 Py_UNICODE *s = self->str;
5301 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005302
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005304 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005305
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005306 ch = Py_UNICODE_TOUPPER(*s);
5307 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005309 *s = ch;
5310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 s++;
5312 }
5313
5314 return status;
5315}
5316
Tim Petersced69f82003-09-16 20:30:58 +00005317static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318int fixlower(PyUnicodeObject *self)
5319{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005320 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 Py_UNICODE *s = self->str;
5322 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 while (len-- > 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005325 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005326
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005327 ch = Py_UNICODE_TOLOWER(*s);
5328 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 status = 1;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005330 *s = ch;
5331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 s++;
5333 }
5334
5335 return status;
5336}
5337
Tim Petersced69f82003-09-16 20:30:58 +00005338static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339int fixswapcase(PyUnicodeObject *self)
5340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 Py_UNICODE *s = self->str;
5343 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 while (len-- > 0) {
5346 if (Py_UNICODE_ISUPPER(*s)) {
5347 *s = Py_UNICODE_TOLOWER(*s);
5348 status = 1;
5349 } else if (Py_UNICODE_ISLOWER(*s)) {
5350 *s = Py_UNICODE_TOUPPER(*s);
5351 status = 1;
5352 }
5353 s++;
5354 }
5355
5356 return status;
5357}
5358
Tim Petersced69f82003-09-16 20:30:58 +00005359static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360int fixcapitalize(PyUnicodeObject *self)
5361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005362 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005363 Py_UNICODE *s = self->str;
5364 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005365
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005366 if (len == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005367 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005368 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005369 *s = Py_UNICODE_TOUPPER(*s);
5370 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005372 s++;
5373 while (--len > 0) {
5374 if (Py_UNICODE_ISUPPER(*s)) {
5375 *s = Py_UNICODE_TOLOWER(*s);
5376 status = 1;
5377 }
5378 s++;
5379 }
5380 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381}
5382
5383static
5384int fixtitle(PyUnicodeObject *self)
5385{
5386 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5387 register Py_UNICODE *e;
5388 int previous_is_cased;
5389
5390 /* Shortcut for single character strings */
5391 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005392 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5393 if (*p != ch) {
5394 *p = ch;
5395 return 1;
5396 }
5397 else
5398 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 }
Tim Petersced69f82003-09-16 20:30:58 +00005400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 e = p + PyUnicode_GET_SIZE(self);
5402 previous_is_cased = 0;
5403 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005404 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005405
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005406 if (previous_is_cased)
5407 *p = Py_UNICODE_TOLOWER(ch);
5408 else
5409 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005410
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005411 if (Py_UNICODE_ISLOWER(ch) ||
5412 Py_UNICODE_ISUPPER(ch) ||
5413 Py_UNICODE_ISTITLE(ch))
5414 previous_is_cased = 1;
5415 else
5416 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 }
5418 return 1;
5419}
5420
Tim Peters8ce9f162004-08-27 01:49:32 +00005421PyObject *
5422PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
Tim Peters8ce9f162004-08-27 01:49:32 +00005424 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005425 const Py_UNICODE blank = ' ';
5426 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005427 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005428 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005429 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5430 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005431 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5432 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005433 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005434 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005435 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436
Tim Peters05eba1f2004-08-27 21:32:02 +00005437 fseq = PySequence_Fast(seq, "");
5438 if (fseq == NULL) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005439 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005440 }
5441
Tim Peters91879ab2004-08-27 22:35:44 +00005442 /* Grrrr. A codec may be invoked to convert str objects to
5443 * Unicode, and so it's possible to call back into Python code
5444 * during PyUnicode_FromObject(), and so it's possible for a sick
5445 * codec to change the size of fseq (if seq is a list). Therefore
5446 * we have to keep refetching the size -- can't assume seqlen
5447 * is invariant.
5448 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 seqlen = PySequence_Fast_GET_SIZE(fseq);
5450 /* If empty sequence, return u"". */
5451 if (seqlen == 0) {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005452 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5453 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005454 }
5455 /* If singleton sequence with an exact Unicode, return that. */
5456 if (seqlen == 1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005457 item = PySequence_Fast_GET_ITEM(fseq, 0);
5458 if (PyUnicode_CheckExact(item)) {
5459 Py_INCREF(item);
5460 res = (PyUnicodeObject *)item;
5461 goto Done;
5462 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005463 }
5464
Tim Peters05eba1f2004-08-27 21:32:02 +00005465 /* At least two items to join, or one that isn't exact Unicode. */
5466 if (seqlen > 1) {
5467 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005468 if (separator == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005469 sep = &blank;
5470 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005471 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005472 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005473 internal_separator = PyUnicode_FromObject(separator);
5474 if (internal_separator == NULL)
5475 goto onError;
5476 sep = PyUnicode_AS_UNICODE(internal_separator);
5477 seplen = PyUnicode_GET_SIZE(internal_separator);
5478 /* In case PyUnicode_FromObject() mutated seq. */
5479 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005480 }
5481 }
5482
5483 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005484 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005485 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005486 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 res_p = PyUnicode_AS_UNICODE(res);
5488 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005489
Tim Peters05eba1f2004-08-27 21:32:02 +00005490 for (i = 0; i < seqlen; ++i) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005491 Py_ssize_t itemlen;
5492 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005493
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005494 item = PySequence_Fast_GET_ITEM(fseq, i);
5495 /* Convert item to Unicode. */
5496 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5497 PyErr_Format(PyExc_TypeError,
5498 "sequence item %zd: expected string or Unicode,"
5499 " %.80s found",
5500 i, Py_TYPE(item)->tp_name);
5501 goto onError;
5502 }
5503 item = PyUnicode_FromObject(item);
5504 if (item == NULL)
5505 goto onError;
5506 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005507
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005508 /* In case PyUnicode_FromObject() mutated seq. */
5509 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005510
Tim Peters8ce9f162004-08-27 01:49:32 +00005511 /* Make sure we have enough space for the separator and the item. */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005512 itemlen = PyUnicode_GET_SIZE(item);
5513 new_res_used = res_used + itemlen;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005514 if (new_res_used < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005515 goto Overflow;
5516 if (i < seqlen - 1) {
5517 new_res_used += seplen;
5518 if (new_res_used < 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005519 goto Overflow;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005520 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005521 if (new_res_used > res_alloc) {
5522 /* double allocated size until it's big enough */
5523 do {
5524 res_alloc += res_alloc;
5525 if (res_alloc <= 0)
5526 goto Overflow;
5527 } while (new_res_used > res_alloc);
5528 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5529 Py_DECREF(item);
5530 goto onError;
5531 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005533 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005534
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005535 /* Copy item, and maybe the separator. */
5536 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5537 res_p += itemlen;
5538 if (i < seqlen - 1) {
5539 Py_UNICODE_COPY(res_p, sep, seplen);
5540 res_p += seplen;
5541 }
5542 Py_DECREF(item);
5543 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005544 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005545
Tim Peters05eba1f2004-08-27 21:32:02 +00005546 /* Shrink res to match the used area; this probably can't fail,
5547 * but it's cheap to check.
5548 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005549 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005550 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005551
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005552 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005554 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 return (PyObject *)res;
5556
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005557 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005558 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005559 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005560 Py_DECREF(item);
5561 /* fall through */
5562
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005563 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005564 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005565 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005566 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 return NULL;
5568}
5569
Tim Petersced69f82003-09-16 20:30:58 +00005570static
5571PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005572 Py_ssize_t left,
5573 Py_ssize_t right,
5574 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575{
5576 PyUnicodeObject *u;
5577
5578 if (left < 0)
5579 left = 0;
5580 if (right < 0)
5581 right = 0;
5582
Tim Peters7a29bd52001-09-12 03:03:31 +00005583 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 Py_INCREF(self);
5585 return self;
5586 }
5587
Neal Norwitze7d8be82008-07-31 17:17:14 +00005588 if (left > PY_SSIZE_T_MAX - self->length ||
5589 right > PY_SSIZE_T_MAX - (left + self->length)) {
5590 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5591 return NULL;
5592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 u = _PyUnicode_New(left + self->length + right);
5594 if (u) {
5595 if (left)
5596 Py_UNICODE_FILL(u->str, fill, left);
5597 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5598 if (right)
5599 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5600 }
5601
5602 return u;
5603}
5604
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names from the enclosing function: a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when either the slice cannot be created or the append
   fails.  The temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so the expansion is a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615
5616static
5617PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005618 PyObject *list,
5619 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005621 register Py_ssize_t i;
5622 register Py_ssize_t j;
5623 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005625 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
5627 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005628 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005629 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005630 i++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005631 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005632 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5633 i++;
5634 if (j < i) {
5635 if (maxcount-- <= 0)
5636 break;
5637 SPLIT_APPEND(buf, j, i);
5638 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5639 i++;
5640 j = i;
5641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 }
5643 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005644 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
5646 return list;
5647
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 Py_DECREF(list);
5650 return NULL;
5651}
5652
5653PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005654 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005656 register Py_ssize_t i;
5657 register Py_ssize_t j;
5658 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 PyObject *list;
5660 PyObject *str;
5661 Py_UNICODE *data;
5662
5663 string = PyUnicode_FromObject(string);
5664 if (string == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 data = PyUnicode_AS_UNICODE(string);
5667 len = PyUnicode_GET_SIZE(string);
5668
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 list = PyList_New(0);
5670 if (!list)
5671 goto onError;
5672
5673 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005674 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005675
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005676 /* Find a line and append it */
5677 while (i < len && !BLOOM_LINEBREAK(data[i]))
5678 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005680 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005681 eol = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005682 if (i < len) {
5683 if (data[i] == '\r' && i + 1 < len &&
5684 data[i+1] == '\n')
5685 i += 2;
5686 else
5687 i++;
5688 if (keepends)
5689 eol = i;
5690 }
5691 SPLIT_APPEND(data, j, eol);
5692 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 }
5694 if (j < len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005695 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 }
5697
5698 Py_DECREF(string);
5699 return list;
5700
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005701 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005702 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 Py_DECREF(string);
5704 return NULL;
5705}
5706
Tim Petersced69f82003-09-16 20:30:58 +00005707static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005709 PyObject *list,
5710 Py_UNICODE ch,
5711 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 register Py_ssize_t i;
5714 register Py_ssize_t j;
5715 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005717 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
5719 for (i = j = 0; i < len; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005720 if (buf[i] == ch) {
5721 if (maxcount-- <= 0)
5722 break;
5723 SPLIT_APPEND(buf, j, i);
5724 i = j = i + 1;
5725 } else
5726 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 }
5728 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005729 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 }
5731 return list;
5732
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005733 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 Py_DECREF(list);
5735 return NULL;
5736}
5737
Tim Petersced69f82003-09-16 20:30:58 +00005738static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005740 PyObject *list,
5741 PyUnicodeObject *substring,
5742 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005744 register Py_ssize_t i;
5745 register Py_ssize_t j;
5746 Py_ssize_t len = self->length;
5747 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 PyObject *str;
5749
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005750 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005751 if (Py_UNICODE_MATCH(self, i, substring)) {
5752 if (maxcount-- <= 0)
5753 break;
5754 SPLIT_APPEND(self->str, j, i);
5755 i = j = i + sublen;
5756 } else
5757 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 }
5759 if (j <= len) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005760 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
5762 return list;
5763
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005764 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 Py_DECREF(list);
5766 return NULL;
5767}
5768
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005769static
5770PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005771 PyObject *list,
5772 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005774 register Py_ssize_t i;
5775 register Py_ssize_t j;
5776 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005778 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779
5780 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005781 /* find a token */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005782 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005783 i--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005784 j = i;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005785 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5786 i--;
5787 if (j > i) {
5788 if (maxcount-- <= 0)
5789 break;
5790 SPLIT_APPEND(buf, i + 1, j + 1);
5791 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5792 i--;
5793 j = i;
5794 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005795 }
5796 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005797 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005799 if (PyList_Reverse(list) < 0)
5800 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801 return list;
5802
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005803 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 Py_DECREF(list);
5805 return NULL;
5806}
5807
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005808static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005809PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005810 PyObject *list,
5811 Py_UNICODE ch,
5812 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 register Py_ssize_t i;
5815 register Py_ssize_t j;
5816 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005818 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005819
5820 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005821 if (buf[i] == ch) {
5822 if (maxcount-- <= 0)
5823 break;
5824 SPLIT_APPEND(buf, i + 1, j + 1);
5825 j = i = i - 1;
5826 } else
5827 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005828 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005829 if (j >= -1) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005830 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005831 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005832 if (PyList_Reverse(list) < 0)
5833 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834 return list;
5835
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005836 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005837 Py_DECREF(list);
5838 return NULL;
5839}
5840
Benjamin Peterson186d9b32009-01-31 16:34:44 +00005841static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005842PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005843 PyObject *list,
5844 PyUnicodeObject *substring,
5845 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005846{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005847 register Py_ssize_t i;
5848 register Py_ssize_t j;
5849 Py_ssize_t len = self->length;
5850 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005851 PyObject *str;
5852
5853 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005854 if (Py_UNICODE_MATCH(self, i, substring)) {
5855 if (maxcount-- <= 0)
5856 break;
5857 SPLIT_APPEND(self->str, i + sublen, j);
5858 j = i;
5859 i -= sublen;
5860 } else
5861 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005862 }
5863 if (j >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005864 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005866 if (PyList_Reverse(list) < 0)
5867 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868 return list;
5869
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005870 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005871 Py_DECREF(list);
5872 return NULL;
5873}
5874
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875#undef SPLIT_APPEND
5876
5877static
5878PyObject *split(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005879 PyUnicodeObject *substring,
5880 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
5882 PyObject *list;
5883
5884 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005885 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
5887 list = PyList_New(0);
5888 if (!list)
5889 return NULL;
5890
5891 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005892 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
5894 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005895 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
5897 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005898 Py_DECREF(list);
5899 PyErr_SetString(PyExc_ValueError, "empty separator");
5900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 }
5902 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005903 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904}
5905
Tim Petersced69f82003-09-16 20:30:58 +00005906static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005907PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005908 PyUnicodeObject *substring,
5909 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005910{
5911 PyObject *list;
5912
5913 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005914 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005915
5916 list = PyList_New(0);
5917 if (!list)
5918 return NULL;
5919
5920 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005921 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005922
5923 else if (substring->length == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005924 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005925
5926 else if (substring->length == 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005927 Py_DECREF(list);
5928 PyErr_SetString(PyExc_ValueError, "empty separator");
5929 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005930 }
5931 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005932 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005933}
5934
5935static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005937 PyUnicodeObject *str1,
5938 PyUnicodeObject *str2,
5939 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
5941 PyUnicodeObject *u;
5942
5943 if (maxcount < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00005944 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
Fredrik Lundh347ee272006-05-24 16:35:18 +00005946 if (str1->length == str2->length) {
5947 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005948 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005949 if (str1->length == 1) {
5950 /* replace characters */
5951 Py_UNICODE u1, u2;
5952 if (!findchar(self->str, self->length, str1->str[0]))
5953 goto nothing;
5954 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5955 if (!u)
5956 return NULL;
5957 Py_UNICODE_COPY(u->str, self->str, self->length);
5958 u1 = str1->str[0];
5959 u2 = str2->str[0];
5960 for (i = 0; i < u->length; i++)
5961 if (u->str[i] == u1) {
5962 if (--maxcount < 0)
5963 break;
5964 u->str[i] = u2;
5965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005967 i = fastsearch(
5968 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005970 if (i < 0)
5971 goto nothing;
5972 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5973 if (!u)
5974 return NULL;
5975 Py_UNICODE_COPY(u->str, self->str, self->length);
5976 while (i <= self->length - str1->length)
5977 if (Py_UNICODE_MATCH(self, i, str1)) {
5978 if (--maxcount < 0)
5979 break;
5980 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5981 i += str1->length;
5982 } else
5983 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005986
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005987 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005988 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 Py_UNICODE *p;
5990
5991 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005992 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 if (n > maxcount)
5994 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005995 if (n == 0)
5996 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005997 /* new_size = self->length + n * (str2->length - str1->length)); */
5998 delta = (str2->length - str1->length);
5999 if (delta == 0) {
6000 new_size = self->length;
6001 } else {
6002 product = n * (str2->length - str1->length);
6003 if ((product / (str2->length - str1->length)) != n) {
6004 PyErr_SetString(PyExc_OverflowError,
6005 "replace string is too long");
6006 return NULL;
6007 }
6008 new_size = self->length + product;
6009 if (new_size < 0) {
6010 PyErr_SetString(PyExc_OverflowError,
6011 "replace string is too long");
6012 return NULL;
6013 }
6014 }
6015 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006016 if (!u)
6017 return NULL;
6018 i = 0;
6019 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006020 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006021 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006022 while (n-- > 0) {
6023 /* look for next match */
6024 j = i;
6025 while (j <= e) {
6026 if (Py_UNICODE_MATCH(self, j, str1))
6027 break;
6028 j++;
6029 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006030 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006031 if (j > e)
6032 break;
6033 /* copy unchanged part [i:j] */
6034 Py_UNICODE_COPY(p, self->str+i, j-i);
6035 p += j - i;
6036 }
6037 /* copy substitution string */
6038 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006039 Py_UNICODE_COPY(p, str2->str, str2->length);
6040 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006041 }
6042 i = j + str1->length;
6043 }
6044 if (i < self->length)
6045 /* copy tail [i:] */
6046 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006047 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006048 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006049 while (n > 0) {
6050 Py_UNICODE_COPY(p, str2->str, str2->length);
6051 p += str2->length;
6052 if (--n <= 0)
6053 break;
6054 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006056 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 }
6058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006060
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006061 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006062 /* nothing to replace; return original string (when possible) */
6063 if (PyUnicode_CheckExact(self)) {
6064 Py_INCREF(self);
6065 return (PyObject *) self;
6066 }
6067 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068}
6069
6070/* --- Unicode Object Methods --------------------------------------------- */
6071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072PyDoc_STRVAR(title__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006073 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074\n\
6075Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006076characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077
6078static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006079unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 return fixup(self, fixtitle);
6082}
6083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006084PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006085 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086\n\
6087Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006088have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
6090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006091unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 return fixup(self, fixcapitalize);
6094}
6095
#if 0
/* Disabled code: kept for reference only, never compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self via split(self, NULL, -1), run fixup(..., fixcapitalize) on
   every piece, and join the pieces again with PyUnicode_Join(NULL, ...).
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *old = PyList_GET_ITEM(words, i);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, i, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    /* reached on success and on failure; result is NULL on failure */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6133
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006134/* Argument converter. Coerces to a single unicode character */
6135
6136static int
6137convert_uc(PyObject *obj, void *addr)
6138{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006139 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6140 PyObject *uniobj;
6141 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006142
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006143 uniobj = PyUnicode_FromObject(obj);
6144 if (uniobj == NULL) {
6145 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006146 "The fill character cannot be converted to Unicode");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006147 return 0;
6148 }
6149 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6150 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006151 "The fill character must be exactly one character long");
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006152 Py_DECREF(uniobj);
6153 return 0;
6154 }
6155 unistr = PyUnicode_AS_UNICODE(uniobj);
6156 *fillcharloc = unistr[0];
6157 Py_DECREF(uniobj);
6158 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006159}
6160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006161PyDoc_STRVAR(center__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006162 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006164Return S centered in a Unicode string of length width. Padding is\n\
6165done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
6167static PyObject *
6168unicode_center(PyUnicodeObject *self, PyObject *args)
6169{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006170 Py_ssize_t marg, left;
6171 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006172 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
Thomas Woutersde017742006-02-16 19:34:37 +00006174 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 return NULL;
6176
Tim Peters7a29bd52001-09-12 03:03:31 +00006177 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 Py_INCREF(self);
6179 return (PyObject*) self;
6180 }
6181
6182 marg = width - self->length;
6183 left = marg / 2 + (marg & width & 1);
6184
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006185 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186}
6187
Marc-André Lemburge5034372000-08-08 08:04:29 +00006188#if 0
6189
6190/* This code should go into some future Unicode collation support
6191 module. The basic comparison should compare ordinals on a naive
Georg Brandla3c242c2009-10-27 14:19:50 +00006192 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006193
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006194/* speedy UTF-16 code point order comparison */
6195/* gleaned from: */
6196/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6197
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006198static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006199{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006200 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006201 0, 0, 0, 0, 0, 0, 0, 0,
6202 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006203 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006204};
6205
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206static int
6207unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006209 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006210
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 Py_UNICODE *s1 = str1->str;
6212 Py_UNICODE *s2 = str2->str;
6213
6214 len1 = str1->length;
6215 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006216
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006218 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006219
6220 c1 = *s1++;
6221 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006222
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006223 if (c1 > (1<<11) * 26)
6224 c1 += utf16Fixup[c1>>11];
6225 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006226 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006227 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006228
6229 if (c1 != c2)
6230 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006231
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006232 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 }
6234
6235 return (len1 < len2) ? -1 : (len1 != len2);
6236}
6237
Marc-André Lemburge5034372000-08-08 08:04:29 +00006238#else
6239
6240static int
6241unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6242{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006243 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006244
6245 Py_UNICODE *s1 = str1->str;
6246 Py_UNICODE *s2 = str2->str;
6247
6248 len1 = str1->length;
6249 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006250
Marc-André Lemburge5034372000-08-08 08:04:29 +00006251 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006252 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006253
Fredrik Lundh45714e92001-06-26 16:39:36 +00006254 c1 = *s1++;
6255 c2 = *s2++;
6256
6257 if (c1 != c2)
6258 return (c1 < c2) ? -1 : 1;
6259
Marc-André Lemburge5034372000-08-08 08:04:29 +00006260 len1--; len2--;
6261 }
6262
6263 return (len1 < len2) ? -1 : (len1 != len2);
6264}
6265
6266#endif
6267
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268int PyUnicode_Compare(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006269 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270{
6271 PyUnicodeObject *u = NULL, *v = NULL;
6272 int result;
6273
6274 /* Coerce the two arguments */
6275 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6276 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6279 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006280 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Thomas Wouters7e474022000-07-16 12:04:32 +00006282 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 if (v == u) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006284 Py_DECREF(u);
6285 Py_DECREF(v);
6286 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
6288
6289 result = unicode_compare(u, v);
6290
6291 Py_DECREF(u);
6292 Py_DECREF(v);
6293 return result;
6294
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006295 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 Py_XDECREF(u);
6297 Py_XDECREF(v);
6298 return -1;
6299}
6300
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006301PyObject *PyUnicode_RichCompare(PyObject *left,
6302 PyObject *right,
6303 int op)
6304{
6305 int result;
6306
6307 result = PyUnicode_Compare(left, right);
6308 if (result == -1 && PyErr_Occurred())
6309 goto onError;
6310
6311 /* Convert the return value to a Boolean */
6312 switch (op) {
6313 case Py_EQ:
6314 result = (result == 0);
6315 break;
6316 case Py_NE:
6317 result = (result != 0);
6318 break;
6319 case Py_LE:
6320 result = (result <= 0);
6321 break;
6322 case Py_GE:
6323 result = (result >= 0);
6324 break;
6325 case Py_LT:
6326 result = (result == -1);
6327 break;
6328 case Py_GT:
6329 result = (result == 1);
6330 break;
6331 }
6332 return PyBool_FromLong(result);
6333
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006334 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006335
6336 /* Standard case
6337
6338 Type errors mean that PyUnicode_FromObject() could not convert
6339 one of the arguments (usually the right hand side) to Unicode,
6340 ie. we can't handle the comparison request. However, it is
6341 possible that the other object knows a comparison method, which
6342 is why we return Py_NotImplemented to give the other object a
6343 chance.
6344
6345 */
6346 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6347 PyErr_Clear();
6348 Py_INCREF(Py_NotImplemented);
6349 return Py_NotImplemented;
6350 }
6351 if (op != Py_EQ && op != Py_NE)
6352 return NULL;
6353
6354 /* Equality comparison.
6355
6356 This is a special case: we silence any PyExc_UnicodeDecodeError
6357 and instead turn it into a PyErr_UnicodeWarning.
6358
6359 */
6360 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6361 return NULL;
6362 PyErr_Clear();
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006363 if (PyErr_Warn(PyExc_UnicodeWarning,
6364 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006365 "Unicode equal comparison "
6366 "failed to convert both arguments to Unicode - "
6367 "interpreting them as being unequal" :
6368 "Unicode unequal comparison "
6369 "failed to convert both arguments to Unicode - "
6370 "interpreting them as being unequal"
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006371 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006372 return NULL;
6373 result = (op == Py_NE);
6374 return PyBool_FromLong(result);
6375}
6376
Guido van Rossum403d68b2000-03-13 15:55:09 +00006377int PyUnicode_Contains(PyObject *container,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006378 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006379{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006380 PyObject *str, *sub;
6381 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006382
6383 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006384 sub = PyUnicode_FromObject(element);
6385 if (!sub) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006386 PyErr_SetString(PyExc_TypeError,
6387 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006388 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006389 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006390
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006391 str = PyUnicode_FromObject(container);
6392 if (!str) {
6393 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006394 return -1;
6395 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006396
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006397 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006398
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006399 Py_DECREF(str);
6400 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006401
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006402 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006403}
6404
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405/* Concat to string or Unicode object giving a new Unicode object. */
6406
6407PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006408 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409{
6410 PyUnicodeObject *u = NULL, *v = NULL, *w;
6411
6412 /* Coerce the two arguments */
6413 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6414 if (u == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006415 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6417 if (v == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
6420 /* Shortcuts */
6421 if (v == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006422 Py_DECREF(v);
6423 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 }
6425 if (u == unicode_empty) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006426 Py_DECREF(u);
6427 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 }
6429
6430 /* Concat the two Unicode strings */
6431 w = _PyUnicode_New(u->length + v->length);
6432 if (w == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 Py_UNICODE_COPY(w->str, u->str, u->length);
6435 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6436
6437 Py_DECREF(u);
6438 Py_DECREF(v);
6439 return (PyObject *)w;
6440
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006441 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 Py_XDECREF(u);
6443 Py_XDECREF(v);
6444 return NULL;
6445}
6446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006447PyDoc_STRVAR(count__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006448 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006450Return the number of non-overlapping occurrences of substring sub in\n\
6451Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006452interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453
6454static PyObject *
6455unicode_count(PyUnicodeObject *self, PyObject *args)
6456{
6457 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006458 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006459 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 PyObject *result;
6461
Guido van Rossumb8872e62000-05-09 14:14:27 +00006462 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006463 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 return NULL;
6465
6466 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006467 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006469 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006470
Fredrik Lundhc8162812006-05-26 19:33:03 +00006471 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006473 result = PyInt_FromSsize_t(
6474 stringlib_count(self->str + start, end - start,
6475 substring->str, substring->length)
6476 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477
6478 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006479
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 return result;
6481}
6482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006483PyDoc_STRVAR(encode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006484 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006486Encodes S using the codec registered for encoding. encoding defaults\n\
6487to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006488handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006489a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6490'xmlcharrefreplace' as well as any other name registered with\n\
6491codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492
6493static PyObject *
6494unicode_encode(PyUnicodeObject *self, PyObject *args)
6495{
6496 char *encoding = NULL;
6497 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006498 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006499
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6501 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006502 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006503 if (v == NULL)
6504 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006505 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006506 PyErr_Format(PyExc_TypeError,
6507 "encoder did not return a string/unicode object "
6508 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006509 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006510 Py_DECREF(v);
6511 return NULL;
6512 }
6513 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006514
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006515 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006516 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006517}
6518
6519PyDoc_STRVAR(decode__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006520 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006521\n\
6522Decodes S using the codec registered for encoding. encoding defaults\n\
6523to the default encoding. errors may be given to set a different error\n\
6524handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6525a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6526as well as any other name registerd with codecs.register_error that is\n\
6527able to handle UnicodeDecodeErrors.");
6528
6529static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006530unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006531{
6532 char *encoding = NULL;
6533 char *errors = NULL;
6534 PyObject *v;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006535
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006536 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6537 return NULL;
6538 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006539 if (v == NULL)
6540 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006541 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006542 PyErr_Format(PyExc_TypeError,
6543 "decoder did not return a string/unicode object "
6544 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006545 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006546 Py_DECREF(v);
6547 return NULL;
6548 }
6549 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006550
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006551 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553}
6554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006555PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006556 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557\n\
6558Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560
6561static PyObject*
6562unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6563{
6564 Py_UNICODE *e;
6565 Py_UNICODE *p;
6566 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006567 Py_UNICODE *qe;
6568 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 PyUnicodeObject *u;
6570 int tabsize = 8;
6571
6572 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
Thomas Wouters7e474022000-07-16 12:04:32 +00006575 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006576 i = 0; /* chars up to and including most recent \n or \r */
6577 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6578 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 for (p = self->str; p < e; p++)
6580 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006581 if (tabsize > 0) {
6582 incr = tabsize - (j % tabsize); /* cannot overflow */
6583 if (j > PY_SSIZE_T_MAX - incr)
6584 goto overflow1;
6585 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006586 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006589 if (j > PY_SSIZE_T_MAX - 1)
6590 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 j++;
6592 if (*p == '\n' || *p == '\r') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006593 if (i > PY_SSIZE_T_MAX - j)
6594 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006596 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 }
6598 }
6599
Guido van Rossum5bdff602008-03-11 21:18:06 +00006600 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006601 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 /* Second pass: create output string and fill it */
6604 u = _PyUnicode_New(i + j);
6605 if (!u)
6606 return NULL;
6607
Guido van Rossum5bdff602008-03-11 21:18:06 +00006608 j = 0; /* same as in first pass */
6609 q = u->str; /* next output char */
6610 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
6612 for (p = self->str; p < e; p++)
6613 if (*p == '\t') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006614 if (tabsize > 0) {
6615 i = tabsize - (j % tabsize);
6616 j += i;
6617 while (i--) {
6618 if (q >= qe)
6619 goto overflow2;
6620 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006621 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006622 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00006623 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006624 else {
6625 if (q >= qe)
6626 goto overflow2;
6627 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006628 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 if (*p == '\n' || *p == '\r')
6630 j = 0;
6631 }
6632
6633 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006634
6635 overflow2:
6636 Py_DECREF(u);
6637 overflow1:
6638 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006642PyDoc_STRVAR(find__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006643 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644\n\
6645Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006646such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647arguments start and end are interpreted as in slice notation.\n\
6648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650
6651static PyObject *
6652unicode_find(PyUnicodeObject *self, PyObject *args)
6653{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006654 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006655 Py_ssize_t start;
6656 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006657 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
Facundo Batista57d56692007-11-16 18:04:14 +00006659 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006662 result = stringlib_find_slice(
6663 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6664 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6665 start, end
6666 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
6668 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006669
6670 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
6673static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675{
6676 if (index < 0 || index >= self->length) {
6677 PyErr_SetString(PyExc_IndexError, "string index out of range");
6678 return NULL;
6679 }
6680
6681 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6682}
6683
6684static long
6685unicode_hash(PyUnicodeObject *self)
6686{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006687 /* Since Unicode objects compare equal to their ASCII string
6688 counterparts, they should use the individual character values
6689 as basis for their hash value. This is needed to assure that
6690 strings and Unicode objects behave in the same way as
6691 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
Martin v. Löwis18e16552006-02-15 17:27:45 +00006693 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006694 register Py_UNICODE *p;
6695 register long x;
6696
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 if (self->hash != -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006698 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006699 len = PyUnicode_GET_SIZE(self);
6700 p = PyUnicode_AS_UNICODE(self);
6701 x = *p << 7;
6702 while (--len >= 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006703 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006704 x ^= PyUnicode_GET_SIZE(self);
6705 if (x == -1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006706 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006707 self->hash = x;
6708 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709}
6710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006711PyDoc_STRVAR(index__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006712 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006714Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715
6716static PyObject *
6717unicode_index(PyUnicodeObject *self, PyObject *args)
6718{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006719 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006720 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006721 Py_ssize_t start;
6722 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
Facundo Batista57d56692007-11-16 18:04:14 +00006724 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006727 result = stringlib_find_slice(
6728 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6729 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6730 start, end
6731 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 if (result < 0) {
6736 PyErr_SetString(PyExc_ValueError, "substring not found");
6737 return NULL;
6738 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006739
Martin v. Löwis18e16552006-02-15 17:27:45 +00006740 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006743PyDoc_STRVAR(islower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006744 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006747at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
6749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006750unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751{
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6754 int cased;
6755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 /* Shortcut for single character strings */
6757 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006758 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006761 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006762 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 e = p + PyUnicode_GET_SIZE(self);
6765 cased = 0;
6766 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006767 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006768
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006769 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6770 return PyBool_FromLong(0);
6771 else if (!cased && Py_UNICODE_ISLOWER(ch))
6772 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006778 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006780Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782
6783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006784unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785{
6786 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6787 register const Py_UNICODE *e;
6788 int cased;
6789
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 /* Shortcut for single character strings */
6791 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006792 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006794 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006795 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006797
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 e = p + PyUnicode_GET_SIZE(self);
6799 cased = 0;
6800 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006801 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006802
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006803 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6804 return PyBool_FromLong(0);
6805 else if (!cased && Py_UNICODE_ISUPPER(ch))
6806 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809}
6810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006811PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006812 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006814Return True if S is a titlecased string and there is at least one\n\
6815character in S, i.e. upper- and titlecase characters may only\n\
6816follow uncased characters and lowercase characters only cased ones.\n\
6817Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
6819static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006820unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
6822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6823 register const Py_UNICODE *e;
6824 int cased, previous_is_cased;
6825
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 /* Shortcut for single character strings */
6827 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006828 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6829 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006831 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006832 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006833 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006834
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 e = p + PyUnicode_GET_SIZE(self);
6836 cased = 0;
6837 previous_is_cased = 0;
6838 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006839 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006840
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006841 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6842 if (previous_is_cased)
6843 return PyBool_FromLong(0);
6844 previous_is_cased = 1;
6845 cased = 1;
6846 }
6847 else if (Py_UNICODE_ISLOWER(ch)) {
6848 if (!previous_is_cased)
6849 return PyBool_FromLong(0);
6850 previous_is_cased = 1;
6851 cased = 1;
6852 }
6853 else
6854 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857}
6858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006860 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006862Return True if all characters in S are whitespace\n\
6863and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
6865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006866unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
6868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869 register const Py_UNICODE *e;
6870
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 /* Shortcut for single character strings */
6872 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006873 Py_UNICODE_ISSPACE(*p))
6874 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006876 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006877 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006878 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 e = p + PyUnicode_GET_SIZE(self);
6881 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006882 if (!Py_UNICODE_ISSPACE(*p))
6883 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886}
6887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006888PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006889 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006890\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006891Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006892and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006893
6894static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006895unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006896{
6897 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6898 register const Py_UNICODE *e;
6899
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006900 /* Shortcut for single character strings */
6901 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006902 Py_UNICODE_ISALPHA(*p))
6903 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006904
6905 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006906 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908
6909 e = p + PyUnicode_GET_SIZE(self);
6910 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006911 if (!Py_UNICODE_ISALPHA(*p))
6912 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915}
6916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006917PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006918 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006919\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006920Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006922
6923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006924unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006925{
6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927 register const Py_UNICODE *e;
6928
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006929 /* Shortcut for single character strings */
6930 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006931 Py_UNICODE_ISALNUM(*p))
6932 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006933
6934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006935 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006936 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006937
6938 e = p + PyUnicode_GET_SIZE(self);
6939 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006940 if (!Py_UNICODE_ISALNUM(*p))
6941 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006944}
6945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006946PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006947 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006950False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
6952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006953unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954{
6955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6956 register const Py_UNICODE *e;
6957
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 /* Shortcut for single character strings */
6959 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006960 Py_UNICODE_ISDECIMAL(*p))
6961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006963 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006964 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006965 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006966
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 e = p + PyUnicode_GET_SIZE(self);
6968 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006969 if (!Py_UNICODE_ISDECIMAL(*p))
6970 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973}
6974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006975PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006976 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006978Return True if all characters in S are digits\n\
6979and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980
6981static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006982unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983{
6984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6985 register const Py_UNICODE *e;
6986
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 /* Shortcut for single character strings */
6988 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006989 Py_UNICODE_ISDIGIT(*p))
6990 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006992 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006993 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006994 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 e = p + PyUnicode_GET_SIZE(self);
6997 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00006998 if (!Py_UNICODE_ISDIGIT(*p))
6999 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007001 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002}
7003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007004PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007005 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
7010static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007011unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012{
7013 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7014 register const Py_UNICODE *e;
7015
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 /* Shortcut for single character strings */
7017 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007018 Py_UNICODE_ISNUMERIC(*p))
7019 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007021 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007022 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007023 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007024
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 e = p + PyUnicode_GET_SIZE(self);
7026 for (; p < e; p++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007027 if (!Py_UNICODE_ISNUMERIC(*p))
7028 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007030 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031}
7032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007033PyDoc_STRVAR(join__doc__,
Georg Brandl5d2eb342009-10-27 15:08:27 +00007034 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035\n\
7036Return a string which is the concatenation of the strings in the\n\
Georg Brandl5d2eb342009-10-27 15:08:27 +00007037iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038
7039static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043}
7044
Martin v. Löwis18e16552006-02-15 17:27:45 +00007045static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046unicode_length(PyUnicodeObject *self)
7047{
7048 return self->length;
7049}
7050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007051PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007052 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007054Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007055done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056
7057static PyObject *
7058unicode_ljust(PyUnicodeObject *self, PyObject *args)
7059{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007060 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007061 Py_UNICODE fillchar = ' ';
7062
Martin v. Löwis412fb672006-04-13 06:34:32 +00007063 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return NULL;
7065
Tim Peters7a29bd52001-09-12 03:03:31 +00007066 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 Py_INCREF(self);
7068 return (PyObject*) self;
7069 }
7070
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007071 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072}
7073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007074PyDoc_STRVAR(lower__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007075 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007077Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
7079static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007080unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 return fixup(self, fixlower);
7083}
7084
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7093
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094/* externally visible for str.strip(unicode) */
7095PyObject *
7096_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7097{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007098 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7099 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7100 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7101 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7102 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007104 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007105
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007106 i = 0;
7107 if (striptype != RIGHTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007108 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7109 i++;
7110 }
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007111 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007112
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007113 j = len;
7114 if (striptype != LEFTSTRIP) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007115 do {
7116 j--;
7117 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7118 j++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007119 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007121 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007122 Py_INCREF(self);
7123 return (PyObject*)self;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007124 }
7125 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007126 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007127}
7128
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007131do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007133 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7134 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007135
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007136 i = 0;
7137 if (striptype != RIGHTSTRIP) {
7138 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7139 i++;
7140 }
7141 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007142
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007143 j = len;
7144 if (striptype != LEFTSTRIP) {
7145 do {
7146 j--;
7147 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7148 j++;
7149 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007151 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7152 Py_INCREF(self);
7153 return (PyObject*)self;
7154 }
7155 else
7156 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157}
7158
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007159
7160static PyObject *
7161do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7162{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007163 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007164
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007165 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7166 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007168 if (sep != NULL && sep != Py_None) {
7169 if (PyUnicode_Check(sep))
7170 return _PyUnicode_XStrip(self, striptype, sep);
7171 else if (PyString_Check(sep)) {
7172 PyObject *res;
7173 sep = PyUnicode_FromObject(sep);
7174 if (sep==NULL)
7175 return NULL;
7176 res = _PyUnicode_XStrip(self, striptype, sep);
7177 Py_DECREF(sep);
7178 return res;
7179 }
7180 else {
7181 PyErr_Format(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007182 "%s arg must be None, unicode or str",
7183 STRIPNAME(striptype));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007184 return NULL;
7185 }
7186 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007188 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189}
7190
7191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007192PyDoc_STRVAR(strip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007193 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007194\n\
7195Return a copy of the string S with leading and trailing\n\
7196whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007197If chars is given and not None, remove characters in chars instead.\n\
7198If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199
7200static PyObject *
7201unicode_strip(PyUnicodeObject *self, PyObject *args)
7202{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007203 if (PyTuple_GET_SIZE(args) == 0)
7204 return do_strip(self, BOTHSTRIP); /* Common case */
7205 else
7206 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007207}
7208
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007211 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007212\n\
7213Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007214If chars is given and not None, remove characters in chars instead.\n\
7215If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007216
7217static PyObject *
7218unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7219{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007220 if (PyTuple_GET_SIZE(args) == 0)
7221 return do_strip(self, LEFTSTRIP); /* Common case */
7222 else
7223 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007224}
7225
7226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007227PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007228 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007229\n\
7230Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007231If chars is given and not None, remove characters in chars instead.\n\
7232If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007233
7234static PyObject *
7235unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7236{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007237 if (PyTuple_GET_SIZE(args) == 0)
7238 return do_strip(self, RIGHTSTRIP); /* Common case */
7239 else
7240 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007241}
7242
7243
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246{
7247 PyUnicodeObject *u;
7248 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007250 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
7252 if (len < 0)
7253 len = 0;
7254
Tim Peters7a29bd52001-09-12 03:03:31 +00007255 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* no repeat, return original string */
7257 Py_INCREF(str);
7258 return (PyObject*) str;
7259 }
Tim Peters8f422462000-09-09 06:13:41 +00007260
7261 /* ensure # of chars needed doesn't overflow int and # of bytes
7262 * needed doesn't overflow size_t
7263 */
7264 nchars = len * str->length;
7265 if (len && nchars / len != str->length) {
7266 PyErr_SetString(PyExc_OverflowError,
7267 "repeated string is too long");
7268 return NULL;
7269 }
7270 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7271 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7272 PyErr_SetString(PyExc_OverflowError,
7273 "repeated string is too long");
7274 return NULL;
7275 }
7276 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 if (!u)
7278 return NULL;
7279
7280 p = u->str;
7281
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007282 if (str->length == 1 && len > 0) {
7283 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007284 } else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007285 Py_ssize_t done = 0; /* number of characters copied this far */
7286 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007287 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007288 done = str->length;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007289 }
7290 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007291 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007292 Py_UNICODE_COPY(p+done, p, n);
7293 done += n;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007294 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
7297 return (PyObject*) u;
7298}
7299
7300PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007301 PyObject *subobj,
7302 PyObject *replobj,
7303 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304{
7305 PyObject *self;
7306 PyObject *str1;
7307 PyObject *str2;
7308 PyObject *result;
7309
7310 self = PyUnicode_FromObject(obj);
7311 if (self == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 str1 = PyUnicode_FromObject(subobj);
7314 if (str1 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007315 Py_DECREF(self);
7316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 }
7318 str2 = PyUnicode_FromObject(replobj);
7319 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007320 Py_DECREF(self);
7321 Py_DECREF(str1);
7322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 }
Tim Petersced69f82003-09-16 20:30:58 +00007324 result = replace((PyUnicodeObject *)self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007325 (PyUnicodeObject *)str1,
7326 (PyUnicodeObject *)str2,
7327 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 Py_DECREF(self);
7329 Py_DECREF(str1);
7330 Py_DECREF(str2);
7331 return result;
7332}
7333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007334PyDoc_STRVAR(replace__doc__,
Ezio Melotti6327bf12010-06-26 18:47:01 +00007335 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336\n\
7337Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007338old replaced by new. If the optional argument count is\n\
7339given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341static PyObject*
7342unicode_replace(PyUnicodeObject *self, PyObject *args)
7343{
7344 PyUnicodeObject *str1;
7345 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007346 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 PyObject *result;
7348
Martin v. Löwis18e16552006-02-15 17:27:45 +00007349 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 return NULL;
7351 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7352 if (str1 == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007355 if (str2 == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007356 Py_DECREF(str1);
7357 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
7360 result = replace(self, str1, str2, maxcount);
7361
7362 Py_DECREF(str1);
7363 Py_DECREF(str2);
7364 return result;
7365}
7366
7367static
7368PyObject *unicode_repr(PyObject *unicode)
7369{
7370 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007371 PyUnicode_GET_SIZE(unicode),
7372 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373}
7374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007375PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007376 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377\n\
7378Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007379such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380arguments start and end are interpreted as in slice notation.\n\
7381\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007382Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383
7384static PyObject *
7385unicode_rfind(PyUnicodeObject *self, PyObject *args)
7386{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007387 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007388 Py_ssize_t start;
7389 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007390 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391
Facundo Batista57d56692007-11-16 18:04:14 +00007392 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007395 result = stringlib_rfind_slice(
7396 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7397 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7398 start, end
7399 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007402
7403 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404}
7405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007406PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007407 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007409Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
7411static PyObject *
7412unicode_rindex(PyUnicodeObject *self, PyObject *args)
7413{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007414 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007415 Py_ssize_t start;
7416 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007417 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
Facundo Batista57d56692007-11-16 18:04:14 +00007419 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007422 result = stringlib_rfind_slice(
7423 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7424 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7425 start, end
7426 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
7428 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007429
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 if (result < 0) {
7431 PyErr_SetString(PyExc_ValueError, "substring not found");
7432 return NULL;
7433 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007434 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435}
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007438 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007440Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007441done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
7443static PyObject *
7444unicode_rjust(PyUnicodeObject *self, PyObject *args)
7445{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007446 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007447 Py_UNICODE fillchar = ' ';
7448
Martin v. Löwis412fb672006-04-13 06:34:32 +00007449 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 return NULL;
7451
Tim Peters7a29bd52001-09-12 03:03:31 +00007452 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 Py_INCREF(self);
7454 return (PyObject*) self;
7455 }
7456
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007457 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458}
7459
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007461unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462{
7463 /* standard clamping */
7464 if (start < 0)
7465 start = 0;
7466 if (end < 0)
7467 end = 0;
7468 if (end > self->length)
7469 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007470 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 /* full slice, return original string */
7472 Py_INCREF(self);
7473 return (PyObject*) self;
7474 }
7475 if (start > end)
7476 start = end;
7477 /* copy slice */
7478 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007479 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480}
7481
7482PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007483 PyObject *sep,
7484 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485{
7486 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 s = PyUnicode_FromObject(s);
7489 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007490 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007491 if (sep != NULL) {
7492 sep = PyUnicode_FromObject(sep);
7493 if (sep == NULL) {
7494 Py_DECREF(s);
7495 return NULL;
7496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 }
7498
7499 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7500
7501 Py_DECREF(s);
7502 Py_XDECREF(sep);
7503 return result;
7504}
7505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506PyDoc_STRVAR(split__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007507 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508\n\
7509Return a list of the words in S, using sep as the\n\
7510delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007511splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007512whitespace string is a separator and empty strings are\n\
7513removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
7515static PyObject*
7516unicode_split(PyUnicodeObject *self, PyObject *args)
7517{
7518 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007519 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520
Martin v. Löwis18e16552006-02-15 17:27:45 +00007521 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 return NULL;
7523
7524 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007525 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007527 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007529 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530}
7531
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007532PyObject *
7533PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7534{
7535 PyObject* str_obj;
7536 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007537 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007538
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007539 str_obj = PyUnicode_FromObject(str_in);
7540 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007541 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007542 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007543 if (!sep_obj) {
7544 Py_DECREF(str_obj);
7545 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007546 }
7547
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007548 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007549 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7550 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7551 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007552
Fredrik Lundhb9479482006-05-26 17:22:38 +00007553 Py_DECREF(sep_obj);
7554 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007555
7556 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007557}
7558
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007559
7560PyObject *
7561PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7562{
7563 PyObject* str_obj;
7564 PyObject* sep_obj;
7565 PyObject* out;
7566
7567 str_obj = PyUnicode_FromObject(str_in);
7568 if (!str_obj)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007569 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007570 sep_obj = PyUnicode_FromObject(sep_in);
7571 if (!sep_obj) {
7572 Py_DECREF(str_obj);
7573 return NULL;
7574 }
7575
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007576 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007577 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7578 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7579 );
7580
7581 Py_DECREF(sep_obj);
7582 Py_DECREF(str_obj);
7583
7584 return out;
7585}
7586
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007587PyDoc_STRVAR(partition__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007588 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007589\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007590Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007591the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007592found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007593
7594static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007595unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007596{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007597 return PyUnicode_Partition((PyObject *)self, separator);
7598}
7599
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007600PyDoc_STRVAR(rpartition__doc__,
Ezio Melottidabb5f72010-01-25 11:46:11 +00007601 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007602\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007603Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007604the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonbe2c0a92008-10-04 21:33:08 +00007605separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007606
7607static PyObject*
7608unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7609{
7610 return PyUnicode_RPartition((PyObject *)self, separator);
7611}
7612
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007613PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007614 PyObject *sep,
7615 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007616{
7617 PyObject *result;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007618
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007619 s = PyUnicode_FromObject(s);
7620 if (s == NULL)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007621 return NULL;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007622 if (sep != NULL) {
7623 sep = PyUnicode_FromObject(sep);
7624 if (sep == NULL) {
7625 Py_DECREF(s);
7626 return NULL;
7627 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007628 }
7629
7630 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7631
7632 Py_DECREF(s);
7633 Py_XDECREF(sep);
7634 return result;
7635}
7636
7637PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007638 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007639\n\
7640Return a list of the words in S, using sep as the\n\
7641delimiter string, starting at the end of the string and\n\
7642working to the front. If maxsplit is given, at most maxsplit\n\
7643splits are done. If sep is not specified, any whitespace string\n\
7644is a separator.");
7645
7646static PyObject*
7647unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7648{
7649 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007650 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007651
Martin v. Löwis18e16552006-02-15 17:27:45 +00007652 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007653 return NULL;
7654
7655 if (substring == Py_None)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007656 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007657 else if (PyUnicode_Check(substring))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007658 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007659 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007660 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007661}
7662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007663PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007664 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665\n\
7666Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007667Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007668is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670static PyObject*
7671unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7672{
Guido van Rossum86662912000-04-11 15:38:46 +00007673 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674
Guido van Rossum86662912000-04-11 15:38:46 +00007675 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 return NULL;
7677
Guido van Rossum86662912000-04-11 15:38:46 +00007678 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679}
7680
7681static
7682PyObject *unicode_str(PyUnicodeObject *self)
7683{
Fred Drakee4315f52000-05-09 19:53:39 +00007684 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685}
7686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007687PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007688 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689\n\
7690Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007691and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692
7693static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007694unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 return fixup(self, fixswapcase);
7697}
7698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007699PyDoc_STRVAR(translate__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007700 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701\n\
7702Return a copy of the string S, where all characters have been mapped\n\
7703through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007704Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7705Unmapped characters are left untouched. Characters mapped to None\n\
7706are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707
7708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007709unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710{
Tim Petersced69f82003-09-16 20:30:58 +00007711 return PyUnicode_TranslateCharmap(self->str,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007712 self->length,
7713 table,
7714 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715}
7716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717PyDoc_STRVAR(upper__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007718 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007720Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
7722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007723unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 return fixup(self, fixupper);
7726}
7727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007728PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007729 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730\n\
Georg Brandl98064072008-09-09 19:26:00 +00007731Pad a numeric string S with zeros on the left, to fill a field\n\
7732of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733
7734static PyObject *
7735unicode_zfill(PyUnicodeObject *self, PyObject *args)
7736{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 PyUnicodeObject *u;
7739
Martin v. Löwis18e16552006-02-15 17:27:45 +00007740 Py_ssize_t width;
7741 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 return NULL;
7743
7744 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007745 if (PyUnicode_CheckExact(self)) {
7746 Py_INCREF(self);
7747 return (PyObject*) self;
7748 }
7749 else
7750 return PyUnicode_FromUnicode(
7751 PyUnicode_AS_UNICODE(self),
7752 PyUnicode_GET_SIZE(self)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007753 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 }
7755
7756 fill = width - self->length;
7757
7758 u = pad(self, fill, 0, '0');
7759
Walter Dörwald068325e2002-04-15 13:36:47 +00007760 if (u == NULL)
7761 return NULL;
7762
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 if (u->str[fill] == '+' || u->str[fill] == '-') {
7764 /* move sign to beginning of string */
7765 u->str[0] = u->str[fill];
7766 u->str[fill] = '0';
7767 }
7768
7769 return (PyObject*) u;
7770}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
#if 0
/* Compiled out: returns the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007781 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007783Return True if S starts with the specified prefix, False otherwise.\n\
7784With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007785With optional end, stop comparing S at that position.\n\
7786prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
7788static PyObject *
7789unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007790 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791{
Georg Brandl24250812006-06-09 18:45:48 +00007792 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007794 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007795 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007796 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
Georg Brandl24250812006-06-09 18:45:48 +00007798 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007799 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7800 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007801 if (PyTuple_Check(subobj)) {
7802 Py_ssize_t i;
7803 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7804 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007805 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007806 if (substring == NULL)
7807 return NULL;
7808 result = tailmatch(self, substring, start, end, -1);
7809 Py_DECREF(substring);
7810 if (result) {
7811 Py_RETURN_TRUE;
7812 }
7813 }
7814 /* nothing matched */
7815 Py_RETURN_FALSE;
7816 }
7817 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007819 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007820 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007822 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823}
7824
7825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007826PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007827 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007829Return True if S ends with the specified suffix, False otherwise.\n\
7830With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007831With optional end, stop comparing S at that position.\n\
7832suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834static PyObject *
7835unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007836 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837{
Georg Brandl24250812006-06-09 18:45:48 +00007838 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007840 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007841 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007842 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843
Georg Brandl24250812006-06-09 18:45:48 +00007844 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007845 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7846 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007847 if (PyTuple_Check(subobj)) {
7848 Py_ssize_t i;
7849 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7850 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007851 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007852 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007853 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007854 result = tailmatch(self, substring, start, end, +1);
7855 Py_DECREF(substring);
7856 if (result) {
7857 Py_RETURN_TRUE;
7858 }
7859 }
7860 Py_RETURN_FALSE;
7861 }
7862 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863 if (substring == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865
Georg Brandl24250812006-06-09 18:45:48 +00007866 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007868 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869}
7870
7871
Eric Smitha9f7d622008-02-17 19:46:49 +00007872/* Implements do_string_format, which is unicode because of stringlib */
7873#include "stringlib/string_format.h"
7874
7875PyDoc_STRVAR(format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007876 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007877\n\
7878");
7879
Eric Smithdc13b792008-05-30 18:10:04 +00007880static PyObject *
7881unicode__format__(PyObject *self, PyObject *args)
7882{
7883 PyObject *format_spec;
7884 PyObject *result = NULL;
7885 PyObject *tmp = NULL;
7886
7887 /* If 2.x, convert format_spec to the same type as value */
7888 /* This is to allow things like u''.format('') */
7889 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7890 goto done;
7891 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7892 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007893 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007894 goto done;
7895 }
7896 tmp = PyObject_Unicode(format_spec);
7897 if (tmp == NULL)
7898 goto done;
7899 format_spec = tmp;
7900
7901 result = _PyUnicode_FormatAdvanced(self,
7902 PyUnicode_AS_UNICODE(format_spec),
7903 PyUnicode_GET_SIZE(format_spec));
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007904 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007905 Py_XDECREF(tmp);
7906 return result;
7907}
7908
Eric Smitha9f7d622008-02-17 19:46:49 +00007909PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007910 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007911\n\
7912");
7913
Robert Schuppenies901c9972008-06-10 10:10:31 +00007914static PyObject *
7915unicode__sizeof__(PyUnicodeObject *v)
7916{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007917 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7918 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007919}
7920
7921PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007922 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007923\n\
7924");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007925
7926static PyObject *
7927unicode_getnewargs(PyUnicodeObject *v)
7928{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007929 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007930}
7931
7932
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933static PyMethodDef unicode_methods[] = {
7934
7935 /* Order is according to common usage: often used methods should
7936 appear first, since lookup is done sequentially. */
7937
Georg Brandlecdc0a92006-03-30 12:19:07 +00007938 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007939 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7940 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007941 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007942 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7943 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7944 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7945 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7946 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7947 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7948 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007949 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007950 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7951 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7952 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007953 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007954 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007955/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7956 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7957 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7958 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007959 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007960 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007961 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007962 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007963 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7964 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7965 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7966 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7967 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7968 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7969 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7970 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7971 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7972 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7973 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7974 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7975 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7976 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007977 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007978 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7979 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7980 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7981 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007982 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007983#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985#endif
7986
7987#if 0
7988 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007989 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990#endif
7991
Benjamin Peterson186d9b32009-01-31 16:34:44 +00007992 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 {NULL, NULL}
7994};
7995
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007996static PyObject *
7997unicode_mod(PyObject *v, PyObject *w)
7998{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00007999 if (!PyUnicode_Check(v)) {
8000 Py_INCREF(Py_NotImplemented);
8001 return Py_NotImplemented;
8002 }
8003 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008004}
8005
8006static PyNumberMethods unicode_as_number = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008007 0, /*nb_add*/
8008 0, /*nb_subtract*/
8009 0, /*nb_multiply*/
8010 0, /*nb_divide*/
8011 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008012};
8013
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008015 (lenfunc) unicode_length, /* sq_length */
8016 PyUnicode_Concat, /* sq_concat */
8017 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8018 (ssizeargfunc) unicode_getitem, /* sq_item */
8019 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8020 0, /* sq_ass_item */
8021 0, /* sq_ass_slice */
8022 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023};
8024
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008025static PyObject*
8026unicode_subscript(PyUnicodeObject* self, PyObject* item)
8027{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008028 if (PyIndex_Check(item)) {
8029 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008030 if (i == -1 && PyErr_Occurred())
8031 return NULL;
8032 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008033 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008034 return unicode_getitem(self, i);
8035 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008036 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008037 Py_UNICODE* source_buf;
8038 Py_UNICODE* result_buf;
8039 PyObject* result;
8040
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008041 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008042 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008043 return NULL;
8044 }
8045
8046 if (slicelength <= 0) {
8047 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008048 } else if (start == 0 && step == 1 && slicelength == self->length &&
8049 PyUnicode_CheckExact(self)) {
8050 Py_INCREF(self);
8051 return (PyObject *)self;
8052 } else if (step == 1) {
8053 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008054 } else {
8055 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008056 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8057 sizeof(Py_UNICODE));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008058
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008059 if (result_buf == NULL)
8060 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008061
8062 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8063 result_buf[i] = source_buf[cur];
8064 }
Tim Petersced69f82003-09-16 20:30:58 +00008065
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008066 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008067 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008068 return result;
8069 }
8070 } else {
8071 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8072 return NULL;
8073 }
8074}
8075
8076static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008077 (lenfunc)unicode_length, /* mp_length */
8078 (binaryfunc)unicode_subscript, /* mp_subscript */
8079 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008080};
8081
Martin v. Löwis18e16552006-02-15 17:27:45 +00008082static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008084 Py_ssize_t index,
8085 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
8087 if (index != 0) {
8088 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008089 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 return -1;
8091 }
8092 *ptr = (void *) self->str;
8093 return PyUnicode_GET_DATA_SIZE(self);
8094}
8095
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096static Py_ssize_t
8097unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008098 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099{
8100 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008101 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 return -1;
8103}
8104
8105static int
8106unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008107 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
8109 if (lenp)
8110 *lenp = PyUnicode_GET_DATA_SIZE(self);
8111 return 1;
8112}
8113
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008114static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008116 Py_ssize_t index,
8117 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118{
8119 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008120
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 if (index != 0) {
8122 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008123 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 return -1;
8125 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008126 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 if (str == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008128 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008129 *ptr = (void *) PyString_AS_STRING(str);
8130 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131}
8132
8133/* Helpers for PyUnicode_Format() */
8134
8135static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008136getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008138 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 if (argidx < arglen) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008140 (*p_argidx)++;
8141 if (arglen < 0)
8142 return args;
8143 else
8144 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 }
8146 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008147 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 return NULL;
8149}
8150
/* Bit flags collected while parsing a format specifier; one bit each so
   they can be OR-ed together and tested with '&'. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008158strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 register Py_ssize_t i;
8161 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 for (i = len - 1; i >= 0; i--)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008163 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 return len;
8166}
8167
Neal Norwitzfc76d632006-01-10 06:03:13 +00008168static int
8169doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8170{
Tim Peters15231542006-02-16 01:08:01 +00008171 Py_ssize_t result;
8172
Neal Norwitzfc76d632006-01-10 06:03:13 +00008173 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008174 result = strtounicode(buffer, (char *)buffer);
8175 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008176}
8177
8178static int
8179longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8180{
Tim Peters15231542006-02-16 01:08:01 +00008181 Py_ssize_t result;
8182
Neal Norwitzfc76d632006-01-10 06:03:13 +00008183 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008184 result = strtounicode(buffer, (char *)buffer);
8185 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008186}
8187
Guido van Rossum078151d2002-08-11 04:24:12 +00008188/* XXX To save some code duplication, formatfloat/long/int could have been
8189 shared with stringobject.c, converting from 8-bit to Unicode after the
8190 formatting is done. */
8191
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192static int
8193formatfloat(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008194 size_t buflen,
8195 int flags,
8196 int prec,
8197 int type,
8198 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008200 /* fmt = '%#.' + `prec` + `type`
8201 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202 char fmt[20];
8203 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008204
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 x = PyFloat_AsDouble(v);
8206 if (x == -1.0 && PyErr_Occurred())
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008207 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 if (prec < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008209 prec = 6;
Mark Dickinson75be68b2009-08-28 20:57:42 +00008210#if SIZEOF_INT > 4
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008211 /* make sure that the decimal representation of precision really does
8212 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
Mark Dickinson75be68b2009-08-28 20:57:42 +00008213 if (prec > 0x7fffffff) {
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008214 PyErr_SetString(PyExc_OverflowError,
8215 "outrageously large precision "
8216 "for formatted float");
8217 return -1;
8218 }
Mark Dickinson75be68b2009-08-28 20:57:42 +00008219#endif
Mark Dickinsondb6fa182009-03-29 16:25:46 +00008220
Mark Dickinsona30f3492009-03-29 15:06:29 +00008221 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008222 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008223 /* Worst case length calc to ensure no buffer overrun:
8224
8225 'g' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008226 fmt = %#.<prec>g
8227 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8228 for any double rep.)
8229 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008230
8231 'f' formats:
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008232 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8233 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008234
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008235 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008236 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008237
8238 */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008239 if (((type == 'g' || type == 'G') &&
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008240 buflen <= (size_t)10 + (size_t)prec) ||
8241 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8242 PyErr_SetString(PyExc_OverflowError,
8243 "formatted float is too long (precision too large?)");
8244 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008246 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008247 (flags&F_ALT) ? "#" : "",
8248 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008249 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250}
8251
Tim Peters38fd5b62000-09-21 05:43:11 +00008252static PyObject*
8253formatlong(PyObject *val, int flags, int prec, int type)
8254{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008255 char *buf;
8256 int i, len;
8257 PyObject *str; /* temporary string object. */
8258 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008259
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008260 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8261 if (!str)
8262 return NULL;
8263 result = _PyUnicode_New(len);
8264 if (!result) {
8265 Py_DECREF(str);
8266 return NULL;
8267 }
8268 for (i = 0; i < len; i++)
8269 result->str[i] = buf[i];
8270 result->str[len] = 0;
8271 Py_DECREF(str);
8272 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008273}
8274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275static int
8276formatint(Py_UNICODE *buf,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008277 size_t buflen,
8278 int flags,
8279 int prec,
8280 int type,
8281 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008283 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008284 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8285 * + 1 + 1
8286 * = 24
8287 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008288 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008289 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 long x;
8291
8292 x = PyInt_AsLong(v);
8293 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008294 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008295 if (x < 0 && type == 'u') {
8296 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008297 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008298 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8299 sign = "-";
8300 else
8301 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008303 prec = 1;
8304
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008305 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8306 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008307 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008309 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008310 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008311 return -1;
8312 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313
8314 if ((flags & F_ALT) &&
8315 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008316 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008317 * of issues that cause pain:
8318 * - when 0 is being converted, the C standard leaves off
8319 * the '0x' or '0X', which is inconsistent with other
8320 * %#x/%#X conversions and inconsistent with Python's
8321 * hex() function
8322 * - there are platforms that violate the standard and
8323 * convert 0 with the '0x' or '0X'
8324 * (Metrowerks, Compaq Tru64)
8325 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008326 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008327 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008328 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008329 * We can achieve the desired consistency by inserting our
8330 * own '0x' or '0X' prefix, and substituting %x/%X in place
8331 * of %#x/%#X.
8332 *
8333 * Note that this is the same approach as used in
8334 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008335 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008336 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8337 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008338 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008339 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008340 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8341 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008342 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008343 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008344 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008345 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008346 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008347 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348}
8349
8350static int
8351formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008352 size_t buflen,
8353 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354{
Ezio Melotti85ddea72010-02-25 17:51:33 +00008355 PyObject *unistr;
8356 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008357 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008358 if (PyUnicode_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008359 if (PyUnicode_GET_SIZE(v) != 1)
8360 goto onError;
8361 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008364 else if (PyString_Check(v)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008365 if (PyString_GET_SIZE(v) != 1)
8366 goto onError;
Ezio Melotti85ddea72010-02-25 17:51:33 +00008367 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8368 with a UnicodeDecodeError if 'char' is not decodable with the
8369 default encoding (usually ASCII, but it might be something else) */
8370 str = PyString_AS_STRING(v);
8371 if ((unsigned char)str[0] > 0x7F) {
8372 /* the char is not ASCII; try to decode the string using the
8373 default encoding and return -1 to let the UnicodeDecodeError
8374 be raised if the string can't be decoded */
8375 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8376 if (unistr == NULL)
8377 return -1;
8378 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8379 Py_DECREF(unistr);
8380 }
8381 else
8382 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008383 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384
8385 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008386 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 long x;
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008388 x = PyInt_AsLong(v);
8389 if (x == -1 && PyErr_Occurred())
8390 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008391#ifdef Py_UNICODE_WIDE
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008392 if (x < 0 || x > 0x10ffff) {
8393 PyErr_SetString(PyExc_OverflowError,
8394 "%c arg not in range(0x110000) "
8395 "(wide Python build)");
8396 return -1;
8397 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008398#else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008399 if (x < 0 || x > 0xffff) {
8400 PyErr_SetString(PyExc_OverflowError,
8401 "%c arg not in range(0x10000) "
8402 "(narrow Python build)");
8403 return -1;
8404 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008405#endif
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008406 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 }
8408 buf[1] = '\0';
8409 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008410
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008411 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008412 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008413 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008414 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415}
8416
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind to
   neighbouring operators at the point of use (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8426
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008428 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429{
8430 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 int args_owned = 0;
8433 PyUnicodeObject *result = NULL;
8434 PyObject *dict = NULL;
8435 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008436
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 if (format == NULL || args == NULL) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008438 PyErr_BadInternalCall();
8439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 }
8441 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008442 if (uformat == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008443 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 fmt = PyUnicode_AS_UNICODE(uformat);
8445 fmtcnt = PyUnicode_GET_SIZE(uformat);
8446
8447 reslen = rescnt = fmtcnt + 100;
8448 result = _PyUnicode_New(reslen);
8449 if (result == NULL)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008450 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 res = PyUnicode_AS_UNICODE(result);
8452
8453 if (PyTuple_Check(args)) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008454 arglen = PyTuple_Size(args);
8455 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 }
8457 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008458 arglen = -1;
8459 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 }
Christian Heimese93237d2007-12-19 02:37:44 +00008461 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008462 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008463 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464
8465 while (--fmtcnt >= 0) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008466 if (*fmt != '%') {
8467 if (--rescnt < 0) {
8468 rescnt = fmtcnt + 100;
8469 reslen += rescnt;
8470 if (_PyUnicode_Resize(&result, reslen) < 0)
8471 goto onError;
8472 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8473 --rescnt;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008474 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008475 *res++ = *fmt++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008476 }
8477 else {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008478 /* Got a format specifier */
8479 int flags = 0;
8480 Py_ssize_t width = -1;
8481 int prec = -1;
8482 Py_UNICODE c = '\0';
8483 Py_UNICODE fill;
8484 int isnumok;
8485 PyObject *v = NULL;
8486 PyObject *temp = NULL;
8487 Py_UNICODE *pbuf;
8488 Py_UNICODE sign;
8489 Py_ssize_t len;
8490 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8491
8492 fmt++;
8493 if (*fmt == '(') {
8494 Py_UNICODE *keystart;
8495 Py_ssize_t keylen;
8496 PyObject *key;
8497 int pcount = 1;
8498
8499 if (dict == NULL) {
8500 PyErr_SetString(PyExc_TypeError,
8501 "format requires a mapping");
8502 goto onError;
8503 }
8504 ++fmt;
8505 --fmtcnt;
8506 keystart = fmt;
8507 /* Skip over balanced parentheses */
8508 while (pcount > 0 && --fmtcnt >= 0) {
8509 if (*fmt == ')')
8510 --pcount;
8511 else if (*fmt == '(')
8512 ++pcount;
8513 fmt++;
8514 }
8515 keylen = fmt - keystart - 1;
8516 if (fmtcnt < 0 || pcount > 0) {
8517 PyErr_SetString(PyExc_ValueError,
8518 "incomplete format key");
8519 goto onError;
8520 }
8521#if 0
8522 /* keys are converted to strings using UTF-8 and
8523 then looked up since Python uses strings to hold
8524 variables names etc. in its namespaces and we
8525 wouldn't want to break common idioms. */
8526 key = PyUnicode_EncodeUTF8(keystart,
8527 keylen,
8528 NULL);
8529#else
8530 key = PyUnicode_FromUnicode(keystart, keylen);
8531#endif
8532 if (key == NULL)
8533 goto onError;
8534 if (args_owned) {
8535 Py_DECREF(args);
8536 args_owned = 0;
8537 }
8538 args = PyObject_GetItem(dict, key);
8539 Py_DECREF(key);
8540 if (args == NULL) {
8541 goto onError;
8542 }
8543 args_owned = 1;
8544 arglen = -1;
8545 argidx = -2;
8546 }
8547 while (--fmtcnt >= 0) {
8548 switch (c = *fmt++) {
8549 case '-': flags |= F_LJUST; continue;
8550 case '+': flags |= F_SIGN; continue;
8551 case ' ': flags |= F_BLANK; continue;
8552 case '#': flags |= F_ALT; continue;
8553 case '0': flags |= F_ZERO; continue;
8554 }
8555 break;
8556 }
8557 if (c == '*') {
8558 v = getnextarg(args, arglen, &argidx);
8559 if (v == NULL)
8560 goto onError;
8561 if (!PyInt_Check(v)) {
8562 PyErr_SetString(PyExc_TypeError,
8563 "* wants int");
8564 goto onError;
8565 }
8566 width = PyInt_AsLong(v);
8567 if (width < 0) {
8568 flags |= F_LJUST;
8569 width = -width;
8570 }
8571 if (--fmtcnt >= 0)
8572 c = *fmt++;
8573 }
8574 else if (c >= '0' && c <= '9') {
8575 width = c - '0';
8576 while (--fmtcnt >= 0) {
8577 c = *fmt++;
8578 if (c < '0' || c > '9')
8579 break;
8580 if ((width*10) / 10 != width) {
8581 PyErr_SetString(PyExc_ValueError,
8582 "width too big");
8583 goto onError;
8584 }
8585 width = width*10 + (c - '0');
8586 }
8587 }
8588 if (c == '.') {
8589 prec = 0;
8590 if (--fmtcnt >= 0)
8591 c = *fmt++;
8592 if (c == '*') {
8593 v = getnextarg(args, arglen, &argidx);
8594 if (v == NULL)
8595 goto onError;
8596 if (!PyInt_Check(v)) {
8597 PyErr_SetString(PyExc_TypeError,
8598 "* wants int");
8599 goto onError;
8600 }
8601 prec = PyInt_AsLong(v);
8602 if (prec < 0)
8603 prec = 0;
8604 if (--fmtcnt >= 0)
8605 c = *fmt++;
8606 }
8607 else if (c >= '0' && c <= '9') {
8608 prec = c - '0';
8609 while (--fmtcnt >= 0) {
8610 c = Py_CHARMASK(*fmt++);
8611 if (c < '0' || c > '9')
8612 break;
8613 if ((prec*10) / 10 != prec) {
8614 PyErr_SetString(PyExc_ValueError,
8615 "prec too big");
8616 goto onError;
8617 }
8618 prec = prec*10 + (c - '0');
8619 }
8620 }
8621 } /* prec */
8622 if (fmtcnt >= 0) {
8623 if (c == 'h' || c == 'l' || c == 'L') {
8624 if (--fmtcnt >= 0)
8625 c = *fmt++;
8626 }
8627 }
8628 if (fmtcnt < 0) {
8629 PyErr_SetString(PyExc_ValueError,
8630 "incomplete format");
8631 goto onError;
8632 }
8633 if (c != '%') {
8634 v = getnextarg(args, arglen, &argidx);
8635 if (v == NULL)
8636 goto onError;
8637 }
8638 sign = 0;
8639 fill = ' ';
8640 switch (c) {
8641
8642 case '%':
8643 pbuf = formatbuf;
8644 /* presume that buffer length is at least 1 */
8645 pbuf[0] = '%';
8646 len = 1;
8647 break;
8648
8649 case 's':
8650 case 'r':
Victor Stinner4fd2ff92010-03-22 12:56:39 +00008651 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008652 temp = v;
8653 Py_INCREF(temp);
8654 }
8655 else {
8656 PyObject *unicode;
8657 if (c == 's')
8658 temp = PyObject_Unicode(v);
8659 else
8660 temp = PyObject_Repr(v);
8661 if (temp == NULL)
8662 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008663 if (PyUnicode_Check(temp))
8664 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008665 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008666 /* convert to string to Unicode */
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008667 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8668 PyString_GET_SIZE(temp),
8669 NULL,
8670 "strict");
8671 Py_DECREF(temp);
8672 temp = unicode;
8673 if (temp == NULL)
8674 goto onError;
8675 }
8676 else {
8677 Py_DECREF(temp);
8678 PyErr_SetString(PyExc_TypeError,
8679 "%s argument has non-string str()");
8680 goto onError;
8681 }
8682 }
8683 pbuf = PyUnicode_AS_UNICODE(temp);
8684 len = PyUnicode_GET_SIZE(temp);
8685 if (prec >= 0 && len > prec)
8686 len = prec;
8687 break;
8688
8689 case 'i':
8690 case 'd':
8691 case 'u':
8692 case 'o':
8693 case 'x':
8694 case 'X':
8695 if (c == 'i')
8696 c = 'd';
8697 isnumok = 0;
8698 if (PyNumber_Check(v)) {
8699 PyObject *iobj=NULL;
8700
8701 if (PyInt_Check(v) || (PyLong_Check(v))) {
8702 iobj = v;
8703 Py_INCREF(iobj);
8704 }
8705 else {
8706 iobj = PyNumber_Int(v);
8707 if (iobj==NULL) iobj = PyNumber_Long(v);
8708 }
8709 if (iobj!=NULL) {
8710 if (PyInt_Check(iobj)) {
8711 isnumok = 1;
8712 pbuf = formatbuf;
8713 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8714 flags, prec, c, iobj);
8715 Py_DECREF(iobj);
8716 if (len < 0)
8717 goto onError;
8718 sign = 1;
8719 }
8720 else if (PyLong_Check(iobj)) {
8721 isnumok = 1;
8722 temp = formatlong(iobj, flags, prec, c);
8723 Py_DECREF(iobj);
8724 if (!temp)
8725 goto onError;
8726 pbuf = PyUnicode_AS_UNICODE(temp);
8727 len = PyUnicode_GET_SIZE(temp);
8728 sign = 1;
8729 }
8730 else {
8731 Py_DECREF(iobj);
8732 }
8733 }
8734 }
8735 if (!isnumok) {
8736 PyErr_Format(PyExc_TypeError,
8737 "%%%c format: a number is required, "
8738 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8739 goto onError;
8740 }
8741 if (flags & F_ZERO)
8742 fill = '0';
8743 break;
8744
8745 case 'e':
8746 case 'E':
8747 case 'f':
8748 case 'F':
8749 case 'g':
8750 case 'G':
8751 if (c == 'F')
8752 c = 'f';
8753 pbuf = formatbuf;
8754 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8755 flags, prec, c, v);
8756 if (len < 0)
8757 goto onError;
8758 sign = 1;
8759 if (flags & F_ZERO)
8760 fill = '0';
8761 break;
8762
8763 case 'c':
8764 pbuf = formatbuf;
8765 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8766 if (len < 0)
8767 goto onError;
8768 break;
8769
8770 default:
8771 PyErr_Format(PyExc_ValueError,
8772 "unsupported format character '%c' (0x%x) "
8773 "at index %zd",
8774 (31<=c && c<=126) ? (char)c : '?',
8775 (int)c,
8776 (Py_ssize_t)(fmt - 1 -
8777 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008778 goto onError;
8779 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008780 if (sign) {
8781 if (*pbuf == '-' || *pbuf == '+') {
8782 sign = *pbuf++;
8783 len--;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008784 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008785 else if (flags & F_SIGN)
8786 sign = '+';
8787 else if (flags & F_BLANK)
8788 sign = ' ';
8789 else
8790 sign = 0;
8791 }
8792 if (width < len)
8793 width = len;
8794 if (rescnt - (sign != 0) < width) {
8795 reslen -= rescnt;
8796 rescnt = width + fmtcnt + 100;
8797 reslen += rescnt;
8798 if (reslen < 0) {
8799 Py_XDECREF(temp);
8800 PyErr_NoMemory();
8801 goto onError;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008802 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008803 if (_PyUnicode_Resize(&result, reslen) < 0) {
8804 Py_XDECREF(temp);
8805 goto onError;
8806 }
8807 res = PyUnicode_AS_UNICODE(result)
8808 + reslen - rescnt;
8809 }
8810 if (sign) {
8811 if (fill != ' ')
8812 *res++ = sign;
8813 rescnt--;
8814 if (width > len)
8815 width--;
8816 }
8817 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8818 assert(pbuf[0] == '0');
8819 assert(pbuf[1] == c);
8820 if (fill != ' ') {
8821 *res++ = *pbuf++;
8822 *res++ = *pbuf++;
8823 }
8824 rescnt -= 2;
8825 width -= 2;
8826 if (width < 0)
8827 width = 0;
8828 len -= 2;
8829 }
8830 if (width > len && !(flags & F_LJUST)) {
8831 do {
8832 --rescnt;
8833 *res++ = fill;
8834 } while (--width > len);
8835 }
8836 if (fill == ' ') {
8837 if (sign)
8838 *res++ = sign;
8839 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8840 assert(pbuf[0] == '0');
8841 assert(pbuf[1] == c);
8842 *res++ = *pbuf++;
8843 *res++ = *pbuf++;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008844 }
8845 }
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008846 Py_UNICODE_COPY(res, pbuf, len);
8847 res += len;
8848 rescnt -= len;
8849 while (--width >= len) {
8850 --rescnt;
8851 *res++ = ' ';
8852 }
8853 if (dict && (argidx < arglen) && c != '%') {
8854 PyErr_SetString(PyExc_TypeError,
8855 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008856 Py_XDECREF(temp);
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008857 goto onError;
8858 }
8859 Py_XDECREF(temp);
8860 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 } /* until end */
8862 if (argidx < arglen && !dict) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008863 PyErr_SetString(PyExc_TypeError,
8864 "not all arguments converted during string formatting");
8865 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866 }
8867
Thomas Woutersa96affe2006-03-12 00:29:36 +00008868 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008871 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 }
8873 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 return (PyObject *)result;
8875
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008876 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 Py_XDECREF(result);
8878 Py_DECREF(uformat);
8879 if (args_owned) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008880 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881 }
8882 return NULL;
8883}
8884
8885static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008886 (readbufferproc) unicode_buffer_getreadbuf,
8887 (writebufferproc) unicode_buffer_getwritebuf,
8888 (segcountproc) unicode_buffer_getsegcount,
8889 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890};
8891
Jeremy Hylton938ace62002-07-17 16:30:39 +00008892static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008893unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8894
Tim Peters6d6c1a32001-08-02 04:15:00 +00008895static PyObject *
8896unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8897{
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008898 PyObject *x = NULL;
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008899 static char *kwlist[] = {"string", "encoding", "errors", 0};
8900 char *encoding = NULL;
8901 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008903 if (type != &PyUnicode_Type)
8904 return unicode_subtype_new(type, args, kwds);
8905 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008906 kwlist, &x, &encoding, &errors))
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008907 return NULL;
8908 if (x == NULL)
8909 return (PyObject *)_PyUnicode_New(0);
8910 if (encoding == NULL && errors == NULL)
8911 return PyObject_Unicode(x);
8912 else
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008913 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008914}
8915
Guido van Rossume023fe02001-08-30 03:12:59 +00008916static PyObject *
8917unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8918{
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008919 PyUnicodeObject *tmp, *pnew;
8920 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008921
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008922 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8923 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8924 if (tmp == NULL)
8925 return NULL;
8926 assert(PyUnicode_Check(tmp));
8927 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8928 if (pnew == NULL) {
8929 Py_DECREF(tmp);
8930 return NULL;
8931 }
8932 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8933 if (pnew->str == NULL) {
8934 _Py_ForgetReference((PyObject *)pnew);
8935 PyObject_Del(pnew);
8936 Py_DECREF(tmp);
8937 return PyErr_NoMemory();
8938 }
8939 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8940 pnew->length = n;
8941 pnew->hash = tmp->hash;
8942 Py_DECREF(tmp);
8943 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008944}
8945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008946PyDoc_STRVAR(unicode_doc,
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008947 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008948\n\
8949Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008950encoding defaults to the current default string encoding.\n\
8951errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008952
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008954 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008955 "unicode", /* tp_name */
8956 sizeof(PyUnicodeObject), /* tp_size */
8957 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 /* Slots */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008959 (destructor)unicode_dealloc, /* tp_dealloc */
8960 0, /* tp_print */
8961 0, /* tp_getattr */
8962 0, /* tp_setattr */
8963 0, /* tp_compare */
8964 unicode_repr, /* tp_repr */
8965 &unicode_as_number, /* tp_as_number */
8966 &unicode_as_sequence, /* tp_as_sequence */
8967 &unicode_as_mapping, /* tp_as_mapping */
8968 (hashfunc) unicode_hash, /* tp_hash*/
8969 0, /* tp_call*/
8970 (reprfunc) unicode_str, /* tp_str */
8971 PyObject_GenericGetAttr, /* tp_getattro */
8972 0, /* tp_setattro */
8973 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008974 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Peterson339f8c62009-01-31 22:25:08 +00008975 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson186d9b32009-01-31 16:34:44 +00008976 unicode_doc, /* tp_doc */
8977 0, /* tp_traverse */
8978 0, /* tp_clear */
8979 PyUnicode_RichCompare, /* tp_richcompare */
8980 0, /* tp_weaklistoffset */
8981 0, /* tp_iter */
8982 0, /* tp_iternext */
8983 unicode_methods, /* tp_methods */
8984 0, /* tp_members */
8985 0, /* tp_getset */
8986 &PyBaseString_Type, /* tp_base */
8987 0, /* tp_dict */
8988 0, /* tp_descr_get */
8989 0, /* tp_descr_set */
8990 0, /* tp_dictoffset */
8991 0, /* tp_init */
8992 0, /* tp_alloc */
8993 unicode_new, /* tp_new */
8994 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995};
8996
8997/* Initialize the Unicode implementation */
8998
Thomas Wouters78890102000-07-22 19:25:51 +00008999void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009001 int i;
9002
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009003 /* XXX - move this array to unicodectype.c ? */
9004 Py_UNICODE linebreak[] = {
9005 0x000A, /* LINE FEED */
9006 0x000D, /* CARRIAGE RETURN */
9007 0x001C, /* FILE SEPARATOR */
9008 0x001D, /* GROUP SEPARATOR */
9009 0x001E, /* RECORD SEPARATOR */
9010 0x0085, /* NEXT LINE */
9011 0x2028, /* LINE SEPARATOR */
9012 0x2029, /* PARAGRAPH SEPARATOR */
9013 };
9014
Fred Drakee4315f52000-05-09 19:53:39 +00009015 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009016 free_list = NULL;
9017 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009019 if (!unicode_empty)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009020 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009021
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009022 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009023 for (i = 0; i < 256; i++)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009024 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009025 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009026 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009027
9028 /* initialize the linebreak bloom filter */
9029 bloom_linebreak = make_bloom_mask(
9030 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9031 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009032
9033 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034}
9035
9036/* Finalize the Unicode implementation */
9037
Christian Heimes3b718a72008-02-14 12:47:33 +00009038int
9039PyUnicode_ClearFreeList(void)
9040{
9041 int freelist_size = numfree;
9042 PyUnicodeObject *u;
9043
9044 for (u = free_list; u != NULL;) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009045 PyUnicodeObject *v = u;
9046 u = *(PyUnicodeObject **)u;
9047 if (v->str)
9048 PyObject_DEL(v->str);
9049 Py_XDECREF(v->defenc);
9050 PyObject_Del(v);
9051 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009052 }
9053 free_list = NULL;
9054 assert(numfree == 0);
9055 return freelist_size;
9056}
9057
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058void
Thomas Wouters78890102000-07-22 19:25:51 +00009059_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009061 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009063 Py_XDECREF(unicode_empty);
9064 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009065
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009066 for (i = 0; i < 256; i++) {
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009067 if (unicode_latin1[i]) {
9068 Py_DECREF(unicode_latin1[i]);
9069 unicode_latin1[i] = NULL;
9070 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009071 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009072 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009074
Anthony Baxterac6bd462006-04-13 02:06:09 +00009075#ifdef __cplusplus
9076}
9077#endif
9078
9079
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009080/*
Benjamin Peterson339f8c62009-01-31 22:25:08 +00009081 Local variables:
9082 c-basic-offset: 4
9083 indent-tabs-mode: nil
9084 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009085*/